In [80]:
import nltk
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors

# Load the news dataset; news_summary.csv is expected in the working directory.
data_set = pd.read_csv("news_summary.csv")

# Show the available columns (printed) ...
print(data_set.columns)
# ... and the number of rows (last expression, rendered as the cell output).
# The former bare `data_set` line in the middle of the cell displayed nothing
# and has been removed.
len(data_set)

Index(['headlines', 'text'], dtype='object')


98401

In [14]:
# Dividing Dataset into X (every column except 'headlines') and
# Y (every column except 'text'), keeping only the first N_SAMPLES rows of each.
N_SAMPLES = 5000  # single place to change the subset size used below

# `columns=` already selects the column axis, so no extra `axis` argument is needed.
text_review = data_set.drop(columns='headlines').iloc[:N_SAMPLES, :]
head_lines = data_set.drop(columns='text').iloc[:N_SAMPLES, :]

Unnamed: 0,headlines
0,upGrad learner switches to career in ML & Al w...
1,Delhi techie wins free food from Swiggy for on...
2,New Zealand end Rohit Sharma-led India's 12-ma...
3,Aegon life iTerm insurance plan helps customer...
4,"Have known Hirani for yrs, what if MeToo claim..."
...,...
4995,25 fake call centres busted since July in Noid...
4996,Govt to meet fiscal deficit target despite GST...
4997,"Ikea to invest â¹5,000 cr, create 8,000 jobs ..."
4998,"GST on under-construction flats, houses may be..."


In [41]:
# Text Preprocessing

#removal of bracketed references (e.g. [12], [abc]) and parenthesised text

def cleaning(matter):
    """Strip bracketed and parenthesised fragments from each string in `matter`.

    For every string the following spans are replaced by a single space, in
    this order: ``[digits]``, ``[letters]`` and ``(anything without a ')')``.
    After each replacement, runs of whitespace are collapsed to one space.

    Parameters:
        matter: iterable of strings.

    Returns:
        list of cleaned strings, in the same order as the input.
    """
    # Order matters: it mirrors the sequence in which the spans are removed.
    span_patterns = (r'\[[0-9]*\]', r'\[[a-zA-Z]*\]', r'\(([^)]*)\)')
    results = []
    for content in matter:
        current = content
        for pattern in span_patterns:
            current = re.sub(pattern, ' ', current)
            # Collapse whitespace left behind by the removal.
            current = re.sub(r'\s+', ' ', current)
        results.append(current)
    return results
    

In [42]:
# Apply the cleaning pass to the article bodies and to the headlines.
p_text = cleaning(text_review["text"])
p_headlines = cleaning(head_lines["headlines"])

In [43]:
#contractions
#https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Lookup table mapping an English contraction to its expanded form.
# NOTE(review): several values hold more than one candidate expansion joined
# by " / " (e.g. "he's"); code that substitutes a value gets the whole string,
# slashes included.
# NOTE(review): the keys "I'd", "I'd've", "I'll", "I'll've", "I'm" and "I've"
# start with an upper-case "I" while every other key is lower-case, so lookups
# need to account for case.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [55]:
#lower-casing and contractions replacing
# NOTE: the English stop-word list is loaded here but no stop words are
# removed by this cell.
sw=nltk.corpus.stopwords.words("english")


def expand_contractions(sentences, mapping):
    """Lower-case every word and replace known contractions by their expansion.

    Parameters:
        sentences: iterable of strings; each is split on whitespace.
        mapping: dict of contraction -> expansion.

    Returns:
        list of strings, one per input sentence, with words re-joined by a
        single space.

    The lookup is case-insensitive. Words are lower-cased before the lookup,
    so keys such as "I'm" could never match while the mapping was queried
    as-is; the keys (and, for consistent casing of the output, the values)
    are therefore lower-cased once up front.
    """
    lowered = {key.lower(): value.lower() for key, value in mapping.items()}
    expanded = []
    for sentence in sentences:
        words = (word.lower() for word in sentence.split())
        expanded.append(" ".join(lowered.get(word, word) for word in words))
    return expanded


# Same treatment for the article bodies and for the headlines.
text_final = expand_contractions(p_text, contractions)
text_headlines = expand_contractions(p_headlines, contractions)
print(type(text_final))
print(type(text_headlines))

<class 'list'>
<class 'list'>


In [52]:
#counting how often each token occurs across the texts and the headlines
word_count = {}
# Texts first, then headlines, so first-seen order of the keys is unchanged.
for sentence in text_final + text_headlines:
    for token in nltk.word_tokenize(sentence):
        word_count[token] = word_count.get(token, 0) + 1
print(len(word_count))#number of distinct tokens

25407


In [103]:
#introducing the word embeddings for the data
#preparing the data for word_vectorization: one list of tokens per sentence
temp1 = [nltk.word_tokenize(sentence) for sentence in text_final]
temp2 = [nltk.word_tokenize(sentence) for sentence in text_headlines]
total_text = temp1+temp2

from gensim.models import Word2Vec

# generating the word embeddings using the input text
# min_count = 1 keeps every word, including words that occur only once
# the vector dimension is left at the Word2Vec default (100)

word_embeddings = Word2Vec(total_text, min_count = 1)
# Fixed: this line used the undefined name `embeddings_model`, which raises
# NameError on a fresh kernel; the model trained above is `word_embeddings`.
# NOTE(review): `.wv.vocab` exists in gensim < 4.0; gensim 4.x replaced it
# with `.wv.key_to_index` -- confirm the installed version.
word_embeddings_vocab = word_embeddings.wv.vocab
type(word_embeddings) #word embeddings from input



gensim.models.word2vec.Word2Vec

In [104]:
#converting words to int
#only words that occur at least `threshold` times receive an id
threshold = 25

# Ids are handed out consecutively from 0, in the iteration order of word_count.
frequent_words = [word for word, count in word_count.items() if count >= threshold]
vocab_to_int = {word: index for index, word in enumerate(frequent_words)}
print(len(vocab_to_int))

# Special tokens assigned to the vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]

# Each special token takes the next free id
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Reverse lookup: integer id -> word
int_to_vocab = {index: word for word, index in vocab_to_int.items()}

print(len(int_to_vocab))


1869
1873


In [106]:

# Width of one embedding row; it has to equal the vector size of the trained
# Word2Vec model, otherwise the row assignment below fails.
embedding_dim = 100
words_size = len(vocab_to_int)

# Creating an embedding matrix with default values of zero
word_embedding_matrix = np.zeros((words_size,embedding_dim),dtype=np.float32)

# Query the KeyedVectors (`.wv`) rather than the model object itself:
# `word in model` / `model[word]` are deprecated in gensim 3.x and no longer
# available in gensim 4.x, whereas the `.wv` forms work in both.
word_vectors = word_embeddings.wv
for word,i in vocab_to_int.items():
    # Words the model has no vector for keep their all-zero row.
    if word in word_vectors:
        word_embedding_matrix[i] = word_vectors[word]


# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))
print(word_embedding_matrix)

1873
[[-1.2565484  -0.04227275  0.05313071 ... -0.3204583   1.342404
   0.55143905]
 [-0.953348   -1.5597372   0.5479414  ... -0.22231664  1.2996067
   0.09413171]
 [-0.8027061  -0.94861317  1.1931536  ... -0.5199362   0.8372211
   0.18912937]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


  
  if __name__ == '__main__':
