# Search Engine for Medium Articles

- **Tokenization**
- **Word Co Occurence Matrix**
- **Continouous Bag of Words (CBoW)**
- **Word2Vec**
- **Search Articles**

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim

import nltk, re, string, contractions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
raw_data = pd.read_csv(r'F:\Muthu_2023\Personal\NextStep\NLP\NLP\Dataset\medium_articles_v3.csv')
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5


In [164]:
raw_data = raw_data.drop(66)

`As per Analysis, Article 67 contains 10000+ unique words due to the presence of names of google scholars`

## Text Preprocessing

In [151]:
def text_preprocess(text):
    sent_tokens = sent_tokenize(text)
    stop_words = stopwords.words('English')
    sent_processed = []
    for sent in sent_tokens:
        sent = re.sub(r'[^a-zA-Z0-9 ]',' ', contractions.fix(sent.lower()))
        sent = re.sub(r'https://[^\s\n\r]+', '', sent) #Remove links
        sent = re.sub(r'http://[^\s\n\r]+', '', sent)
        sent = re.sub(r'[^a-zA-Z0-9 ]',' ', sent)
        word_list = []
        for word in sent.split():
            if word not in stop_words and len(word.strip()) > 1 and not word.isnumeric() and not bool(re.search(r'\d', word)) and len(word.strip()) < 20:
                word_list.append(word)
        if len(word_list)>0:
            sent_processed.append(' '.join(word_list))
    return(sent_processed)

In [165]:
raw_data['transformed_text'] = raw_data['text'].apply(text_preprocess)
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id,transformed_text
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1,"[post co written baptiste rocca, unity strengt..."
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2,[machine learning performance measurement esse...
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3,[microsoft common objects context dataset coco...
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4,[statistics machine learning number attributes...
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5,"[attention need said, robust convolution, hack..."


In [109]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   link              208 non-null    object
 1   title             208 non-null    object
 2   sub_title         208 non-null    object
 3   author            208 non-null    object
 4   reading_time      208 non-null    int64 
 5   text              208 non-null    object
 6   id                208 non-null    int64 
 7   transformed_text  208 non-null    object
dtypes: int64(2), object(6)
memory usage: 13.1+ KB


In [110]:
raw_data['id'].nunique()

208

## Word Co Occurence Matrix

In [166]:
sent_list = raw_data['transformed_text'].explode()
voc_list = sent_list.str.split().explode().unique()
print('Vocabulary Size: ', len(voc_list), 'No. of sentences: ', len(sent_list))

In [240]:
d = {}
for sentence in sent_list:
    words = sentence.split()
    for i in range(len(words)-2):
        if (words[i], words[i+1]) not in d:
            if (words[i+1], words[i]) not in d:
                d[(words[i], words[i+1])] = 1
            else:
                d[(words[i+1], words[i])] += 1
        else:
            d[(words[i], words[i+1])] += 1
            
        if (words[i], words[i+2]) not in d:
            if (words[i+2], words[i]) not in d:
                d[(words[i], words[i+2])] = 1
            else:
                d[(words[i+2], words[i])] += 1
        else:
            d[(words[i], words[i+2])] += 1

In [228]:
x_list = []
y_list = []
for sentence in sent_list:
    words = sentence.split()
    for ind in range(len(words)):
        pair_list = []
        for sub_ind in range(ind - 2, ind + 3):
            if sub_ind != ind and sub_ind >= 0 and sub_ind < len(words):
                pair_list.append(words[sub_ind])                
        if len(pair_list) > 0:
            x_list.append(pair_list)
            y_list.append([words[ind]])

In [229]:
len(x_list), len(y_list)

(244928, 244928)

In [231]:
mlb = MultiLabelBinarizer(classes = voc_list, sparse_output=True) # Generates Multi label Encoding with sparse output

In [257]:
xtrain = mlb.fit_transform(x_list)

In [258]:
ytrain = mlb.fit_transform(y_list)

In [234]:
mlb.classes_

array(['post', 'co', 'written', ..., 'serotonin', 'gobbled', 'critic'],
      dtype=object)

In [262]:
xtrain.to_coo()

AttributeError: to_coo not found

## Prepare Model

In [247]:
import tensorflow as tf

In [252]:
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import Sequential

In [250]:
vec_size = 10
voc_size = len(voc_list)

In [255]:
model = Sequential()
model.add(InputLayer(input_shape=(voc_size,), sparse=True))
model.add(Dense(vec_size, activation='relu'))
model.add(Dense(voc_size, activation='softmax'))

In [256]:
model.compile(optimizer='adam', loss='SparseCategoricalCrossentropy', metrics='accuracy')

In [276]:
model.fit(csr_matrix.sorted_indices(xtrain), csr_matrix.sorted_indices(ytrain), epochs=1, verbose=1)

TypeError: in user code:

    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\training.py", line 1051, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\training.py", line 1109, in compute_loss
        return self.compiled_loss(
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\losses.py", line 2078, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "C:\Users\ADMIN\anaconda3\Lib\site-packages\keras\backend.py", line 5607, in sparse_categorical_crossentropy
        target = tf.convert_to_tensor(target)

    TypeError: Failed to convert elements of SparseTensor(indices=Tensor("DeserializeSparse_1:0", shape=(None, 2), dtype=int64), values=Tensor("DeserializeSparse_1:1", shape=(None,), dtype=int32), dense_shape=Tensor("stack_1:0", shape=(2,), dtype=int64)) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


In [243]:
sentences = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect
we conjure the spirits of the computer with our spells.""".lower()

In [244]:
words = sentences.split()
vocab = set(words)

In [245]:
data = []
for i in range(2, len(words) - 2):
    context = [words[i - 2], words[i - 1], words[i + 1], words[i + 2]]
    target = words[i]
    data.append((context, target))

In [263]:
tf.sparse.reorder(xtrain)

TypeError: Input must be a SparseTensor.

In [265]:
from scipy.sparse import csr_matrix

In [274]:
xtrain_ = csr_matrix.sorted_indices(xtrain)

In [275]:
xtrain_

<244928x19511 sparse matrix of type '<class 'numpy.int32'>'
	with 818906 stored elements in Compressed Sparse Row format>

# Word2Vec

In [279]:
from gensim.models import Word2Vec

In [None]:
#Generate list of list
sentence_list = []
for sent in sent_list:
    word_list = []
    for word in sent.split():
        word_list.append(word)
    sentence_list.append(word_list)

In [353]:
model = Word2Vec(sentence_list, window=2, vector_size=100, sg=0, min_count=0)

In [358]:
model.build_vocab(sentence_list, progress_per=10000)

In [359]:
model.train(sentence_list, total_examples=model.corpus_count, epochs=10)

(2414913, 2458680)

In [360]:
model.wv.most_similar(positive=["learning"])

[('vending', 0.9781566262245178),
 ('washing', 0.9589352011680603),
 ('translation', 0.9211783409118652),
 ('slot', 0.9177753925323486),
 ('elliptical', 0.9150514602661133),
 ('dug', 0.9093351364135742),
 ('clawed', 0.9092390537261963),
 ('cpap', 0.9047977328300476),
 ('repair', 0.9045436382293701),
 ('mastercard', 0.903074324131012)]

In [362]:
model.wv['learning']

array([ 9.24759984e-01,  8.64426970e-01,  1.09302449e+00,  7.42402256e-01,
       -2.15008545e+00, -1.08801043e+00,  6.40723467e-01,  4.89838272e-01,
       -6.58541799e-01, -1.69573104e+00, -8.41604590e-01, -1.49746466e+00,
        4.63539332e-01,  2.73074299e-01,  1.28581202e+00, -1.45257592e+00,
        8.89300048e-01, -1.25001347e+00, -1.33422530e+00, -1.85545683e+00,
        2.47237757e-01,  6.67589486e-01,  4.34866488e-01, -3.99671756e-02,
       -1.36693752e+00,  4.69376028e-01, -1.59108460e+00,  5.23888886e-01,
       -5.02065718e-01,  8.63543868e-01,  5.00534594e-01, -7.08035588e-01,
        1.96976960e-01, -2.09720635e+00, -5.82256317e-01,  9.67792749e-01,
        9.61209610e-02, -7.44150400e-01, -1.66353607e+00, -6.20042861e-01,
       -1.34150818e-01,  8.35871339e-01,  6.93104386e-01,  7.01354086e-01,
        7.66724423e-02, -1.66193402e+00, -1.63881516e+00, -1.37015915e+00,
        1.44539297e+00,  1.12358975e+00, -1.22417712e+00, -9.21171904e-01,
        1.47957332e-03, -

In [383]:
# Find Centroid
raw_data['Centroid_cbow'] = [[0.0] * 100] * raw_data.shape[0]
for index in range(len(raw_data)):
    centroid = np.array([0.0] * 100)
    article = raw_data['transformed_text'].iloc[index]
    for sent in article:
        for word in sent.split():
            try:
                centroid = np.add(centroid, model.wv[word])
            except:
                continue
    raw_data['Centroid_cbow'].iloc[index] = centroid.tolist()

In [382]:
for index in range(len(raw_data)):
    cent_article = raw_data['Centroid_cbow'].iloc[index]
    cos_sim = 0
    for word in query:
        cos_sim += (cosine_similarity(model.wv[word], cent_article))
    cos_sim_list(row[index])