# Search Engine for Medium Articles

- **Tokenization**
- **Word Co Occurence Matrix**
- **Continouous Bag of Words (CBoW)**
- **Word2Vec**
- **Search Articles**

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim

import nltk, re, string, contractions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
raw_data = pd.read_csv(r'F:\Muthu_2023\Personal\NextStep\NLP\NLP\Dataset\medium_articles_v3.csv')
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5


In [164]:
raw_data = raw_data.drop(66)

`As per Analysis, Article 67 contains 10000+ unique words due to the presence of names of google scholars`

## Text Preprocessing

In [151]:
def text_preprocess(text):
    sent_tokens = sent_tokenize(text)
    stop_words = stopwords.words('English')
    sent_processed = []
    for sent in sent_tokens:
        sent = re.sub(r'[^a-zA-Z0-9 ]',' ', contractions.fix(sent.lower()))
        sent = re.sub(r'https://[^\s\n\r]+', '', sent) #Remove links
        sent = re.sub(r'http://[^\s\n\r]+', '', sent)
        sent = re.sub(r'[^a-zA-Z0-9 ]',' ', sent)
        word_list = []
        for word in sent.split():
            if word not in stop_words and len(word.strip()) > 1 and not word.isnumeric() and not bool(re.search(r'\d', word)) and len(word.strip()) < 20:
                word_list.append(word)
        if len(word_list)>0:
            sent_processed.append(' '.join(word_list))
    return(sent_processed)

In [165]:
raw_data['transformed_text'] = raw_data['text'].apply(text_preprocess)
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id,transformed_text
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1,"[post co written baptiste rocca, unity strengt..."
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2,[machine learning performance measurement esse...
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3,[microsoft common objects context dataset coco...
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4,[statistics machine learning number attributes...
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5,"[attention need said, robust convolution, hack..."


In [109]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   link              208 non-null    object
 1   title             208 non-null    object
 2   sub_title         208 non-null    object
 3   author            208 non-null    object
 4   reading_time      208 non-null    int64 
 5   text              208 non-null    object
 6   id                208 non-null    int64 
 7   transformed_text  208 non-null    object
dtypes: int64(2), object(6)
memory usage: 13.1+ KB


In [110]:
raw_data['id'].nunique()

208

## Word Co Occurence Matrix

In [166]:
sent_list = raw_data['transformed_text'].explode()
voc_list = sent_list.str.split().explode().unique()
print('Vocabulary Size: ', len(voc_list), 'No. of sentences: ', len(sent_list))

In [240]:
d = {}
for sentence in sent_list:
    words = sentence.split()
    for i in range(len(words)-2):
        if (words[i], words[i+1]) not in d:
            if (words[i+1], words[i]) not in d:
                d[(words[i], words[i+1])] = 1
            else:
                d[(words[i+1], words[i])] += 1
        else:
            d[(words[i], words[i+1])] += 1
            
        if (words[i], words[i+2]) not in d:
            if (words[i+2], words[i]) not in d:
                d[(words[i], words[i+2])] = 1
            else:
                d[(words[i+2], words[i])] += 1
        else:
            d[(words[i], words[i+2])] += 1

In [242]:
d

{('post', 'co'): 4,
 ('post', 'written'): 7,
 ('co', 'written'): 3,
 ('co', 'baptiste'): 3,
 ('written', 'baptiste'): 5,
 ('written', 'rocca'): 5,
 ('old', 'saying'): 1,
 ('old', 'expresses'): 1,
 ('saying', 'expresses'): 1,
 ('saying', 'pretty'): 1,
 ('expresses', 'pretty'): 1,
 ('expresses', 'well'): 1,
 ('pretty', 'well'): 5,
 ('pretty', 'underlying'): 1,
 ('well', 'underlying'): 1,
 ('well', 'idea'): 1,
 ('underlying', 'idea'): 1,
 ('underlying', 'rules'): 1,
 ('idea', 'rules'): 3,
 ('idea', 'powerful'): 2,
 ('rules', 'powerful'): 1,
 ('rules', 'ensemble'): 1,
 ('powerful', 'ensemble'): 1,
 ('powerful', 'methods'): 1,
 ('ensemble', 'methods'): 7,
 ('ensemble', 'machine'): 2,
 ('methods', 'machine'): 4,
 ('methods', 'learning'): 7,
 ('roughly', 'ensemble'): 1,
 ('roughly', 'learning'): 1,
 ('ensemble', 'learning'): 9,
 ('learning', 'often'): 3,
 ('methods', 'often'): 1,
 ('methods', 'trust'): 1,
 ('often', 'trust'): 1,
 ('often', 'top'): 2,
 ('trust', 'top'): 1,
 ('trust', 'rankings

In [228]:
x_list = []
y_list = []
for sentence in sent_list:
    words = sentence.split()
    for ind in range(len(words)):
        pair_list = []
        for sub_ind in range(ind - 2, ind + 3):
            if sub_ind != ind and sub_ind >= 0 and sub_ind < len(words):
                pair_list.append(words[sub_ind])                
        if len(pair_list) > 0:
            x_list.append(pair_list)
            y_list.append([words[ind]])

In [229]:
len(x_list), len(y_list)

(244928, 244928)

In [231]:
mlb = MultiLabelBinarizer(classes = voc_list, sparse_output=True) # Generates Multi label Encoding with sparse output

In [232]:
mlb.fit_transform(x_list)

<244928x19511 sparse matrix of type '<class 'numpy.int32'>'
	with 818906 stored elements in Compressed Sparse Row format>

In [233]:
mlb.fit_transform(y_list)

<244928x19511 sparse matrix of type '<class 'numpy.int32'>'
	with 244928 stored elements in Compressed Sparse Row format>

In [234]:
mlb.classes_

array(['post', 'co', 'written', ..., 'serotonin', 'gobbled', 'critic'],
      dtype=object)