# Search Engine for Medium Articles

- **Tokenization**
- **Word Co Occurence Matrix**
- **Continouous Bag of Words (CBoW)**
- **Word2Vec**
- **Search Articles**

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim

import nltk, re, string, contractions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [4]:
raw_data = pd.read_csv(r'F:\Muthu_2023\Personal\NextStep\NLP\NLP\Dataset\medium_articles_v3.csv')
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5


## Text Preprocessing

In [71]:
def text_preprocess(text):
    sent_tokens = sent_tokenize(text)
    stop_words = stopwords.words('English')
    sent_processed = []
    for sent in sent_tokens:
        sent = re.sub(r'[^a-zA-Z0-9 ]',' ', contractions.fix(sent.lower()))
        word_list = []
        for word in sent.split():
            if word not in stop_words and len(word.strip()) > 1:
                word_list.append(word)
        if len(word_list)>0:
            sent_processed.append(' '.join(word_list))
    return(sent_processed)

In [72]:
raw_data['transformed_text'] = raw_data['text'].apply(text_preprocess)
raw_data.head()

Unnamed: 0,link,title,sub_title,author,reading_time,text,id,transformed_text
0,https://towardsdatascience.com/ensemble-method...,"Ensemble methods: bagging, boosting and stacking",Understanding the key concepts of ensemble lea...,Joseph Rocca,20,This post was co-written with Baptiste Rocca.\...,1,"[post co written baptiste rocca, unity strengt..."
1,https://towardsdatascience.com/understanding-a...,Understanding AUC - ROC Curve,"In Machine Learning, performance measurement i...",Sarang Narkhede,5,"In Machine Learning, performance measurement i...",2,[machine learning performance measurement esse...
2,https://towardsdatascience.com/how-to-work-wit...,How to work with object detection datasets in ...,"A comprehensive guide to defining, loading, ex...",Eric Hofesmann,10,Microsoft's Common Objects in Context dataset ...,3,[microsoft common objects context dataset coco...
3,https://towardsdatascience.com/11-dimensionali...,11 Dimensionality reduction techniques you sho...,Reduce the size of your dataset while keeping ...,Rukshan Pramoditha,16,"In both Statistics and Machine Learning, the n...",4,[statistics machine learning number attributes...
4,https://towardsdatascience.com/the-time-series...,The Time Series Transformer,Attention Is All You Need they said. Is it a m...,Theodoros Ntakouris,6,Attention Is All You Need they said. Is it a m...,5,"[attention need said, robust convolution, hack..."


## Word Co Occurence Matrix

In [78]:
sent_list = raw_data['transformed_text'].explode()
voc_list = sent_list.str.split().explode().unique()

In [79]:
len(voc_list)

30968

In [82]:
sent_list = raw_data['transformed_text'].explode()

In [83]:
len(sent_list)

26892

In [90]:
d = {}
for sentence in sent_list:
    words = sentence.split()
    for i in range(len(words)-1):
        if (words[i], words[i+1]) not in d and (words[i+1], words[i]) not in d:
            d[(words[i], words[i+1])] = 1

In [86]:
if ('Muthu', 'kumar1') in d:
    print('1')

In [91]:
len(d)

190509

In [98]:
temp_df = pd.DataFrame()
temp_df['sentences'] = raw_data['transformed_text'].explode().str.split()

In [99]:
temp_df

Unnamed: 0,sentences
0,"[post, co, written, baptiste, rocca]"
0,"[unity, strength]"
0,"[old, saying, expresses, pretty, well, underly..."
0,"[roughly, ensemble, learning, methods, often, ..."
0,"[purpose, post, introduce, various, notions, e..."
...,...
207,"[cannot, eat, way, every, day, month, stay, ke..."
207,"[written, week, week, guide, eating, accordanc..."
207,"[bottom, line, use, menstrual, cycle, lose, we..."
207,"[created, quick, start, guide, including, meal..."


In [100]:
pd.get_dummies(temp_df['sentences'])

TypeError: unhashable type: 'list'

In [101]:
diff = pd.DataFrame({'R': ['a', 'c', 'd'], 
                     'T': ['d', 'a', 'c'],
                     'S_': [1, 2, 3]})
 
print(pd.get_dummies(diff, prefix=['column1', 'column2']))

   S_  column1_a  column1_c  column1_d  column2_a  column2_c  column2_d
0   1          1          0          0          0          0          1
1   2          0          1          0          1          0          0
2   3          0          0          1          0          1          0


In [102]:
diff

Unnamed: 0,R,T,S_
0,a,d,1
1,c,a,2
2,d,c,3
