In [1]:
# Imports for the whole notebook, plus one-time spaCy pipeline setup
import requests, os, re
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import spacy
# Load the small English spaCy pipeline with the `ner` and `parser` components disabled
nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
from collections import defaultdict
from gensim.models import Word2Vec
# Raise the pipeline's max_length limit to 5,000,000 (presumably so very long rows are accepted - confirm)
nlp.max_length=5000000
# NOTE(review): `re` is already imported on the first import line; this repeat is redundant
import re
# Project-local module; its `contractions_dict` is used below as a contraction -> expansion mapping
from contractions_dict import contractions_dict

### Read the scraped data from the Excel sheet

In [2]:
# Load the spreadsheet into a DataFrame; the cells below operate on its `text` column
df = pd.read_excel("final_unt.xlsx")

#### Applying Basic Cleaning of Data
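1. Convert the entire text column to strings, in case any rows contain only numbers
2. Convert the text to lower case
3. Expand contractions
4. Remove URLs, along with non-letter characters and tokens that contain digits
5. Collapse runs of multiple spaces into a single space
6. Reduce words to their root form by lemmatizing (stop words are dropped in the same step)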

In [4]:
# Cast every row to str first (so numeric-only rows become text), then lower-case it
df.text = df.text.astype('str').str.lower()

In [5]:
# One alternation pattern covering every contraction key.
# - Keys are passed through re.escape so they are matched literally.
# - Keys are ordered longest-first: `re` alternation takes the first alternative
#   that matches, so a short key listed early (e.g. "can't") would otherwise
#   pre-empt a longer key that starts with it (e.g. "can't've").
contr_re=re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(contractions_dict.keys(), key=len, reverse=True)))

In [6]:
def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expansion.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level
        mapping, in which case the precompiled ``contr_re`` is reused.

    Returns
    -------
    str
        ``text`` with each matched contraction substituted.
    """
    # An empty mapping would yield a pattern that matches the empty string
    # everywhere and then fail the lookup, so there is nothing to do.
    if not contractions_dict:
        return text

    if contractions_dict is expand_contractions.__defaults__[0]:
        # Default mapping: reuse the pattern compiled once at module level.
        pattern = contr_re
    else:
        # Custom mapping: the pattern must be built from *its* keys, otherwise
        # matches come from the global keys and the lookup below can KeyError.
        # Longest keys go first because alternation takes the first match.
        keys = sorted(contractions_dict, key=len, reverse=True)
        pattern = re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))

    def replace(match):
        return contractions_dict[match.group(0)]
    return pattern.sub(replace, text)

In [8]:
# Expand contractions row by row (the function is passed directly; no wrapper lambda needed)
df.text = df.text.apply(expand_contractions)

In [9]:
# Replace with a single space, in this order of precedence:
#   1. URLs                         (http\S+)
#   2. whole tokens holding a digit (\w*\d\w*)
#   3. any other non a-z character  ([^a-z])
# The digit-token alternative must come before [^a-z]; otherwise a token that
# starts with a digit (e.g. "1st") only loses the digit and leaves "st" behind.
# The pattern is a raw string so \S, \w and \d reach the regex engine as
# written instead of being treated as (invalid) string escape sequences.
df.text = df.text.str.replace(r'http\S+|\w*\d\w*|[^a-z]',' ',regex=True)

In [10]:
# Squeeze every run of spaces down to one space
df.text = df.text.str.replace(' +',' ',regex=True)

In [11]:
def lemmatize(text):
    """Return the lemmas of all non-stop-word tokens in ``text``, joined by spaces.

    The text is tokenised with the module-level ``nlp`` pipeline.
    """
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not token.is_stop)

In [12]:
# Lemmatize each row and drop its stop words
df.text = df.text.map(lemmatize)

#### Creating a Word2Vec model, which is a vector space model

In [13]:
# One token list per row: split each cleaned text on whitespace
sentences = list(map(str.split, df['text']))

In [14]:
w2v_model = Word2Vec(min_count=200,
                     window=5,
                     vector_size=100,
                     workers=4)
# Scan the corpus once to build the model vocabulary
w2v_model.build_vocab(sentences)
# Train the word vectors
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
# Scale every word vector to unit length, in place. This is what the deprecated
# init_sims(replace=True) did in gensim 4; it is no longer needed for memory
# efficiency, and the model should not be trained further after this step.
w2v_model.wv.unit_normalize_all()

  w2v_model.init_sims(replace=True)


In [15]:
# Sanity check: list the ten vocabulary entries closest to "education"
w2v_model.wv.most_similar(positive=['education'], topn=10)

[('quality', 0.5562518239021301),
 ('educator', 0.5258485078811646),
 ('ed', 0.514979362487793),
 ('educational', 0.512590765953064),
 ('teacher', 0.46883636713027954),
 ('counseling', 0.4491509199142456),
 ('preparation', 0.44886964559555054),
 ('psychology', 0.44762682914733887),
 ('achieve', 0.4087194800376892),
 ('certification', 0.3998575806617737)]

#### Saving the model

In [16]:
# Write the trained model to "word2vec.model" (a path relative to the working directory)
w2v_model.save("word2vec.model")