# Custom Trained Word2Vec Model

- without nonalphanumeric characters removal in preprocessing step.


In [1]:
import gensim
import pandas as pd
import numpy as np
import re
import bs4
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
df = pd.read_csv('Final_df.csv')

In [3]:
df.shape, df.columns

((1000000, 9),
 Index(['PostTypeId', 'Question_Id', 'Title', 'Tags', 'AnswerCount',
        'ViewCount', 'Body', 'Score', 'Answer_corpus'],
       dtype='object'))

## Model Training - On Question Corpus
### Ideal - Question corpus + answer corpus but memory constraints didnt allow.

In [5]:
%%time
df['Question_Corpus'] = df['Title'] + " " + df['Body']

CPU times: user 815 ms, sys: 772 ms, total: 1.59 s
Wall time: 1.62 s


In [6]:
df['Question_Corpus'].iloc[0]

"How do I calculate someone's age based on a DateTime type birthday? <p>Given a <code>DateTime</code> representing a person's birthday, how do I calculate their age in years?</p>\n"

## Data Preprocessing
- Removing html tags from question corpus
- Converting text to lowercase
- Text decontraction
- Not removing any punctuations as '+' character is useful as we have questions tagged as c++, c#, .net etc

In [3]:
# # https://stackoverflow.com/a/47091490/4084039
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    #phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def text_preprocessing(text):
    '''This function does text preprocessing 
       It includes removal of html tags,
       converting to lowercase, 
       decontraction and 
       removal of any non alphanumeric characters.
       
       Function takes one parameter - text
       returns - preprocessed text
    '''
    # Remove html tags from question corpus
    text = bs4.BeautifulSoup(text, 'lxml').get_text()
    # Convert each word to lowercase
    text = text.lower()
    # text decontraction. eg: won't to will not. Can't to cannot
    text = decontracted(text)
    # Remove any non-alphanumeric characters if present
    #text = re.sub('\W', ' ',text).strip()
    return text

In [8]:
df.columns

Index(['PostTypeId', 'Question_Id', 'Title', 'Tags', 'AnswerCount',
       'ViewCount', 'Body', 'Score', 'Answer_corpus', 'Question_Corpus'],
      dtype='object')

In [9]:
%%time
df['Cleaned_corpus'] = df['Question_Corpus'].progress_apply(lambda x: text_preprocessing(x))

100%|██████████| 1000000/1000000 [09:14<00:00, 1802.17it/s]

CPU times: user 9min 11s, sys: 5.95 s, total: 9min 17s
Wall time: 9min 14s





In [10]:
df['Cleaned_corpus'].head()

0    how do i calculate someone's age based on a da...
1    calculate relative time in c# given a specific...
2    determine a user's timezone is there a standar...
3    difference between math.floor() and math.trunc...
4    filling a dataset or a datatable from a linq q...
Name: Cleaned_corpus, dtype: object

### W2vec accepts input in list of list format where inner lists should contain word tokens

In [11]:
%%time
corpus = df['Cleaned_corpus'].str.split().tolist()
len(corpus)

CPU times: user 20.8 s, sys: 3.91 s, total: 24.7 s
Wall time: 24.7 s


1000000

In [12]:
# To discover system cores for parallel processing
#https://stackoverflow.com/questions/53417258/what-is-workers-parameter-in-word2vec-in-nlp
from gensim.utils import effective_n_jobs
effective_n_jobs(-1)

8

## Model Training

In [13]:
%%time
#https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92
#https://machinelearningmastery.com/develop-word-embeddings-python-gensim/


#size: The number of dimensions of the embeddings and the default is 100.
#window: The maximum distance between a target word and words around the target word. The default window is 5.
#min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
#workers: The number of partitions during training and the default workers is 3.
#sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.

model1 = Word2Vec(corpus, min_count=2,size= 300,workers=8, window =5, sg=0)

INFO - 08:49:49: collecting all words and their counts
INFO - 08:49:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 08:49:49: PROGRESS: at sentence #10000, processed 1124216 words, keeping 88758 word types
INFO - 08:49:49: PROGRESS: at sentence #20000, processed 2246839 words, keeping 153620 word types
INFO - 08:49:50: PROGRESS: at sentence #30000, processed 3421690 words, keeping 217748 word types
INFO - 08:49:50: PROGRESS: at sentence #40000, processed 4614264 words, keeping 280935 word types
INFO - 08:49:50: PROGRESS: at sentence #50000, processed 5828887 words, keeping 345263 word types
INFO - 08:49:51: PROGRESS: at sentence #60000, processed 7103150 words, keeping 410973 word types
INFO - 08:49:51: PROGRESS: at sentence #70000, processed 8377956 words, keeping 476008 word types
INFO - 08:49:51: PROGRESS: at sentence #80000, processed 9679157 words, keeping 540692 word types
INFO - 08:49:52: PROGRESS: at sentence #90000, processed 10993264 words, keepin

CPU times: user 49min 15s, sys: 15.4 s, total: 49min 30s
Wall time: 17min 7s


In [14]:
len(model1.wv.vocab.keys())

1747920

### Vocabulary of one million seven hundred forty-seven thousand nine hundred twenty words is built.

In [15]:
print(model1['python'])

[ 1.50918806e+00 -1.85878778e+00  1.61693120e+00  2.58832645e+00
 -3.20206618e+00 -4.03099149e-01 -2.53829193e+00  2.51184225e+00
  6.77844882e-01 -3.08668882e-01  6.47953868e-01 -7.95691162e-02
 -1.22440958e+00  8.85386392e-02  2.76785374e+00  2.49264717e+00
 -2.98029542e+00 -1.81757462e+00 -2.50846952e-01  1.28881061e+00
 -4.94186550e-01  1.06023125e-01 -2.00635102e-03 -3.80804509e-01
 -1.69312894e+00  1.68917775e+00 -1.70153886e-01  1.94352436e+00
  5.18117785e-01  2.14713526e+00 -2.00733495e+00  8.20280492e-01
  3.09260297e+00  2.60101169e-01  3.47028518e+00  6.12944543e-01
  5.75977445e-01 -9.98596489e-01  1.65202844e+00  1.21345162e+00
 -1.18380710e-01  1.46645471e-01 -1.73597023e-01 -6.14722490e-01
  1.12078631e+00 -5.17284930e-01 -1.31699252e+00  1.49792254e+00
  1.71648669e+00 -3.87544107e+00 -7.86019683e-01  9.54831541e-01
 -8.34102690e-01  2.59639645e+00 -3.84834260e-01 -1.28887141e+00
 -1.10986829e+00 -7.00336695e-02  2.26638699e+00 -1.47424843e-02
  1.12565482e+00  6.24476

  if __name__ == '__main__':


### Save trained model

In [None]:
model1.save("model/custom_trained_w2v/word2vec_v2.model")

INFO - 09:06:56: saving Word2Vec object under model/custom_trained_w2v/word2vec_v2.model, separately None
INFO - 09:06:56: storing np array 'syn1neg' to model/custom_trained_w2v/word2vec_v2.model.trainables.syn1neg.npy
INFO - 09:07:14: storing np array 'vectors' to model/custom_trained_w2v/word2vec_v2.model.wv.vectors.npy


### Load custom trained w2vec model

In [4]:
%%time
loaded_model = Word2Vec.load("model/custom_trained_w2v/word2vec_v2.model")

INFO - 09:10:17: loading Word2Vec object from model/custom_trained_w2v/word2vec_v2.model
INFO - 09:10:23: loading trainables recursively from model/custom_trained_w2v/word2vec_v2.model.trainables.* with mmap=None
INFO - 09:10:23: loading syn1neg from model/custom_trained_w2v/word2vec_v2.model.trainables.syn1neg.npy with mmap=None
INFO - 09:10:40: loading vocabulary recursively from model/custom_trained_w2v/word2vec_v2.model.vocabulary.* with mmap=None
INFO - 09:10:40: loading wv recursively from model/custom_trained_w2v/word2vec_v2.model.wv.* with mmap=None
INFO - 09:10:40: loading vectors from model/custom_trained_w2v/word2vec_v2.model.wv.vectors.npy with mmap=None
INFO - 09:10:56: setting ignored attribute vectors_norm to None
INFO - 09:10:56: setting ignored attribute cum_table to None
INFO - 09:10:56: loaded model/custom_trained_w2v/word2vec_v2.model


CPU times: user 10.6 s, sys: 3.3 s, total: 13.9 s
Wall time: 44.4 s


In [8]:
#https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.most_similar.html
# method uses cosine similarity
loaded_model.wv.most_similar('dictionary', topn=15)

[('dictionary,', 0.8161787986755371),
 ('dict', 0.7764616012573242),
 ('hashtable', 0.7649470567703247),
 ('dictionary.', 0.7405421733856201),
 ('hashmap', 0.7313495874404907),
 ('dictionary?', 0.7255147695541382),
 ('dictionaries', 0.7158240079879761),
 ('tuple', 0.6831574440002441),
 ('collection', 0.6677265167236328),
 ('hashset', 0.6446368098258972),
 ('keyvaluepair', 0.636683464050293),
 ('array', 0.6243866086006165),
 ('dictionary:', 0.6164132356643677),
 ('sorteddictionary', 0.6145343780517578),
 ('dictionaries,', 0.6042093634605408)]

### Its amazing to see how similar words include soreteddictionary,hashtables,keyvaluepair.


In [6]:
loaded_model.wv.most_similar('python', topn=10)

[('perl', 0.758644700050354),
 ('python,', 0.7367944121360779),
 ('python.', 0.7332587242126465),
 ('python?', 0.6904996633529663),
 ('ruby', 0.6825644969940186),
 ('haskell', 0.6816902756690979),
 ("python's", 0.6795947551727295),
 ('jython', 0.6729694604873657),
 ('tcl', 0.6706594228744507),
 ('bash', 0.6340200901031494)]

### Similar words included:-
- Jython (Java implementation of python)
- cython (c extension for python)
- ipython (browser based notebook)

In [7]:
#print(cosine_similarity(np.array(loaded_model['python']).reshape(1,-1), np.array(loaded_model['pandas']).reshape(1,-1)))
print(cosine_similarity(np.array(loaded_model['python']).reshape(1,-1), np.array(loaded_model['dictionary']).reshape(1,-1)))
print(cosine_similarity(np.array(loaded_model['html']).reshape(1,-1), np.array(loaded_model['css']).reshape(1,-1)))

[[0.30442676]]
[[0.5608]]


  from ipykernel import kernelapp as app
  app.launch_new_instance()


### Download pre-trained glove and w2v model for comparison

In [20]:
# https://nlp.stanford.edu/projects/glove/
# stanfords pre-trained 300 dim glove vectors

#!wget --header="Host: doc-0c-0k-docs.googleusercontent.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://drive.google.com/" --header="Cookie: AUTH_9a0rnn520492vdm9l67ltu9t8cpotig0_nonce=0d2fulvulvjhm" --header="Connection: keep-alive" "https://doc-0c-0k-docs.googleusercontent.com/docs/securesc/ssq6alpsc1e8bsr56d67t9bugoing32d/vqpvemgir6vnf7k8mnpih7l7jhjo6bmn/1636535850000/00484516897554883881/10996415371724027994/1lDca_ge-GYO0iQ6_XDLWePQFMdAA2b8f?e=download&authuser=0&nonce=0d2fulvulvjhm&user=10996415371724027994&hash=rcthh8u4u0rerd2j89a2l1tgd5cs33ff" -c -O 'glove_vectors'

In [21]:
# https://developer.syn.co.in/tutorial/bot/oscova/pretrained-vectors.html
# Pre-trained w2v trained on google news dataset
#!wget --header="Host: doc-10-5k-docs.googleusercontent.com" --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" --header="Accept-Language: en-US,en;q=0.9" --header="Referer: https://drive.google.com/" --header="Cookie: AUTH_9a0rnn520492vdm9l67ltu9t8cpotig0_nonce=oencppjtu4ofo" --header="Connection: keep-alive" "https://doc-10-5k-docs.googleusercontent.com/docs/securesc/ssq6alpsc1e8bsr56d67t9bugoing32d/545v2gosgsltblk4h005n0bpitncgc2d/1636537125000/06848720943842814915/10996415371724027994/0B7XkCwpI5KDYNlNUTTlSS21pQmM?e=download&authuser=0&nonce=oencppjtu4ofo&user=10996415371724027994&hash=m9u0ofet090md869bufn7nab4bitv4pj" -c -O 'GoogleNews-vectors-negative300.bin.gz'

## Model Loading

In [9]:
import pickle
from gensim.models import KeyedVectors
from gensim import models


#please use below code to load glove vectors
with open('model/glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())
    
loaded_model = Word2Vec.load("model/custom_trained_w2v/word2vec_v2.model")

word2vec_path = 'model/GoogleNews-vectors-negative300.bin.gz'
w2v_pretrained_model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

INFO - 09:11:54: loading Word2Vec object from model/custom_trained_w2v/word2vec_v2.model
INFO - 09:12:03: loading trainables recursively from model/custom_trained_w2v/word2vec_v2.model.trainables.* with mmap=None
INFO - 09:12:03: loading syn1neg from model/custom_trained_w2v/word2vec_v2.model.trainables.syn1neg.npy with mmap=None
INFO - 09:12:04: loading vocabulary recursively from model/custom_trained_w2v/word2vec_v2.model.vocabulary.* with mmap=None
INFO - 09:12:04: loading wv recursively from model/custom_trained_w2v/word2vec_v2.model.wv.* with mmap=None
INFO - 09:12:04: loading vectors from model/custom_trained_w2v/word2vec_v2.model.wv.vectors.npy with mmap=None
INFO - 09:12:05: setting ignored attribute vectors_norm to None
INFO - 09:12:05: setting ignored attribute cum_table to None
INFO - 09:12:05: loaded model/custom_trained_w2v/word2vec_v2.model
INFO - 09:12:12: loading projection weights from model/GoogleNews-vectors-negative300.bin.gz
INFO - 09:13:15: loaded (3000000, 300) m

## Model Comparison

In [18]:
print("Custom Trained W2v model")
#print(cosine_similarity(np.array(loaded_model['python']).reshape(1,-1), np.array(loaded_model['pandas']).reshape(1,-1)))
print("Similarity between python and dictionary: ",
      cosine_similarity(np.array(loaded_model['python']).reshape(1,-1), np.array(loaded_model['dictionary']).reshape(1,-1)))

print("Similarity between html and css: ",
      cosine_similarity(np.array(loaded_model['html']).reshape(1,-1), np.array(loaded_model['css']).reshape(1,-1)))

print("Similarity between dataframe and series: ",
      cosine_similarity(np.array(loaded_model['dataframe']).reshape(1,-1), np.array(loaded_model['series']).reshape(1,-1)))

print("Similarity between php and javascript: ",
      cosine_similarity(np.array(loaded_model['php']).reshape(1,-1), np.array(loaded_model['javascript']).reshape(1,-1)))
print("-------------------------------------------------------------------------------------------------------")
print("Pre-Trained Glove model: ")

#print(cosine_similarity(np.array(model['python']).reshape(1,-1), np.array(model['pandas']).reshape(1,-1)))
print("Similarity between python and dictionary: ",
      cosine_similarity(np.array(model['python']).reshape(1,-1), np.array(model['dictionary']).reshape(1,-1)))

print("Similarity between html and css: ",
      cosine_similarity(np.array(model['html']).reshape(1,-1), np.array(model['css']).reshape(1,-1)))

print("Similarity between python and series: ",
      cosine_similarity(np.array(model['python']).reshape(1,-1), np.array(model['series']).reshape(1,-1)))

print("Similarity between php and javascript: ",
      cosine_similarity(np.array(model['php']).reshape(1,-1), np.array(model['javascript']).reshape(1,-1)))

print("-------------------------------------------------------------------------------------------------------")
print("Pre-Trained W2V model: ")

#print(cosine_similarity(np.array(w2v_pretrained_model['python']).reshape(1,-1), np.array(w2v_pretrained_model['pandas']).reshape(1,-1)))
print("Similarity between python and dictionary: ",
      cosine_similarity(np.array(w2v_pretrained_model['python']).reshape(1,-1), np.array(w2v_pretrained_model['dictionary']).reshape(1,-1)))

print("Similarity between html and css: ",
      cosine_similarity(np.array(w2v_pretrained_model['html']).reshape(1,-1), np.array(w2v_pretrained_model['css']).reshape(1,-1)))

print("Similarity between python and series: ",
      cosine_similarity(np.array(w2v_pretrained_model['python']).reshape(1,-1), np.array(w2v_pretrained_model['series']).reshape(1,-1)))

print("Similarity between php and javascript: ",
      cosine_similarity(np.array(w2v_pretrained_model['php']).reshape(1,-1), np.array(w2v_pretrained_model['javascript']).reshape(1,-1)))

# pandas , apply

Custom Trained W2v model
Similarity between python and dictionary:  [[0.30442676]]
Similarity between html and css:  [[0.5608]]
Similarity between dataframe and series:  [[0.43597504]]
Similarity between php and javascript:  [[0.6253061]]
-------------------------------------------------------------------------------------------------------
Pre-Trained Glove model: 
Similarity between python and dictionary:  [[0.28257843]]
Similarity between html and css:  [[0.73995874]]
Similarity between python and series:  [[0.22612679]]
Similarity between php and javascript:  [[0.64708991]]
-------------------------------------------------------------------------------------------------------
Pre-Trained W2V model: 
Similarity between python and dictionary:  [[0.13850798]]
Similarity between html and css:  [[0.5491108]]
Similarity between python and series:  [[-0.02696471]]
Similarity between php and javascript:  [[0.544561]]




In [24]:
# print(cosine_similarity(np.array(loaded_model['scipy']).reshape(1,-1), np.array(loaded_model['numpy']).reshape(1,-1)))
# print(cosine_similarity(np.array(model['scipy']).reshape(1,-1), np.array(model['numpy']).reshape(1,-1)))
# print(cosine_similarity(np.array(w2v_pretrained_model['scipy']).reshape(1,-1), np.array(w2v_pretrained_model['numpy']).reshape(1,-1)))

In [25]:
# print(cosine_similarity(np.array(loaded_model['flask']).reshape(1,-1), np.array(loaded_model['django']).reshape(1,-1)))
# print(cosine_similarity(np.array(model['flask']).reshape(1,-1), np.array(model['django']).reshape(1,-1)))
# print(cosine_similarity(np.array(w2v_pretrained_model['flask']).reshape(1,-1), np.array(w2v_pretrained_model['django']).reshape(1,-1)))

In [19]:
print(cosine_similarity(np.array(loaded_model['iphone']).reshape(1,-1), np.array(loaded_model['cocoa']).reshape(1,-1)))
print(cosine_similarity(np.array(model['iphone']).reshape(1,-1), np.array(model['cocoa']).reshape(1,-1)))
print(cosine_similarity(np.array(w2v_pretrained_model['iphone']).reshape(1,-1), np.array(w2v_pretrained_model['cocoa']).reshape(1,-1)))

[[0.7082475]]
[[0.23233346]]
[[0.12214837]]


  if __name__ == '__main__':


## Inferences:-

- For similarity between 'python' and 'dictionary' other 2 models might be referring to python snake while our model is referring to technical terms 'python' programming language though it didnt captured much similarity between two.
- Glove model seems to be working well compared to pretrained w2v as it is trained on wikipedia dataset so it might contains technical similarity as well.
- But Pre-trained models including glove didn't contain word such as 'dataframe', 'django', 'scipy' in its vocab.
- For words like 'iphone' and 'cocoa' (Apple's application-development framework for macOS) custom model gave highest similarity among all.
- Custom trained w2v model is best choice for our problem statement.


In [None]:
#cosine_similarity(loaded_model.wv.__getitem__(['python']), loaded_model.wv.__getitem__(['pandas']))

### Lets see performance of custom trained w2v on our query

In [26]:

print(stopwords.words('english'))
stopwords = stopwords.words('english')

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
len(stopwords)

179

In [28]:
loaded_model = Word2Vec.load("model/custom_trained_w2v/word2vec_v2.model")

customtrained_w2v_words = list(loaded_model.wv.vocab)
len(customtrained_w2v_words)

INFO - 09:21:50: loading Word2Vec object from model/custom_trained_w2v/word2vec_v2.model
INFO - 09:21:59: loading trainables recursively from model/custom_trained_w2v/word2vec_v2.model.trainables.* with mmap=None
INFO - 09:21:59: loading syn1neg from model/custom_trained_w2v/word2vec_v2.model.trainables.syn1neg.npy with mmap=None
INFO - 09:22:00: loading vocabulary recursively from model/custom_trained_w2v/word2vec_v2.model.vocabulary.* with mmap=None
INFO - 09:22:00: loading wv recursively from model/custom_trained_w2v/word2vec_v2.model.wv.* with mmap=None
INFO - 09:22:00: loading vectors from model/custom_trained_w2v/word2vec_v2.model.wv.vectors.npy with mmap=None
INFO - 09:22:01: setting ignored attribute vectors_norm to None
INFO - 09:22:01: setting ignored attribute cum_table to None
INFO - 09:22:01: loaded model/custom_trained_w2v/word2vec_v2.model


1747920

In [29]:
df = pd.read_csv('Final_df.csv')
df.columns

Index(['PostTypeId', 'Question_Id', 'Title', 'Tags', 'AnswerCount',
       'ViewCount', 'Body', 'Score', 'Answer_corpus'],
      dtype='object')

In [30]:
%%time
df['Cleaned_Titles'] = df['Title'].progress_apply(lambda x: text_preprocessing(x))

100%|██████████| 1000000/1000000 [04:42<00:00, 3533.90it/s]


CPU times: user 4min 36s, sys: 7.7 s, total: 4min 44s
Wall time: 4min 43s


In [31]:
import gc
gc.collect()

150219

In [32]:
def get_embedding(sentence):
    '''Get 300 dim word embedding for each word from custom trained w2v model.
       Avg word embedding to create sentence embedding
       
       Function accepts only one parameter - sentence (text input)
       returns - 300 dim sentence embedding'''
    
    custom_w2v = []
    for word in sentence.split():
        if (word not in stopwords):
            try:
                custom_w2v.append(loaded_model[word]) #keyerror
            except:
                pass
         
    avg_w2v = np.array(custom_w2v).mean(axis=0)
    return avg_w2v
    

In [33]:
%%time
df['Sentence_Embedding'] = df['Cleaned_Titles'].progress_apply(lambda x: get_embedding(x))

100%|██████████| 1000000/1000000 [01:30<00:00, 11086.80it/s]

CPU times: user 1min 30s, sys: 853 ms, total: 1min 30s
Wall time: 1min 30s





In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
PostTypeId            1000000 non-null int64
Question_Id           1000000 non-null int64
Title                 1000000 non-null object
Tags                  1000000 non-null object
AnswerCount           1000000 non-null int64
ViewCount             1000000 non-null int64
Body                  1000000 non-null object
Score                 1000000 non-null int64
Answer_corpus         1000000 non-null object
Cleaned_Titles        1000000 non-null object
Sentence_Embedding    999581 non-null object
dtypes: int64(5), object(6)
memory usage: 83.9+ MB


In [35]:
df.dropna(inplace=True)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999581 entries, 0 to 999999
Data columns (total 11 columns):
PostTypeId            999581 non-null int64
Question_Id           999581 non-null int64
Title                 999581 non-null object
Tags                  999581 non-null object
AnswerCount           999581 non-null int64
ViewCount             999581 non-null int64
Body                  999581 non-null object
Score                 999581 non-null int64
Answer_corpus         999581 non-null object
Cleaned_Titles        999581 non-null object
Sentence_Embedding    999581 non-null object
dtypes: int64(5), object(6)
memory usage: 91.5+ MB


In [43]:
df.shape

(999581, 12)

In [37]:

def get_similar_questions(query):
    ''' Function to accept user query and show top 5 similar question alongwith cosine similarity score.
        Function accepts one parameter: query (text input)
        Processing: Text preprocessing of query, compute sentence embedding and top similar 5 questions.
        Returns: Prints dataframe of similar titles and cosine similarity score.
    '''
    preprocessed_query = text_preprocessing(query)
    query_embedding = get_embedding(preprocessed_query)
    embeddings = [x for x in df['Sentence_Embedding']]
    df['Cosine_sim'] = cosine_similarity(np.array(query_embedding).reshape(1, -1),np.array(embeddings)).T
    df.sort_values(by='Cosine_sim', ascending=False, inplace=True)
    print(df[['Title','Cosine_sim']].head().values)

In [38]:
%%time
query = 'python sort dictionary'
get_similar_questions(query)



[['Ordering in Python (2.4) dictionary' 0.9170001745223999]
 ['How does Python sort a list of tuples?' 0.9121592044830322]
 ['Python: sort this dictionary (dict in dict)' 0.9115707874298096]
 ['Reversible dictionary for python' 0.8975422382354736]
 ['Python "extend" for a dictionary' 0.8960131406784058]]
CPU times: user 3.91 s, sys: 1.9 s, total: 5.81 s
Wall time: 4.82 s


In [39]:
%%time
query = 'CSS Performance'
get_similar_questions(query)



[['CSS Performance' 0.9999998807907104]
 ['CSS Performance' 0.9999998807907104]
 ['Are CSS selectors a big performance hit?' 0.9220609664916992]
 ['Performance of tokenizing CSS in PHP' 0.905392050743103]
 ['IE6 performance with CSS expressions' 0.901648759841919]]
CPU times: user 4.49 s, sys: 1.13 s, total: 5.62 s
Wall time: 4.64 s


In [40]:
%%time
query = 'python convert date to datetime'
get_similar_questions(query)



[['Convert date to datetime in Python' 0.9999998211860657]
 ['Convert date Python' 0.9570077657699585]
 ['How do I convert a DateTime to a Date in C#' 0.9416810274124146]
 ['Convert DateTime to Date' 0.9378916621208191]
 ['Convert datetime in to date' 0.9378916621208191]]
CPU times: user 4.89 s, sys: 1.01 s, total: 5.9 s
Wall time: 4.89 s


In [41]:
%%time
query = 'how to create list of lists in python'
get_similar_questions(query)



[['How do I create a list of Python lambdas (in a list comprehension/for loop)?'
  0.9303664565086365]
 ['Python: create a list of dictionaries using a list comprehension'
  0.9293903112411499]
 ['How to create a list of lists in PHP?' 0.9288452863693237]
 ['nesting python list comprehensions to construct a list of lists'
  0.9274009466171265]
 ['Create a dictionary in python which is indexed by lists'
  0.9203953742980957]]
CPU times: user 4.19 s, sys: 909 ms, total: 5.1 s
Wall time: 4.08 s


In [None]:
%%time
get_similar_questions('pd.melt() not working python')



In [55]:
query = 'key error in w2vec'
get_similar_questions_updated(query)



[['BAD_UID error while exporting key in CryptoAPI' 0.935831606388092]
 ['Error correcting key encryption' 0.9212811589241028]
 ["Foregin key constraint error but shouldn't be occuring"
  0.9050024747848511]
 ['trap of primary key error' 0.8889710903167725]
 ['Error with foreign key' 0.8877214193344116]]


## Inference:-
- Comparing to pre-trained glove model our model seems to be working better.
- Its amazing to mark how model learn that list of list in python can be created using list comprehension.
- 'w2vec' word isn't present in our vocab still it gave similar results for keyerror occuring in other languages like sql.