In [1]:
import pandas as pd
# Show full cell contents (e.g. long book titles) instead of truncating them.
# None is the supported "no limit" value; -1 has been deprecated since
# pandas 1.0 and is no longer accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import itemfreq
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


### Understanding word2vec

In [3]:
from gensim.models.keyedvectors import KeyedVectors

# Load pre-trained vectors stored in word2vec text format (binary=False).
# The path is a raw string: it contains backslashes (\A, \G, \R, \S, \g) that
# a normal string literal would try to parse as (invalid) escape sequences.
# NOTE(review): machine-specific absolute path - point it at your local copy.
w2vec = KeyedVectors.load_word2vec_format(
    r"E:\AIML\GLAIML\Resource Materials\Statistical NLP\gensim_glove.6B.50d\gensim_glove.6B.50d.txt",
    binary=False,
)



In [46]:
# Look up the embedding vectors of two related words.
# Indexing (w2vec[word]) returns the same vector as word_vec(), which is
# deprecated in gensim 4, and works on both gensim 3.x and 4.x.
a = w2vec["physics"]
b = w2vec["relativity"]

In [47]:
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module.
from scipy.stats import pearsonr

# Pearson correlation (and its p-value) between the two word vectors.
pearsonr(a,b)

(0.607742, 2.860726734483569e-06)

### Finding sentence vectors from word vectors

In [48]:
# Three short example titles used to compare averaged word vectors.
title1, title2, title3 = (
    "laws of physics",
    "theory of relativity",
    "battle of panipat",
)

#### Assignment: Find a vector representation for each title by splitting the title into individual words. Find the vector for each word and average them. What is the Pearson correlation between the different titles?

In [84]:
from nltk import word_tokenize

In [87]:
def _mean_word_vector(title):
    """Return the element-wise mean of the word vectors of every token in ``title``.

    Tokens come from ``word_tokenize``. A token missing from ``w2vec`` raises
    ``KeyError``, exactly as the per-title expressions this replaces did.
    """
    # w2vec[token] is the non-deprecated spelling of w2vec.word_vec(token).
    return np.array([w2vec[token] for token in word_tokenize(title)]).mean(axis=0)


# One averaged vector per title.
t1, t2, t3 = (_mean_word_vector(title) for title in (title1, title2, title3))

# Pearson correlation (and p-value) for each pair of titles, in the same
# order as before: (t1, t2), (t2, t3), (t1, t3).
for left, right in ((t1, t2), (t2, t3), (t1, t3)):
    print(pearsonr(left, right))


(0.8310373, 8.078668933786241e-14)
(0.48349163, 0.0003753143672253442)
(0.49981984, 0.00021934156127459731)


### Finding similar books

In [93]:
# Load the book catalogue from books.csv (in the working directory), decoding it as Latin-1.
books_data = pd.read_csv('books.csv',encoding="latin")

In [94]:
books_data.shape

(5700, 2)

In [95]:
books_data.head()

Unnamed: 0,title,category
0,"Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e",Medical Books
1,"Barron's GRE, 21st Edition",Test Preparation
2,George Balanchine: The Ballet Maker (Eminent Lives),Biographies & Memoirs
3,"A Partner in Holiness: Deepening Mindfulness, Practicing Compassion and Enriching Our Lives Through the Wisdom of R. Levi Yitzhak of Berdichev's, Vol. 2 (Institute for Jewish Spirituality)",Religion & Spirituality
4,Construction Scheduling: Principles and Practices (2nd Edition),Arts & Photography


In [96]:
books_data.columns

Index(['title', 'category'], dtype='object')

In [97]:
import re

# Normalise titles: delete every character that is not an ASCII letter or a
# space (digits and punctuation are removed, not replaced), then lower-case.
_non_letters = re.compile("[^a-zA-Z ]")
books_data["title_clean"] = [
    _non_letters.sub("", raw_title).lower() for raw_title in books_data["title"]
]

In [98]:
books_data.head()

Unnamed: 0,title,category,title_clean
0,"Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e",Medical Books,oral and maxillofacial surgery an objectivebased textbook e
1,"Barron's GRE, 21st Edition",Test Preparation,barrons gre st edition
2,George Balanchine: The Ballet Maker (Eminent Lives),Biographies & Memoirs,george balanchine the ballet maker eminent lives
3,"A Partner in Holiness: Deepening Mindfulness, Practicing Compassion and Enriching Our Lives Through the Wisdom of R. Levi Yitzhak of Berdichev's, Vol. 2 (Institute for Jewish Spirituality)",Religion & Spirituality,a partner in holiness deepening mindfulness practicing compassion and enriching our lives through the wisdom of r levi yitzhak of berdichevs vol institute for jewish spirituality
4,Construction Scheduling: Principles and Practices (2nd Edition),Arts & Photography,construction scheduling principles and practices nd edition


In [99]:
from tqdm import tqdm

# Build one vector per book: the mean of the word vectors of its cleaned title.
# - `word in w2vec` / `w2vec[word]` replace `.vocab` / `.word_vec()`, which were
#   removed / deprecated in gensim 4; both spellings also work on gensim 3.x.
# - Titles with no in-vocabulary word keep their all-zero row. Averaging an
#   empty list would instead produce NaN and a NumPy RuntimeWarning.
# - The width comes from the model rather than a hard-coded 50.
title_vec = np.zeros((books_data.shape[0], w2vec.vector_size))
for i, clean_title in enumerate(tqdm(books_data["title_clean"])):
    # title_clean holds only letters and spaces, so a whitespace split
    # yields the words without any empty strings.
    known_vecs = [w2vec[word] for word in clean_title.split() if word in w2vec]
    if known_vecs:
        title_vec[i] = np.array(known_vecs).mean(axis=0)

  import sys
  ret = ret.dtype.type(ret / rcount)
100%|███████████████████████████████████████████████████████████████████████████| 5700/5700 [00:00<00:00, 21010.35it/s]


In [100]:
# Replace any NaN entries (e.g. rows produced by averaging an empty list of
# word vectors) with zeros so cosine similarity can be computed on every row.
title_vec = np.nan_to_num(title_vec)

In [101]:
from sklearn.metrics.pairwise import cosine_similarity

In [102]:
cosine_sim_titles = cosine_similarity(title_vec)

In [103]:
cosine_sim_titles.shape

(5700, 5700)

In [104]:
# Pick a query book by row position and show its title; the cells below
# look up the books most similar to it.

title_id = 13
books_data['title'].iloc[title_id]

'Organ Transplantation (Health and Medical Issues Today)'

In [113]:
### Q: Find the titles with the highest degree of similarity (top 10)
# argsort ranks positions by ascending similarity, so the ranking is reversed
# to put the most similar titles first before keeping the first 10.
# The query title itself is part of the ranking.
similarity_row = cosine_sim_titles[title_id, :]
top_n_idx = np.argsort(similarity_row)[::-1][0:10]
top_n_sim_values = similarity_row[top_n_idx]
top_n_sim_values

array([1.        , 0.92993812, 0.92339052, 0.92211345, 0.91843464,
       0.91842045, 0.91840914, 0.91698754, 0.91646997, 0.9157601 ])

In [114]:
# find top n with values > 0
top_n_idx = top_n_idx[top_n_sim_values > 0]

In [115]:
# Matching books
books_data['title'].iloc[top_n_idx]

13      Organ Transplantation (Health and Medical Issues Today)                                                
616     Medicine and Social Justice: Essays on the Distribution of Health Care                                 
2354    Health Care Emergency Management: Principles and Practice                                              
3125    IBS: A Doctor's Plan for Chronic Digestive Troubles 3 Ed: The Definitive Guide to Prevention and Relief
932     A Core Curriculum for Diabetes Education: Diabetes Education And Program Management                    
3840    The Anticipatory Corpse: Medicine, Power, and the Care of the Dying (ND Studies in Medical Ethics)     
2014    Medicine and Charity Before the Welfare State (Studies in the Social History of Medicine)              
1925    Small Animal Emergency and Critical Care for Veterinary Technicians, 3e                                
1265    Occupational Therapy in Home Health Care                                                        

<b> Q: Find the most similar books corresponding to title id 34 - "Shadowrun: Third Edition (FPR25000)". 
What issues do you see with the results? </b>

In [116]:
title_id = 34
books_data['title'].iloc[title_id]

'Shadowrun: Third Edition (FPR25000)'

In [117]:
# Rank all titles against the query (most similar first), keep the top 10,
# then drop any of those whose similarity is not positive.
query_row = cosine_sim_titles[title_id, :]
top_n_idx = np.argsort(query_row)[::-1][0:10]
top_n_sim_values = query_row[top_n_idx]
top_n_idx = top_n_idx[top_n_sim_values > 0]

In [118]:
# Matching books
books_data['title'].iloc[top_n_idx]

34      Shadowrun: Third Edition (FPR25000)                                                                                                                                                        
4879    Shadowrun 4th Edition                                                                                                                                                                      
4070    generatingfunctionology: Third Edition                                                                                                                                                     
3956    Minecraft Pocket Edition: The Minecraft Pocket Edition Essentials Handbook Guide to Minecraft (An Unofficial Minecraft Pocket Edition Handbook) ... edition, minecraft handbook, minecraft)
1857    CNC Programming Handbook, Third Edition                                                                                                                                                    
274     Sixth World 

### Using TF-IDF with word2vec

In [119]:
from sklearn.feature_extraction.text import TfidfTransformer


In [120]:
# Initialize the TF-IDF vectorizer: single words and two-word phrases
# (ngram_range=(1,2)), English stop words removed, and the vocabulary capped
# at 5000 features.
vect = TfidfVectorizer(ngram_range=(1,2),stop_words='english', max_features=5000)


In [121]:
# Learn the vocabulary/IDF weights and build the TF-IDF matrix in one pass.
# fit_transform is equivalent to fit() followed by transform() on the same data.
title_matrix = vect.fit_transform(books_data['title_clean'])

In [147]:
books_data.head()

Unnamed: 0,title,category,title_clean
0,"Oral and Maxillofacial Surgery: An Objective-Based Textbook, 2e",Medical Books,oral and maxillofacial surgery an objectivebased textbook e
1,"Barron's GRE, 21st Edition",Test Preparation,barrons gre st edition
2,George Balanchine: The Ballet Maker (Eminent Lives),Biographies & Memoirs,george balanchine the ballet maker eminent lives
3,"A Partner in Holiness: Deepening Mindfulness, Practicing Compassion and Enriching Our Lives Through the Wisdom of R. Levi Yitzhak of Berdichev's, Vol. 2 (Institute for Jewish Spirituality)",Religion & Spirituality,a partner in holiness deepening mindfulness practicing compassion and enriching our lives through the wisdom of r levi yitzhak of berdichevs vol institute for jewish spirituality
4,Construction Scheduling: Principles and Practices (2nd Edition),Arts & Photography,construction scheduling principles and practices nd edition


In [149]:
# Peek at a slice of the learned vocabulary, ordered by column index.
# The names are derived from vocabulary_ (term -> column index) because
# get_feature_names() was removed in scikit-learn 1.2; this form works on
# every version and still yields a plain list of strings.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)[110:130]

['animation',
 'anne',
 'anniversary',
 'anniversary edition',
 'annotated',
 'annual',
 'answer',
 'answers',
 'anthology',
 'anthropology',
 'anti',
 'anxiety',
 'ap',
 'ap english',
 'apocalypse',
 'appalachian',
 'apples',
 'application',
 'applications',
 'applied']

In [122]:
# Convert the TF-IDF matrix to a dense array so rows can be indexed directly.
# The guard keeps this cell re-runnable: once converted, title_matrix is an
# ndarray, which has no .toarray() method.
if hasattr(title_matrix, "toarray"):
    title_matrix = title_matrix.toarray()

In [123]:
title_matrix.shape

(5700, 5000)

In [124]:
# Column indices of the terms that carry a non-zero TF-IDF weight in title 34.
idx = 34
has_weight = title_matrix[idx, :] > 0
features = np.where(has_weight)[0]

In [138]:
features

array([1290, 4110], dtype=int64)

In [145]:
# Term stored in column 4110. Derived from vocabulary_ (term -> column index)
# because get_feature_names() was removed in scikit-learn 1.2.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)[4110]

'shadowrun'

In [146]:
# Term stored in column 1290. Derived from vocabulary_ (term -> column index)
# because get_feature_names() was removed in scikit-learn 1.2.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)[1290]

'edition'

In [125]:
# Map the non-zero column indices back to their terms.
# The index-ordered term list is built once instead of once per feature, and
# comes from vocabulary_ (term -> column index) because get_feature_names()
# was removed in scikit-learn 1.2.
terms_by_column = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
feature_names = [terms_by_column[col] for col in features]
feature_names

['edition', 'shadowrun']

#### Find the tf-idf weights of the 2 words. Which word has a higher weightage?

In [126]:
# TF-IDF weights of the selected terms as a column vector (one row per term),
# so they broadcast against a (terms x dimensions) matrix of word vectors.
selected_weights = title_matrix[idx, features]
feature_weights = selected_weights[:, np.newaxis]
feature_weights.shape

(2, 1)

In [137]:
feature_weights

array([[0.43375068],
       [0.90103293]])

In [127]:
# multiplying tf idf weights with word2vec

# One row per feature: its word vector, or a zero vector when the term is not
# in the embedding vocabulary. `x in w2vec` / `w2vec[x]` replace `.vocab` /
# `.word_vec()` (removed / deprecated in gensim 4; both forms also work on 3.x),
# and the width comes from the model instead of a hard-coded 50.
word_vecs = np.array([w2vec[x] if x in w2vec else np.zeros(w2vec.vector_size) for x in feature_names])

In [128]:
word_vecs.shape

(2, 50)

In [129]:
res = word_vecs*feature_weights

In [130]:
res.shape[0]

2

In [131]:
# Cache for the index-ordered term list, keyed on the vocabulary dict it was built from.
_FEATURE_NAMES_CACHE = {}


def _feature_names_by_column():
    """Return the fitted vectorizer's terms ordered by column index (cached).

    Built from ``vect.vocabulary_`` (term -> column index), which is available
    on every scikit-learn version; ``get_feature_names()`` was removed in 1.2.
    """
    vocab = vect.vocabulary_
    # The cache is rebuilt whenever `vect` exposes a different vocabulary_
    # object (presumably what happens on a re-fit - verify if in doubt).
    if _FEATURE_NAMES_CACHE.get("vocab") is not vocab:
        _FEATURE_NAMES_CACHE["vocab"] = vocab
        _FEATURE_NAMES_CACHE["names"] = sorted(vocab, key=vocab.get)
    return _FEATURE_NAMES_CACHE["names"]


def get_weighted_vectors(idx):
    """Return the TF-IDF-weighted mean word vector for title ``idx``.

    Reads the notebook globals ``title_matrix`` (dense TF-IDF matrix),
    ``vect`` (fitted vectorizer) and ``w2vec`` (word vectors).

    Parameters
    ----------
    idx : int
        Row of ``title_matrix`` (i.e. the title) to embed.

    Returns
    -------
    numpy.ndarray
        The mean over the title's non-zero TF-IDF terms of
        ``weight * word_vector``, with length ``w2vec.vector_size``. Terms
        missing from ``w2vec`` (presumably most two-word features) contribute
        a zero vector but still count in the mean. If the title has no
        non-zero term, an empty array (``shape == (0,)``) is returned, which
        callers test via ``vec.shape[0] == 0``.
    """
    row = title_matrix[idx, :]
    features = np.where(row > 0)[0]
    if features.size == 0:
        # Same result as before for an empty title, without the
        # "mean of empty slice" warning.
        return np.zeros(0)

    # The term list is looked up once (and cached across calls) instead of
    # being rebuilt for every single feature.
    names = _feature_names_by_column()
    feature_names = [names[col] for col in features]
    # np.newaxis turns the weights into a column so each weight scales one row of word_vecs.
    feature_weights = row[features][:, np.newaxis]
    dim = w2vec.vector_size
    word_vecs = np.array([w2vec[term] if term in w2vec else np.zeros(dim) for term in feature_names])
    return (word_vecs * feature_weights).mean(axis=0)
    

In [132]:
from tqdm import tqdm

# One TF-IDF-weighted vector per book, filled row by row.
n_titles = books_data.shape[0]
title_vec_weighted = np.zeros((n_titles, 50))
for i in tqdm(range(0, n_titles)):
    vec = get_weighted_vectors(i)
    # An empty result means the title had no usable features: store zeros.
    title_vec_weighted[i] = vec if vec.shape[0] != 0 else np.zeros(50)

  
100%|██████████████████████████████████████████████████████████████████████████████| 5700/5700 [00:57<00:00, 99.73it/s]


In [133]:
title_id=34
books_data['title'].iloc[title_id]

'Shadowrun: Third Edition (FPR25000)'

In [134]:

# Pairwise cosine similarity between the TF-IDF-weighted title vectors.
cosine_sim_titles_weighted = cosine_similarity(title_vec_weighted)

# Rank every title against the query (most similar first) and keep the top 10.
weighted_row = cosine_sim_titles_weighted[title_id]
top_n_idx = np.argsort(weighted_row)[::-1][0:10]
top_n_sim_values = weighted_row[top_n_idx]

# find top n with values > 0
top_n_idx = top_n_idx[top_n_sim_values > 0]
# Matching books
books_data['title'].iloc[top_n_idx]

34      Shadowrun: Third Edition (FPR25000)                                                                                                                                                        
4879    Shadowrun 4th Edition                                                                                                                                                                      
2795    Shadowrun Unwired (Shadowrun (Catalyst Hardcover))                                                                                                                                         
4660    Rigger 2: A Shadowrun Sourcebook                                                                                                                                                           
3956    Minecraft Pocket Edition: The Minecraft Pocket Edition Essentials Handbook Guide to Minecraft (An Unofficial Minecraft Pocket Edition Handbook) ... edition, minecraft handbook, minecraft)
274     Sixth World 