<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
data_points = 3000

In [3]:
# avoid decoding problems
df_data = pd.read_csv("data/00_train.csv")

df_data['question1'] = df_data['question1'].apply(lambda x: str(x))
df_data['question2'] = df_data['question2'].apply(lambda x: str(x))
df_data.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0


## Checkpoint 1: Applying TFIDF WEIGHT W2V

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

questions = list(df_data['question1'][:data_points] + df_data['question2'][:data_points])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [5]:
counter = 0
for k, v in word2tfidf.items():
    if counter == 4:
        break
    else:
        counter += 1
        print(k, ' ', v)

000   6.704115752446321
09   8.313553664880422
10   5.540964942640641
100   6.809476268104148


- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [6]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

def tifidf_to_wieghtedW2V(text_df, tfidf_vectors):
    vecs = []
    # https://github.com/noamraph/tqdm
    # tqdm is used to print the progress bar
    for ques in tqdm(list(text_df)):
        doc = nlp(ques) 
        # 384 is the number of dimensions of vectors 
        mean_vec = np.zeros([len(doc), len(doc[0].vector)])
        for word in doc:
            # word2vec
            vec = word.vector
            # fetch df score
            try:
                idf = tfidf_vectors[str(word)]
            except:
                idf = 0
            # compute final vec
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs

In [7]:
q1 = list(tifidf_to_wieghtedW2V(df_data['question1'][:data_points], word2tfidf))

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:31<00:00, 95.93it/s]


In [8]:
joblib.dump(q1, f'models/01_nlp/03_1_tfidf_weighted_w2v_{data_points}.joblib')

['models/01_nlp/03_1_tfidf_weighted_w2v_3000.joblib']

In [9]:
q2 = list(tifidf_to_wieghtedW2V(df_data['question2'][:data_points], word2tfidf))

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:30<00:00, 99.66it/s]


In [10]:
joblib.dump(q2, f'models/01_nlp/04_2_tfidf_weighted_w2v{data_points}.joblib')

['models/01_nlp/04_2_tfidf_weighted_w2v3000.joblib']

## Checkpoint 3: Applying Glove

In [4]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
import gensim

In [5]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec(glove_input_file="data/glove.42B.300d.txt", word2vec_output_file="data/glove_vectors.txt")

(1917494, 300)

In [6]:
# !python -m gensim.scripts.glove2word2vec --input  data/glove.840B.300d.txt --output data/glove.840B.300d.w2vformat.txt

In [5]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("data/glove_vectors.txt", binary=False, unicode_errors='ignore')

In [6]:
def wmd(s1, s2,model):
    s1 = str(s1)
    s2 = str(s2)
    s1 = s1.split()
    s2 = s2.split()
    return model.wmdistance(s1, s2)

http://proceedings.mlr.press/v37/kusnerb15.pdf i read about word mover distance and after that i calculated some distances from avg word vectors as below

In [None]:
df = df_data 
df['Word_Mover_Dist'] = df.apply(lambda x: wmd(x['question1'], x['question2'],glove_model), axis=1)

In [None]:
# the avg-w2v for each sentence/review is stored in this list
def avg_w2v(list_of_sent,model,d):
    '''
    Returns average of word vectors for
    each sentance with dimension of model given
    '''
    sent_vectors = []
    for sent in list_of_sent: # for each review/sentence
        doc = [word for word in sent if word in model.key_to_index]
        if doc: 
            sent_vec = np.mean(model[doc],axis=0)
        else:
            sent_vec = np.zeros(d)
        sent_vectors.append(sent_vec)
    return sent_vectors

In [None]:
#converting into lists
list_of_question1=[]
for sent in df.question1.values:
    list_of_question1.append(sent.split())
list_of_question2=[]
for sent in df.question2.values:
    list_of_question2.append(sent.split())

In [None]:
#avg word 2 vec
# d= [word for word in sent if word in glove_model.key_to_index]
# sent_vec = np.mean(glove_model[d],axis=0)

avgw2v_q1 = avg_w2v(list_of_question1,glove_model,300)
avgw2v_q2 = avg_w2v(list_of_question2,glove_model,300)

In [34]:
#converting as df
df_avgw2v = pd.DataFrame()
df_avgw2v['q1_vec'] = list(avgw2v_q1)
df_avgw2v['q2_vec'] = list(avgw2v_q2)
df_q1 = pd.DataFrame(df_avgw2v.q1_vec.values.tolist())
df_q2 = pd.DataFrame(df_avgw2v.q2_vec.values.tolist())

In [None]:
df_q1.head()

In [39]:
#importing soma distances and calculating
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock,canberra, euclidean, minkowski
from scipy.spatial.distance import braycurtis, chebyshev, correlation, mahalanobis
from scipy.spatial.distance import seuclidean, hamming, jaccard, kulsinski, rogerstanimoto, russellrao, sokalmichener, sokalsneath, kulsinski, yule

df['dist_cosine'] = [cosine(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_cityblock'] = [cityblock(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_canberra'] = [canberra(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_euclidean'] = [euclidean(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_minkowski'] = [minkowski(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_braycurtis'] = [braycurtis(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_chebyshev'] = [chebyshev(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_correlation'] = [correlation(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_hamming'] = [hamming(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_jaccard'] = [jaccard(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_kulsinski'] = [kulsinski(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_rogerstanimoto'] = [rogerstanimoto(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_russellrao'] = [russellrao(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_sokalmichener'] = [sokalmichener(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_kulsinski'] = [kulsinski(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]
df['dist_yule'] = [yule(x, y) for (x, y) in zip(avgw2v_q1,avgw2v_q2)]

In [40]:
#filling na values with 0  for cosine distance
df = df.fillna(0)

In [43]:
print(df.shape)
df.head(1)

(404290, 22)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Word_Mover_Dist,dist_cosine,dist_cityblock,dist_canberra,...,dist_braycurtis,dist_chebyshev,dist_correlation,dist_hamming,dist_jaccard,dist_kulsinski,dist_rogerstanimoto,dist_russellrao,dist_sokalmichener,dist_yule
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.144728,0.006854,9.08724,82.744686,...,0.117393,0.112668,0.00684,1.0,1.0,0.860943,-0.670791,0.895868,-0.670791,0.233231


In [45]:
df.to_csv("data/03_nlp_glove_features.csv")

In [12]:
question1 = df_data['question1'][:2000]

tfidf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10, lowercase=False)
tfidf_1 = tfidf_vect.fit_transform(question1)

In [None]:
joblib.dump(tfidf_1, "models/01_nlp/01_1_tfidf_model.joblib")

In [None]:
question2 = df_data['question2'][:2000]

tfidf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10, lowercase=False)
tfidf_2 = tfidf_vect.fit_transform(question2)
type(tfidf_2)

In [None]:
joblib.dump(tfidf_2, "models/01_nlp/02_2_tfidf_model.joblib")

In [None]:
tfidf_vect = TfidfVectorizer(ngram_range=(1,3),max_features=200000,min_df=0.000032)
train_tfidf = tfidf_vect.fit_transform(X_train_tf.Text)
test_tfidf = tfidf_vect.transform(X_test_tf.Text)
print('No of Tfidf features',len(tfidf_vect.get_feature_names()))

In [None]:
from scipy.sparse import hstack
X_train1 = hstack((X_train_tf.values,train_tfidf))
X_test1 = hstack((X_test_tf.values,test_tfidf))

In [None]:
scale = StandardScaler(with_mean=False)
X_train_sc = scale.fit_transform(X_train1)
X_test_sc = scale.transform(X_test1)