<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
data_points = 3000

In [3]:
# avoid decoding problems
df_data = pd.read_csv("data/00_train.csv")

df_data['question1'] = df_data['question1'].apply(lambda x: str(x))
df_data['question2'] = df_data['question2'].apply(lambda x: str(x))
df_data.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0


## Checkpoint 1: Applying TFIDF WEIGHT W2V

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

questions = list(df_data['question1'][:data_points] + df_data['question2'][:data_points])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [5]:
counter = 0
for k, v in word2tfidf.items():
    if counter == 4:
        break
    else:
        counter += 1
        print(k, ' ', v)

000   6.704115752446321
09   8.313553664880422
10   5.540964942640641
100   6.809476268104148


- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [6]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

def tifidf_to_wieghtedW2V(text_df, tfidf_vectors):
    vecs = []
    # https://github.com/noamraph/tqdm
    # tqdm is used to print the progress bar
    for ques in tqdm(list(text_df)):
        doc = nlp(ques) 
        # 384 is the number of dimensions of vectors 
        mean_vec = np.zeros([len(doc), len(doc[0].vector)])
        for word in doc:
            # word2vec
            vec = word.vector
            # fetch df score
            try:
                idf = tfidf_vectors[str(word)]
            except:
                idf = 0
            # compute final vec
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs

In [7]:
df = pd.DataFrame()
df['id']=df_data['id'][:data_points]

In [8]:
df['q1_feats_m'] = list(tifidf_to_wieghtedW2V(df_data['question1'][:data_points], word2tfidf))

100%|█████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:29<00:00, 101.99it/s]


In [9]:
df['q2_feats_m'] =  list(tifidf_to_wieghtedW2V(df_data['question2'][:data_points], word2tfidf))

100%|█████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:27<00:00, 107.99it/s]


In [10]:
df_q1 = pd.DataFrame(df['q1_feats_m'].values.tolist(), index= df.index)
df_q2 = pd.DataFrame(df['q2_feats_m'].values.tolist(), index= df.index)

print("Number of features in question1 w2v  dataframe :", df_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df_q2.shape[1])

Number of features in question1 w2v  dataframe : 96
Number of features in question2 w2v  dataframe : 96


In [11]:
df_q1['id']=df_data['id']
df_q2['id']=df_data['id']
result  = df_q1.merge(df_q2, on='id',how='left')
result.to_csv(f'models/01_nlp/tfidf_weight_w2v{data_points}.csv')

In [14]:
print(result.shape)
result.head(1)

(3000, 193)


Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,38.161535,-16.883424,27.364944,-2.225619,6.019804,-9.301675,-1.573213,1.481994,-7.730493,-2.612193,...,20.356704,-33.651369,4.493593,-2.982234,-11.982838,12.24958,18.157573,-10.208319,16.406053,12.979552


## Checkpoint 2: TFIDF

In [8]:
tfidf_vect = TfidfVectorizer(ngram_range=(1,3),max_features=200000,min_df=0.000032)
tfidf = tfidf_vect.fit_transform(questions)
# test_tfidf = tfidf_vect.transform(X_test_tf.Text)
print('No of Tfidf features',len(tfidf_vect.get_feature_names()))

NameError: name 'questions' is not defined

In [25]:
#Save vectorizer.vocabulary_
import joblib
joblib.dump(tfidf_vect.vocabulary_, f'models/01_nlp/tfidf{data_points}.joblib') 

['models/01_nlp/tfidf3000.joblib']

In [23]:
# from scipy.sparse import hstack
# X_train1 = hstack((X_train_,train_tfidf))
# X_test1 = hstack((X_test_tf.values,test_tfidf))

In [None]:
# scale = StandardScaler(with_mean=False)
# X_train_sc = scale.fit_transform(X_train1)
# X_test_sc = scale.transform(X_test1)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
vec = CountVectorizer(vocabulary= joblib.load('models/01_nlp/tfidf3000.joblib')) 
tfidf = transformer.fit_transform(vec.fit_transform(["dfgsd"]))

In [8]:
tfidf = transformer.fit_transform(vec.fit_transform(["asdfasldkfnasd sdfasfjasdlkasdf asdfasdf"]))
df = pd.DataFrame.sparse.from_spmatrix(tfidf)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86578,86579,86580,86581,86582,86583,86584,86585,86586,86587
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [10]:
# from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
vec = CountVectorizer(vocabulary= joblib.load('models/01_nlp/tfidf3000.joblib')) 
word2tfidf = dict(zip(vec.get_feature_names(), vec.idf_))

AttributeError: 'CountVectorizer' object has no attribute 'idf_'

In [11]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

def tifidf_to_wieghtedW2V(text_df, tfidf_vectors):
    vecs = []
    # https://github.com/noamraph/tqdm
    # tqdm is used to print the progress bar
    for ques in tqdm(list(text_df)):
        doc = nlp(ques) 
        # 384 is the number of dimensions of vectors 
        mean_vec = np.zeros([len(doc), len(doc[0].vector)])
        for word in doc:
            # word2vec
            vec = word.vector
            # fetch df score
            try:
                idf = tfidf_vectors[str(word)]
            except:
                idf = 0
            # compute final vec
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs

In [12]:
list(tifidf_to_wieghtedW2V(["what is the step by step guide to invest in share market"], word2tfidf))

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 31.99it/s]


[array([ 42.5868373 , -14.43384796,  23.56646349,  -4.58721548,
          4.34642098,  -8.28829327,   1.5316028 ,   8.01649164,
         -6.30272877,  -5.92579629,  10.23099293,  -7.72317278,
         -0.9798522 , -11.35297754, -28.64178765,  17.88679239,
         10.47494122,   1.58813325, -25.98949604,   6.16953206,
        -15.84814543,  -4.13335568,  -9.93601622,  -9.05970663,
          3.77335968, -12.49490783,  21.75930128, -18.87092397,
         14.60306701,  12.00374097,  -7.56202748, -24.81611756,
         -0.09120483, -11.75074169,   6.90590835,  -6.70017198,
         -3.8140227 ,  11.53788067, -12.0932984 ,  29.87197882,
         11.38655274,  30.40739807,  -6.07892269,   2.04485524,
         -1.92368665, -15.22185927,   0.52148398,   4.04752018,
          0.11793567, -21.49267738,  27.52496653,  15.30824181,
         -4.38627413,  24.30221075,   0.54014511, -20.28569125,
         19.00639492, -14.903054  ,  -1.13469172,  -7.45244232,
         12.59973426, -18.95968416,  12.