<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# Load the first 100,000 rows of the training CSV.
# NOTE(review): the path is an absolute, machine-specific Windows path.
df = pd.read_csv(r"F:\New folder\train.csv", nrows=100000)

# Convert every entry of both question columns to a Python 3 `str`
# (https://stackoverflow.com/a/6812069), so non-string cells such as NaN
# become text (e.g. "nan") before vectorizing / tokenizing.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].map(str)

In [3]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# (rows, columns) of the loaded sample; rows are capped by nrows=100000 above.
df.shape

(100000, 6)

In [5]:
# Labels for the whole loaded sample. Not referenced again in the cells
# visible here: y_train / y_test are sliced from df['is_duplicate'] directly.
y_true=df['is_duplicate']

In [6]:
# Positional 70/30 split without shuffling: the first 70,000 rows are the
# training set and rows 70,000..99,999 are the test set.
#
# .copy() makes X_train / X_test independent frames. Later cells assign new
# columns to them (q1_feats_m / q2_feats_m); doing that on plain slices of
# `df` is chained assignment, which pandas reports with
# SettingWithCopyWarning (silenced here by warnings.filterwarnings("ignore")).
X_train = df[:70000].copy()
X_test = df[70000:100000].copy()
y_train = df['is_duplicate'][:70000]
y_test = df['is_duplicate'][70000:100000]

In [7]:
# Sanity check: shapes of the two feature frames and of their label series.
X_train.shape,X_test.shape, y_train.shape, y_test.shape

((70000, 6), (30000, 6), (70000,), (30000,))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Pool question1 and question2 of each split into one flat list of texts.
train_questions = list(X_train['question1']) + list(X_train['question2'])
test_questions = list(X_test['question1']) + list(X_test['question2'])

# Fit on the training questions only, so the idf weights contain no
# information from the test split. lowercase=False keeps the original case.
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit(train_questions)

x_train_tfidf = tfidf.transform(train_questions)
x_test_tfidf = tfidf.transform(test_questions)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab_terms = tfidf.get_feature_names_out()
else:
    vocab_terms = tfidf.get_feature_names()

# dict key: vocabulary term, value: its idf weight (tfidf.idf_ holds the
# inverse document frequencies, not per-document tf-idf scores)
word2tfidf = dict(zip(vocab_terms, tfidf.idf_))

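- After we find the TF-IDF scores, we convert each question to a weighted average of word2vec vectors, using these scores as the weights. (Note: the code below weights every token vector by its idf value and accumulates the weighted sum; it does not divide by the total weight.)
- Here we use a pre-trained GloVe model which comes free with spaCy: https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.
- Note: the cells below load `en_core_web_sm`; check that this is the model you intend to use for the vectors.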

In [9]:
# NOTE: this loads 'en_core_web_sm', not the 'en_vectors_web_lg' package
# (over 1 million unique vectors) that the earlier comment referred to. The
# saved outputs of this notebook show 96 columns per question vector, not 384.
nlp = spacy.load('en_core_web_sm')

# Length of one token vector, probed once so that a question producing no
# tokens still gets an all-zero vector of the right size (indexing doc[0] on
# such a question would raise IndexError).
vec_dim1 = len(nlp('probe')[0].vector)

train_vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_train['question1'])):
    doc1 = nlp(qu1)
    # idf-weighted SUM of the token vectors of this question. The previous
    # version broadcast each term into an (n_tokens, dim) matrix and then took
    # the mean over identical rows, which yields this same sum (up to
    # floating-point rounding) at n_tokens times the cost.
    weighted_vec1 = np.zeros(vec_dim1)
    for word1 in doc1:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        weighted_vec1 += word1.vector * word2tfidf.get(str(word1), 0)
    train_vecs1.append(weighted_vec1)
X_train['q1_feats_m'] = list(train_vecs1)


100%|████████████████████████████████████████████████████████████████████████████| 70000/70000 [20:23<00:00, 64.49it/s]


In [10]:
# Reuses the `nlp` pipeline ('en_core_web_sm') loaded in the previous cell;
# loading the same model a second time here was redundant.

# Length of one token vector, probed once so that a question producing no
# tokens still gets an all-zero vector of the right size (indexing doc[0] on
# such a question would raise IndexError).
vec_dim_test1 = len(nlp('probe')[0].vector)

test_vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_test['question1'])):
    doc1 = nlp(qu1)
    # idf-weighted SUM of the token vectors of this question. The previous
    # version broadcast each term into an (n_tokens, dim) matrix and then took
    # the mean over identical rows, which yields this same sum (up to
    # floating-point rounding) at n_tokens times the cost.
    weighted_vec1 = np.zeros(vec_dim_test1)
    for word1 in doc1:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        weighted_vec1 += word1.vector * word2tfidf.get(str(word1), 0)
    test_vecs1.append(weighted_vec1)
X_test['q1_feats_m'] = list(test_vecs1)


100%|████████████████████████████████████████████████████████████████████████████| 30000/30000 [08:27<00:00, 59.14it/s]


### Now for question2

In [11]:
# Length of one token vector, probed once so that a question producing no
# tokens still gets an all-zero vector of the right size (indexing doc[0] on
# such a question would raise IndexError).
vec_dim2 = len(nlp('probe')[0].vector)

train_vecs2 = []
for qu2 in tqdm(list(X_train['question2'])):
    doc2 = nlp(qu2)
    # idf-weighted SUM of the token vectors of this question. The previous
    # version broadcast each term into an (n_tokens, dim) matrix and then took
    # the mean over identical rows, which yields this same sum (up to
    # floating-point rounding) at n_tokens times the cost.
    weighted_vec2 = np.zeros(vec_dim2)
    for word2 in doc2:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        weighted_vec2 += word2.vector * word2tfidf.get(str(word2), 0)
    train_vecs2.append(weighted_vec2)
X_train['q2_feats_m'] = list(train_vecs2)

100%|████████████████████████████████████████████████████████████████████████████| 70000/70000 [19:52<00:00, 58.72it/s]


In [12]:
# Length of one token vector, probed once so that a question producing no
# tokens still gets an all-zero vector of the right size (indexing doc[0] on
# such a question would raise IndexError).
vec_dim_test2 = len(nlp('probe')[0].vector)

test_vecs2 = []
for qu2 in tqdm(list(X_test['question2'])):
    doc2 = nlp(qu2)
    # idf-weighted SUM of the token vectors of this question. The previous
    # version broadcast each term into an (n_tokens, dim) matrix and then took
    # the mean over identical rows, which yields this same sum (up to
    # floating-point rounding) at n_tokens times the cost.
    weighted_vec2 = np.zeros(vec_dim_test2)
    for word2 in doc2:
        # tokens that are not in the TF-IDF vocabulary get weight 0
        weighted_vec2 += word2.vector * word2tfidf.get(str(word2), 0)
    test_vecs2.append(weighted_vec2)
X_test['q2_feats_m'] = list(test_vecs2)

100%|████████████████████████████████████████████████████████████████████████████| 30000/30000 [08:35<00:00, 56.95it/s]


In [13]:
# Feature tables read from the working directory:
#   nlp_features_train.csv                -> dfnlp  (NLP features)
#   df_fe_without_preprocessing_train.csv -> dfppro (simple preprocessing features)
# When a file is absent only a hint is printed and the frame stays undefined.
if not os.path.isfile('nlp_features_train.csv'):
    print("download nlp_features_train.csv from drive or run previous notebook")
else:
    dfnlp = pd.read_csv("nlp_features_train.csv", encoding='latin-1')

if not os.path.isfile('df_fe_without_preprocessing_train.csv'):
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")
else:
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv", encoding='latin-1')

In [14]:
# Remove the question ids and raw question text from both feature tables.
# The label column is dropped from df2 only, so it is kept once, via df1.
id_and_text_cols = ['qid1', 'qid2', 'question1', 'question2']
df1 = dfnlp.drop(columns=id_and_text_cols)
df2 = dfppro.drop(columns=id_and_text_cols + ['is_duplicate'])

In [15]:
# Cut both feature tables at the same row positions as the train/test split
# of the questions: rows [0, 70000) for training, [70000, 100000) for test.
train_df1 = df1.iloc[:70000]
test_df1 = df1.iloc[70000:100000]
train_df2 = df2.iloc[:70000]
test_df2 = df2.iloc[70000:100000]

In [16]:
raw_cols = ['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

# Turn each per-row vector stored in q1_feats_m / q2_feats_m into a frame with
# one column per vector component, keeping the row index of the source frame.
train_df3 = X_train.drop(columns=raw_cols)
train_df3_q1 = pd.DataFrame(train_df3['q1_feats_m'].tolist(), index=train_df3.index)
train_df3_q2 = pd.DataFrame(train_df3['q2_feats_m'].tolist(), index=train_df3.index)

test_df3 = X_test.drop(columns=raw_cols)
test_df3_q1 = pd.DataFrame(test_df3['q1_feats_m'].tolist(), index=test_df3.index)
test_df3_q2 = pd.DataFrame(test_df3['q2_feats_m'].tolist(), index=test_df3.index)

In [18]:
# NLP features, training slice: print the shape, then preview the first rows.
print(train_df1.shape)
train_df1.head()

(70000, 17)


Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,28,24,9,14,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,35,56,0.175


In [19]:
# Features computed without text preprocessing, test slice: print the shape,
# then preview the first rows.
print(test_df2.shape)
test_df2.head()

(30000, 12)


Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
70000,70000,1,1,40,33,8,6,4.0,14.0,0.285714,2,0
70001,70001,1,1,77,76,12,14,1.0,25.0,0.04,2,0
70002,70002,1,1,44,38,7,7,4.0,14.0,0.285714,2,0
70003,70003,1,1,45,35,8,6,5.0,14.0,0.357143,2,0
70004,70004,2,1,105,116,15,22,3.0,34.0,0.088235,3,1


In [20]:
# Question 1, training split: idf-weighted word vectors, one column per component.
train_df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-18.750306,30.474162,-131.688216,-132.341369,53.097417,90.837481,18.884289,-4.658396,-61.738883,-152.082096,...,-87.055406,102.910387,23.587646,73.91092,-15.87182,-2.142419,142.344314,-140.08388,13.871884,-21.903918
1,-104.690877,74.371543,-135.348104,-126.639307,-27.947215,72.847219,25.225759,15.836089,-4.461842,-37.594207,...,-147.596489,57.452204,110.535398,69.032432,-129.56164,6.049076,64.504514,-122.112493,-71.928552,20.352412
2,-99.130791,-106.101304,-85.229888,-136.495436,63.577369,57.745399,3.622945,40.707156,26.82165,-60.320099,...,-112.386775,126.757936,39.97549,92.301755,-56.048038,38.667549,-7.289244,-123.946619,-2.998062,-5.717014
3,11.849351,-69.411225,-103.920914,-32.967309,26.241607,141.254266,8.469155,38.05519,-23.449674,-46.134465,...,65.585668,24.645576,-71.129215,27.308936,49.465869,-87.845095,31.299102,-65.751526,-83.854064,6.852451
4,-79.953414,-0.984554,-186.729869,-219.849053,139.824115,16.127034,-38.261394,101.931907,80.4848,-206.065002,...,-131.091526,168.08179,34.590146,136.819382,-29.986358,143.434891,109.191581,-169.557661,-60.625889,-160.25212


In [21]:
# Question 2, test split: idf-weighted word vectors, one column per component.
test_df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
70000,-56.219362,34.866852,-63.264648,-31.519709,2.716173,32.774231,-24.16051,-12.94399,-15.835053,-29.038805,...,-28.000884,45.249554,42.942631,-22.451017,-38.416905,-48.394424,19.529397,-88.156666,-26.429246,6.294696
70001,-44.487986,-16.559647,-8.265042,-131.722139,47.139518,50.726799,-35.937455,18.328376,-39.180067,-78.949415,...,-36.934663,62.453272,-16.769735,161.073872,-5.007065,5.939252,-5.67858,-114.505209,-82.736851,4.692711
70002,-20.219085,17.384493,-53.644833,-51.75213,-37.840623,71.498784,-44.634093,15.386852,23.041755,-15.788619,...,-42.303903,66.895313,-18.015015,61.749053,-143.049924,37.470536,37.753334,-75.69966,-70.648948,27.517377
70003,-59.414867,-22.977995,-97.961061,-104.056167,5.540221,35.076674,57.393421,-8.171372,-59.113157,-30.082358,...,-102.366942,31.409345,70.293955,49.520248,17.312032,-22.825102,44.528164,-25.255875,-47.480465,29.623539
70004,129.181501,-126.786942,-32.677114,-166.040644,-33.310141,207.555442,59.908262,-59.599952,101.891909,173.294937,...,-3.41274,150.706235,3.313084,74.065591,18.052254,-49.848646,-44.868527,-156.702087,49.602188,59.535578


In [23]:
# Column counts of the NLP-feature frame (training slice) and of the full
# frame of features computed without preprocessing (df2, not a split slice).
print("Number of features in nlp dataframe :", train_df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
# The prints below are disabled: they refer to df3_q1 / df3_q2, names that are
# not defined in the cells shown here (the vector frames are train_df3_q1 /
# train_df3_q2 and test_df3_q1 / test_df3_q2).
#print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
#print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
#print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12


In [24]:
# Build and save the final training feature table; skipped entirely when
# final_features_train.csv already exists.
if not os.path.isfile('final_features_train.csv'):
    # give both vector frames the pair id (assignment aligns on the row index)
    for vec_frame in (train_df3_q1, train_df3_q2):
        vec_frame['id'] = train_df1['id']
    # NOTE: train_df1 / train_df2 are rebound to merged frames from here on
    train_df1 = pd.merge(train_df1, train_df2, on='id', how='left')
    train_df2 = pd.merge(train_df3_q1, train_df3_q2, on='id', how='left')
    result_train = pd.merge(train_df1, train_df2, on='id', how='left')
    # default to_csv arguments: the row index is written as the first column
    result_train.to_csv('final_features_train.csv')

In [25]:
# Build and save the final test feature table; skipped entirely when
# final_features_test.csv already exists.
if not os.path.isfile('final_features_test.csv'):
    # give both vector frames the pair id (assignment aligns on the row index)
    for vec_frame in (test_df3_q1, test_df3_q2):
        vec_frame['id'] = test_df1['id']
    # NOTE: test_df1 / test_df2 are rebound to merged frames from here on
    test_df1 = pd.merge(test_df1, test_df2, on='id', how='left')
    test_df2 = pd.merge(test_df3_q1, test_df3_q2, on='id', how='left')
    result_test = pd.merge(test_df1, test_df2, on='id', how='left')
    # default to_csv arguments: the row index is written as the first column
    result_test.to_csv('final_features_test.csv')

In [26]:
# Read the saved training features back from disk and preview them. The index
# written by to_csv comes back as an extra unnamed leading column.
x=pd.read_csv("final_features_train.csv")
x.head()

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,...,-63.887937,96.26677,14.10105,89.70437,-3.14925,-28.455414,104.015658,-119.16877,-14.518671,-14.929926
1,1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,...,-141.592824,107.190335,58.450946,68.84951,-133.354518,5.177492,61.985479,-159.598408,-91.079395,22.82197
2,2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,...,-123.236945,130.897132,45.473882,5.160702,-57.163145,-6.466167,32.739081,-11.935622,-34.474873,29.023742
3,3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-88.282231,82.170867,-63.144346,-5.639859,-57.968393,-14.59268,99.42225,12.783008,-63.892824,-19.885731
4,4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,...,17.894099,41.746871,-60.181064,19.474478,-14.277007,8.899386,75.176742,-77.21448,24.538786,-26.880159


In [27]:
# Read the saved test features back from disk and preview them. The index
# written by to_csv comes back as an extra unnamed leading column.
y=pd.read_csv("final_features_test.csv")
y.head()

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,0,70000,1,0.666644,0.666644,0.999967,0.599988,0.833319,0.624992,0.0,...,-28.000884,45.249554,42.942631,-22.451017,-38.416905,-48.394424,19.529397,-88.156666,-26.429246,6.294696
1,1,70001,0,0.399992,0.249997,0.0,0.0,0.153845,0.133332,0.0,...,-36.934663,62.453272,-16.769735,161.073872,-5.007065,5.939252,-5.67858,-114.505209,-82.736851,4.692711
2,2,70002,0,0.666644,0.499988,0.666644,0.499988,0.57142,0.57142,0.0,...,-42.303903,66.895313,-18.015015,61.749053,-143.049924,37.470536,37.753334,-75.69966,-70.648948,27.517377
3,3,70003,0,0.999967,0.749981,0.999975,0.799984,0.999986,0.777769,0.0,...,-102.366942,31.409345,70.293955,49.520248,17.312032,-22.825102,44.528164,-25.255875,-47.480465,29.623539
4,4,70004,0,0.249997,0.249997,0.142855,0.083333,0.187499,0.124999,0.0,...,-3.41274,150.706235,3.313084,74.065591,18.052254,-49.848646,-44.868527,-156.702087,49.602188,59.535578
