<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# avoid decoding problems
df_data = pd.read_csv("data/00_train.csv")

df_data['question1'] = df_data['question1'].apply(lambda x: str(x))
df_data['question2'] = df_data['question2'].apply(lambda x: str(x))
df_data.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0


## Checkpoint 1: Applying TF-IDF

In [8]:
question1 = df_data['question1'][:2000]

tfidf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10, lowercase=False)
tfidf_1 = tfidf_vect.fit_transform(question1)

In [9]:
joblib.dump(tfidf_1, "models/01_nlp/01_1_tfidf_model.joblib")

['models/01_nlp/01_1_tfidf_model.joblib']

In [10]:
question2 = df_data['question2'][:2000]

tfidf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10, lowercase=False)
tfidf_2 = tfidf_vect.fit_transform(question2)
type(tfidf_2)

scipy.sparse.csr.csr_matrix

In [11]:
joblib.dump(tfidf_2, "models/01_nlp/02_2_tfidf_model.joblib")

['models/01_nlp/02_2_tfidf_model.joblib']

## Checkpoint 2: Applying TFIDF WEIGHT W2V

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

questions = list(df_data['question1'] + df_data['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [4]:
counter = 0
for k, v in word2tfidf.items():
    if counter == 4:
        break
    else:
        counter += 1
        print(k, ' ', v)

00   9.410080524353322
000   6.863242916834832
0000   13.216743014123642
000000   13.216743014123642


In [5]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

def tifidf_to_wieghtedW2V(text_df, tfidf_vectors):
    vecs = []
    # https://github.com/noamraph/tqdm
    # tqdm is used to print the progress bar
    for ques in tqdm(list(text_df)):
        doc = nlp(ques) 
        # 384 is the number of dimensions of vectors 
        mean_vec = np.zeros([len(doc), len(doc[0].vector)])
        for word in doc:
            # word2vec
            vec = word.vector
            # fetch df score
            try:
                idf = tfidf_vectors[str(word)]
            except:
                idf = 0
            # compute final vec
            mean_vec += vec * idf
        mean_vec = mean_vec.mean(axis=0)
        vecs.append(mean_vec)
    return vecs

In [6]:
q1 = list(tifidf_to_wieghtedW2V(df_data['question1'][:2000], word2tfidf))

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:19<00:00, 101.65it/s]


In [7]:
joblib.dump(q1, 'models/01_nlp/03_1_tfidf_weighted_w2v.joblib')

['models/01_nlp/03_1_tfidf_weighted_w2v.joblib']

Its done previsously

In [8]:
q2 = list(tifidf_to_wieghtedW2V(df_data['question2'][:2000], word2tfidf))

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:23<00:00, 85.82it/s]


In [9]:
joblib.dump(q2, 'models/01_nlp/04_2_tfidf_weighted_w2v.joblib')

['models/01_nlp/04_2_tfidf_weighted_w2v.joblib']