### `import` Packages

In [None]:
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('stopwords')

import os
import re
import spacy
import time

import numpy as np
import pandas as pd
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
from matplotlib import pyplot as plt
from matplotlib import style

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later dropped the old aliases. Prefer the new name and fall back to the
# legacy one so the notebook runs on both old and new matplotlib installs.
style.use('seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in style.available else 'seaborn-deep')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Root folder holding the dataset files. Defaults to the original Google Drive
# location but can be overridden with the QQP_DATA_DIR environment variable so
# the notebook also runs outside that Colab mount.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file paths with plain string concatenation (folder_path + 'train.csv').
folder_path = os.path.join(
    os.environ.get('QQP_DATA_DIR', '/content/drive/MyDrive/AAIC/QQP/'), ''
)

In [None]:
# English stop words taken from NLTK's `stopwords` corpus.
STOP_WORDS = stopwords.words("english")

#### Data Reading

In [None]:
# Load the training CSV from the configured data folder.
df = pd.read_csv(folder_path + 'train.csv')

# Coerce every entry of both question columns through str(), so any
# non-string value (e.g. a missing question parsed as NaN) becomes text.
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

id               int64
qid1             int64
qid2             int64
question1       object
question2       object
is_duplicate     int64
dtype: object

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(404290, 6)

In [None]:
# Pool the questions from both columns into a single flat corpus:
# every `question1` entry first, followed by every `question2` entry.
questions = df['question1'].to_list()
questions.extend(df['question2'].to_list())
print(len(questions))

808580


### TFIDF Vectorizer

In [None]:
# lowercase=False keeps the original casing of every term in the vocabulary.
tfidf = TfidfVectorizer(lowercase=False)
# Fit on the pooled questions to learn the vocabulary and IDF weights; the
# returned document-term matrix is only displayed here, not stored.
tfidf.fit_transform(raw_documents=questions)

<808580x109679 sparse matrix of type '<class 'numpy.float64'>'
	with 8146555 stored elements in Compressed Sparse Row format>

In [None]:
# Map every vocabulary term to its learned IDF weight.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (available since 1.0); use the new accessor when
# present and fall back to the legacy one on older installs.
words_tfidf = dict(zip(
    tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names(),
    tfidf.idf_,
))

### TFIDF Weighted Avg W2V

In [None]:
def fetch_weighted_avg_vectors(dframe, col_name, feature_names_with_idf_val, n_rows=None):
    """Build one IDF-weighted vector per text in ``dframe[col_name]``.

    Every text is run through spaCy's ``en_core_web_sm`` pipeline. Each token
    vector is scaled by the IDF value stored under the token's text and the
    scaled vectors are added up.

    Note that, despite the function name, the result is the IDF-weighted *sum*
    of the token vectors: it is not divided by the token count or by the sum of
    the weights. The previous implementation produced the same values by adding
    each scaled vector to every row of a ``(n_tokens, dim)`` matrix and then
    averaging the identical rows; a single 1-D accumulator gives the same
    result (up to floating-point rounding) without the extra memory.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame holding the text column.
    col_name : str
        Name of the column whose values are passed to the spaCy pipeline.
    feature_names_with_idf_val : dict
        Mapping from term text to IDF weight. Tokens that are missing from the
        mapping get weight 0 and therefore do not contribute.
    n_rows : int or None, optional
        Only the first ``n_rows`` values are processed; ``None`` processes all.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float64 vector per processed row, in row order. A text that
        yields no tokens gets an all-zero vector instead of raising.
    """
    nlp = spacy.load('en_core_web_sm')
    col_vals = dframe[col_name].to_list()[:n_rows]

    # Probe the pipeline's vector width once, so that a text producing no
    # tokens can still receive a correctly sized zero vector (the previous
    # code indexed doc[0] and raised IndexError in that case).
    vec_dim = len(nlp('a')[0].vector)

    vecs = []
    for q in tqdm(col_vals):
        doc = nlp(q)
        weighted_sum = np.zeros(vec_dim)

        for word in doc:
            # Unknown tokens weigh 0; a plain lookup with a default replaces
            # the former blanket `except Exception`, which could hide real bugs.
            idf_val = feature_names_with_idf_val.get(str(word), 0)
            weighted_sum += word.vector * idf_val

        vecs.append(weighted_sum)

    return vecs

In [None]:
# question1_vecs = fetch_weighted_avg_vectors(dframe=df, col_name='question1', feature_names_with_idf_val=words_tfidf)
# question2_vecs = fetch_weighted_avg_vectors(dframe=df, col_name='question2', feature_names_with_idf_val=words_tfidf)
# df['q1_feats_m'] = question1_vecs
# df['q2_feats_m'] = question2_vecs