In [1]:
import pandas as pd
import scipy.sparse
import joblib 

# Creating db file from csv
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine # database connection
import sqlite3

from sklearn.model_selection import GridSearchCV

import os
import datetime as dt
from collections import Counter, defaultdict

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

In [2]:
# Row-count tag for this run. In the cells shown here it is only interpolated
# into output file names; the `[:data_points]` slicing is commented out.
data_points = 3000

In [3]:
# Load the training table and discard the question ids and raw text.
mat = pd.read_csv("data/00_train.csv")
mat = mat.drop(columns=['qid1', 'qid2', 'question1', 'question2'])

# The whole table is used; slice with mat[:data_points] to work on a subset.
df_y = mat
df_y.head(1)

Unnamed: 0,id,is_duplicate
0,0,0


## Checkpoint 1: Merging TFIDF WEIGHTED W2V

In [4]:
# Load the object stored in 03_1_tfidf_weighted_w2v.joblib and wrap it in a DataFrame.
# Presumably the question1 vectors (file prefix `03_1_`) — TODO confirm.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
mat = joblib.load('models/01_nlp/03_1_tfidf_weighted_w2v.joblib')
tfidf_weighted_w2v_1 = pd.DataFrame(mat)
tfidf_weighted_w2v_1.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,40.42618,-17.285498,29.463175,-2.10149,6.234135,-9.76733,-1.931092,2.764305,-8.621197,-2.86643,...,32.48296,-41.912592,1.249518,-1.629152,-14.163644,20.832646,31.582641,-14.17523,21.968109,18.779508


In [5]:
# Load the object stored in 04_2_tfidf_weighted_w2v.joblib and wrap it in a DataFrame.
# Presumably the question2 vectors (file prefix `04_2_`) — TODO confirm.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
mat = joblib.load('models/01_nlp/04_2_tfidf_weighted_w2v.joblib')
tfidf_weighted_w2v_2 = pd.DataFrame(mat)
tfidf_weighted_w2v_2.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,37.420996,-16.780388,25.324247,-2.273731,5.29817,-12.906271,2.898618,9.660838,-7.949519,-3.139777,...,22.065664,-35.353501,4.115318,-2.66872,-13.516946,12.673964,19.263596,-10.946913,17.096025,14.284331


In [6]:
# Tag both vector frames with the pair id; the assignment aligns on the row index.
tfidf_weighted_w2v_1['id'] = df_y['id']
tfidf_weighted_w2v_2['id'] = df_y['id']

# Left-join the two frames on `id`. Column labels present in both frames
# receive pandas' default `_x` / `_y` suffixes.
tfidf_weighted_w2v_full = pd.merge(
    tfidf_weighted_w2v_1,
    tfidf_weighted_w2v_2,
    how='left',
    on='id',
)
tfidf_weighted_w2v_full.head(2)

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,86_y,87_y,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y
0,40.42618,-17.285498,29.463175,-2.10149,6.234135,-9.76733,-1.931092,2.764305,-8.621197,-2.86643,...,22.065664,-35.353501,4.115318,-2.66872,-13.516946,12.673964,19.263596,-10.946913,17.096025,14.284331
1,-9.296082,-38.953275,50.663282,0.5078,4.163735,-11.889674,-28.71055,5.986635,-21.407287,-22.247378,...,9.864944,-51.591321,-11.154295,7.984347,-14.642147,-18.467388,11.879343,-7.531472,4.583699,14.472303


In [7]:
# Persist the merged vector frame. `data_points` only labels the file name here;
# this cell does not slice the frame to that many rows.
joblib.dump(tfidf_weighted_w2v_full, f"models/01_nlp/06_tfidf_weighted_w2v_{data_points}_data_points.joblib")

['models/01_nlp/06_tfidf_weighted_w2v_2000_data_points.joblib']

## Checkpoint 2: Merging TFIDF

In [None]:
# Load the sparse matrix stored in 01_1_tfidf_model.joblib and expose it as a
# DataFrame with sparse columns (no densification).
mat = joblib.load('models/01_nlp/01_1_tfidf_model.joblib') # sparse
mat1 = pd.DataFrame.sparse.from_spmatrix(mat)

mat1.head(1)
# Disabled: dense conversion of the same data.
# tfidf_1_dense  = mat.sparse.to_dense() # dense
# tfidf_1_dense.head(1)

In [None]:
# Load the sparse matrix stored in 02_2_tfidf_model.joblib and expose it as a
# DataFrame with sparse columns (no densification).
mat = joblib.load('models/01_nlp/02_2_tfidf_model.joblib') # sparse
mat2 = pd.DataFrame.sparse.from_spmatrix(mat)

mat2.head(1)
# Disabled: dense conversion of the same data.
# tfidf_2_dense  = mat.sparse.to_dense() # dense
# tfidf_2_dense.head(1)

In [None]:
# Tag both sparse TF-IDF frames with the pair id (aligned on the row index).
mat1['id'] = df_y['id']
mat2['id'] = df_y['id']

# Left-join the two TF-IDF frames on `id`.
df_tfidf_full = pd.merge(mat1, mat2, how='left', on='id')
df_tfidf_full.head(2)

# Disabled alternative: the same join performed on densified frames.
# tfidf_1_dense['id'] = df_y['id']
# tfidf_2_dense['id'] = df_y['id']

# df_tfidf_full = tfidf_1_dense.merge(tfidf_2_dense, on='id',how='left')
# df_tfidf_full.head(2)

In [None]:
# joblib.dump(df_tfidf_full, f"models/01_nlp/05_full_tfidf_model_{data_points}_datapoints.joblib")

# joblib.dump(df_tfidf_full, f"models/01_nlp/05_full_tfidf_model_{data_points}datapoints.joblib")

## Checkpoint 3: Merging basic, advanced & NLP distance features from the EDA outputs

In [4]:
# Reload the training table, this time keeping every column: the question
# text is read again by the cleaning step later in this section.
mat = pd.read_csv("data/00_train.csv")
# Disabled: column drop and row limit used in the earlier checkpoints.
# mat = mat.drop(['qid1','qid2','question1','question2',],axis=1)
# df_y = mat[:data_points]
df_y = mat
df_y.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0


In [5]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from bs4 import BeautifulSoup

# Small constant (1e-4), described by the author as giving results to 4 decimal places.
# NOTE(review): not referenced in the cells shown here; the name suggests a
# guard added to denominators — confirm before relying on it.
SAFE_DIV = 0.0001

# English stop-word list from the NLTK stopwords corpus.
STOP_WORDS = stopwords.words("english")


def preprocess(x):
    """Normalise one question string for feature extraction.

    The value is coerced to ``str`` and lower-cased, then:

    1. comma-separated thousands/millions, typographic apostrophes, common
       English contractions and the ``%``, ``₹``, ``$`` and ``€`` symbols are
       rewritten in plain words;
    2. digit runs ending in ``000000`` / ``000`` are abbreviated to
       ``m`` / ``k``;
    3. each non-word character is replaced by a space;
    4. the result is passed through ``PorterStemmer.stem`` and
       ``BeautifulSoup(...).get_text()``.

    Parameters
    ----------
    x : object
        Raw question text. Anything that is not a string is converted with
        ``str`` first (so a NaN cell becomes the text ``"nan"``).

    Returns
    -------
    str
        The cleaned text.
    """
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)

    # Raw string on purpose: '\W' in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # x is always a str at this point, so no type guard is needed.
    x = re.sub(r'\W', ' ', x)

    # NOTE(review): stem() receives the whole sentence rather than one token
    # at a time, so at most the tail of the string is stemmed — confirm
    # whether per-word stemming was intended.
    x = PorterStemmer().stem(x)

    # NOTE(review): '<', '>' and '&' were already blanked out by the \W
    # substitution above, so there is likely no markup left to strip here.
    x = BeautifulSoup(x).get_text()

    return x

# Clean every question, one row at a time. Cleaning a single cell (e.g. row 0)
# and assigning the result would broadcast that one string to all rows.
df_y['cleaned_q1'] = df_y['question1'].apply(preprocess)
df_y['cleaned_q2'] = df_y['question2'].apply(preprocess)

In [6]:
# Basic features: drop the question ids, raw text and label so that only `id`
# and the engineered columns remain for the merge.
df_basic = pd.read_csv("data/01_basic_eda.csv").drop(
    columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
)
df_basic.head(1)

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0


In [7]:
# Advanced features: drop the question ids, raw text and label so that only
# `id` and the engineered columns remain for the merge.
df_advance = pd.read_csv("data/02_advanced_eda.csv").drop(
    columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
)
df_advance.head(1)

Unnamed: 0,id,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio
0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100


In [9]:
# NLP distance features: besides the ids, raw text and label, this file also
# carries a saved index column ('Unnamed: 0') that is dropped here.
df_nlp_distance = pd.read_csv("data/03_nlp_glove_features.csv").drop(
    columns=['Unnamed: 0', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
)
df_nlp_distance.head(1)

Unnamed: 0,id,Word_Mover_Dist,dist_cosine,dist_cityblock,dist_canberra,dist_euclidean,dist_minkowski,dist_braycurtis,dist_chebyshev,dist_correlation,dist_hamming,dist_jaccard,dist_kulsinski,dist_rogerstanimoto,dist_russellrao,dist_sokalmichener,dist_yule
0,0,0.144728,0.006854,9.08724,82.744686,0.657988,0.657988,0.117393,0.112668,0.00684,1.0,1.0,0.860943,-0.670791,0.895868,-0.670791,0.233231


In [10]:
# Left-join each feature table onto the question table, keyed by `id`.
# `id` and `is_duplicate` are kept in the merged frame.
df_extraction = (
    df_y
    .merge(df_basic, on='id', how='left')
    .merge(df_advance, on='id', how='left')
    .merge(df_nlp_distance, on='id', how='left')
)
print(df_extraction.shape)
df_extraction.head(1)

(404290, 49)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleaned_q1,cleaned_q2,freq_qid1,freq_qid2,...,dist_braycurtis,dist_chebyshev,dist_correlation,dist_hamming,dist_jaccard,dist_kulsinski,dist_rogerstanimoto,dist_russellrao,dist_sokalmichener,dist_yule
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,1,1,...,0.117393,0.112668,0.00684,1.0,1.0,0.860943,-0.670791,0.895868,-0.670791,0.233231


In [11]:
# Persist the merged feature table. The row index is written too (pandas
# default), so it comes back as an extra unnamed column when the file is re-read.
df_extraction.to_csv("data/04_all_features.csv")