In [6]:
import pandas as pd
import scipy
import sklearn
from sklearn import *
import numpy as np
import os

In [10]:
# Read the question-pair data from 'quora_train.csv' into a single frame.
full_train_df = pd.read_csv('quora_train.csv', encoding='utf8')
# Hold out 10% of the rows as test_df; the fixed random_state makes the split repeatable.
train_df, test_df = sklearn.model_selection.train_test_split(full_train_df, test_size=0.1,random_state=123)

In [11]:
# Unigram bag-of-words vectorizer. It is only instantiated here; no cell visible
# in this part of the notebook fits or applies it.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))

# Pool every question (both columns, train rows first, then held-out rows).
all_q1 = [*train_df["question1"], *test_df["question1"]]
all_q2 = [*train_df["question2"], *test_df["question2"]]
all_questions = [*all_q1, *all_q2]

len(all_questions)

808580

In [12]:
def cast_list_as_strings(mylist):
    """
    Return a new list holding `str(item)` for every item of `mylist`.

    The input is only iterated, so any iterable works; element order is kept.
    """
    return [str(item) for item in mylist]

In [13]:
# String-cast copies of both question columns, first for the training rows,
# then for the held-out rows.
q1_train, q2_train = (
    cast_list_as_strings(list(train_df[col])) for col in ("question1", "question2")
)
q1_test, q2_test = (
    cast_list_as_strings(list(test_df[col])) for col in ("question1", "question2")
)

# Replaces the earlier un-casted all_questions with the string-only version.
all_questions = [*q1_train, *q2_train, *q1_test, *q2_test]

USING TF-IDF REPRESENTATION

In [16]:
# Fit a TfidfVectorizer with default settings on all_questions, which pools the
# string-cast questions of both train_df and test_df.
# NOTE(review): the vocabulary and idf weights therefore also see the held-out
# questions; consider fitting on the training questions only.
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
tfidf.fit(all_questions)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [58]:
def get_tfidf_from_df(df, tfidf):
    """
    Build the tf-idf feature matrix for the question pairs in `df`.

    The "question1" and "question2" columns are string-cast, transformed with
    the already-fitted `tfidf`, and placed side by side, so each row holds the
    question1 features followed by the question2 features. No similarity
    column is added here.

    Returns the stacked features as a scipy CSR matrix.
    """
    per_question = [
        tfidf.transform(cast_list_as_strings(list(df[column])))
        for column in ("question1", "question2")
    ]
    return scipy.sparse.hstack(per_question).tocsr()


In [59]:
# Stacked question1|question2 tf-idf features for the training and held-out rows.
tfidf_tr_q1q2, tfidf_te_q1q2 = (
    get_tfidf_from_df(split_df, tfidf) for split_df in (train_df, test_df)
)

# Sanity check: row counts of the matrices should match their source frames.
(tfidf_tr_q1q2.shape, train_df.shape, test_df.shape, tfidf_te_q1q2.shape)

((363861, 172306), (363861, 6), (40429, 6), (40429, 172306))

In [60]:
# Baseline model: logistic regression (liblinear solver, fixed seed) on the
# stacked tf-idf features only.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                   random_state=123)
# Training labels; the later model cells in this notebook fit against this same y_train.
y_train = train_df["is_duplicate"].values
logistic.fit(tfidf_tr_q1q2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
# Labels of the held-out rows; reused by every later score() cell.
y_val = test_df["is_duplicate"].values
# score() reports the mean accuracy of the baseline on the held-out rows.
logistic.score(tfidf_te_q1q2, y_val)

0.7597763981300552

ADDING THE COSINE-SIMILARITY FEATURE

In [92]:
def tfidf_add_cos_sim(df, tfidf, sim=True):
    """
    Build the sparse feature matrix for the question pairs in `df`.

    Each row holds the tf-idf vector of question1 followed by the tf-idf vector
    of question2 and, when `sim` is true, one trailing column with the cosine
    similarity between those two vectors.

    Parameters
    ----------
    df : object indexable by "question1" and "question2" (e.g. a DataFrame).
    tfidf : fitted vectorizer whose `transform` maps a list of strings to a
        scipy sparse matrix.
    sim : bool, default True
        Append the cosine-similarity column when true.

    Returns
    -------
    scipy CSR matrix with twice the vectorizer's feature count as columns,
    plus one when `sim` is true.
    """
    q1_casted = cast_list_as_strings(list(df["question1"]))
    q2_casted = cast_list_as_strings(list(df["question2"]))

    tfidf_q1 = tfidf.transform(q1_casted)
    tfidf_q2 = tfidf.transform(q2_casted)
    tfidf_q1q2 = scipy.sparse.hstack((tfidf_q1, tfidf_q2))
    if not sim:
        return tfidf_q1q2.tocsr()

    def _row_sums(matrix):
        # Row totals as a flat float array (sparse .sum(axis=1) may return a matrix).
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    # Row-wise cosine similarity computed with sparse element-wise products.
    # This avoids one pairwise call per row and does not depend on a
    # `cosine_similarity` name being imported somewhere else in the notebook.
    dots = _row_sums(tfidf_q1.multiply(tfidf_q2))
    norms = np.sqrt(
        _row_sums(tfidf_q1.multiply(tfidf_q1)) * _row_sums(tfidf_q2.multiply(tfidf_q2))
    )
    # An all-zero vector (no known token) has norm 0; its dot product is 0 as
    # well, so dividing by 1 yields a similarity of 0 instead of NaN.
    norms[norms == 0] = 1.0
    sims = scipy.sparse.csr_matrix((dots / norms).reshape(-1, 1))

    return scipy.sparse.hstack((tfidf_q1q2, sims)).tocsr()



In [93]:
# tf-idf features plus the cosine-similarity column, for both splits.
tfidf_tr_q1q2_sim, tfidf_te_q1q2_sim = (
    tfidf_add_cos_sim(split_df, tfidf) for split_df in (train_df, test_df)
)

# Sanity check: row counts of the matrices should match their source frames.
(tfidf_tr_q1q2_sim.shape, train_df.shape, test_df.shape, tfidf_te_q1q2_sim.shape)

((363861, 172307), (363861, 6), (40429, 6), (40429, 172307))

In [78]:
# Same classifier settings as the baseline, now trained on the matrix that
# includes the cosine-similarity column. y_train comes from the baseline cell.
logistic_sim = sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                   random_state=123)
logistic_sim.fit(tfidf_tr_q1q2_sim, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [80]:
# Mean accuracy on the held-out rows with the similarity column included.
logistic_sim.score(tfidf_te_q1q2_sim, y_val)

0.7958890895149522

USING JUST THE 15K MOST COMMON FEATURES

In [82]:
# Second vectorizer with the vocabulary capped at 15000 terms (max_features).
# Like the first one, it is fitted on all_questions (train plus held-out text).
tfidf_max_feat = sklearn.feature_extraction.text.TfidfVectorizer(max_features=15000)
tfidf_max_feat.fit(all_questions)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [83]:
# Capped-vocabulary tf-idf features plus the cosine-similarity column, for both splits.
tfidf_tr_q1q2_sim_15, tfidf_te_q1q2_sim_15 = (
    tfidf_add_cos_sim(split_df, tfidf_max_feat) for split_df in (train_df, test_df)
)

# Sanity check: row counts of the matrices should match their source frames.
(tfidf_tr_q1q2_sim_15.shape, train_df.shape, test_df.shape, tfidf_te_q1q2_sim_15.shape)

((363861, 30001), (363861, 6), (40429, 6), (40429, 30001))

In [84]:
# Same classifier settings, trained on the capped-vocabulary features.
# y_train comes from the baseline cell.
logistic_sim_15 = sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                   random_state=123)
logistic_sim_15.fit(tfidf_tr_q1q2_sim_15, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
# Mean accuracy on the held-out rows with the capped vocabulary.
logistic_sim_15.score(tfidf_te_q1q2_sim_15, y_val)

0.7880729179549334

ADDING QUESTION-LENGTH FEATURES

In [104]:
# Add string-cast copies of both question columns to the full frame
# (str() is applied to every entry, so non-string values become text too).
full_train_df['question1_string'] = [str(q) for q in full_train_df["question1"]]
full_train_df['question2_string'] = [str(q) for q in full_train_df["question2"]]

In [105]:
# Longest question (in characters) per column, used as the normaliser below.
max_q1 = max(map(len, full_train_df['question1_string']))
max_q2 = max(map(len, full_train_df['question2_string']))

# Character length of every question divided by its column maximum, i.e. in [0, 1].
full_train_df['len_q1'] = full_train_df['question1_string'].map(lambda s: len(s) / max_q1)
full_train_df['len_q2'] = full_train_df['question2_string'].map(lambda s: len(s) / max_q2)

# Re-run the split with the same test_size and random_state as the first split
# so that train_df / test_df carry the columns added above.
# NOTE(review): this overwrites the earlier train_df / test_df and relies on the
# identical seed to keep rows aligned with the matrices built before; confirm.
train_df, test_df = sklearn.model_selection.train_test_split(
    full_train_df,
    test_size=0.1,
    random_state=123,
)

In [109]:
# Scratch check: append the two length columns to the training matrix.
# The result is only displayed, not stored.
scipy.sparse.hstack((
    tfidf_tr_q1q2_sim,
    train_df['len_q1'].to_numpy().reshape(-1, 1),
    train_df['len_q2'].to_numpy().reshape(-1, 1),
))

<363861x172309 sparse matrix of type '<class 'numpy.float64'>'
	with 8381270 stored elements in COOrdinate format>

In [111]:
def add_a_column_feat(col_q1, colq2, sparse_matrix):
    """
    Append two feature columns to the right of `sparse_matrix`.

    `col_q1` and `colq2` must expose `to_numpy()` and `len()` (e.g. pandas
    Series); each is turned into an (n_rows, 1) column. The columns are added
    in that order after the existing features.

    Returns the widened matrix in CSR format.
    """
    extra_columns = [
        column.to_numpy().reshape(len(column), 1) for column in (col_q1, colq2)
    ]
    return scipy.sparse.hstack((sparse_matrix, *extra_columns)).tocsr()



In [114]:
# Widen the tf-idf + similarity matrices with the two normalised length columns.
tfidf_tr_q1q2_len, tfidf_te_q1q2_len = (
    add_a_column_feat(split_df['len_q1'], split_df['len_q2'], features)
    for split_df, features in ((train_df, tfidf_tr_q1q2_sim), (test_df, tfidf_te_q1q2_sim))
)

# Sanity check: row counts of the matrices should match their source frames.
(tfidf_tr_q1q2_len.shape, train_df.shape, test_df.shape, tfidf_te_q1q2_len.shape)

((363861, 172309), (363861, 10), (40429, 10), (40429, 172309))

In [115]:
# Same classifier settings, trained on tf-idf + similarity + length features.
# y_train comes from the baseline cell, i.e. from the first train/test split.
logistic_sim_len = sklearn.linear_model.LogisticRegression(solver="liblinear",
                                                   random_state=123)
logistic_sim_len.fit(tfidf_tr_q1q2_len, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [117]:
# Mean accuracy on the held-out rows with the length columns included.
logistic_sim_len.score(tfidf_te_q1q2_len, y_val)

0.7962848450369784