In [1]:
# Importing important libraries and modules

import re
import numpy as np
import pandas as pd
import math

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz

from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.naive_bayes import GaussianNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.metrics import log_loss

In [2]:
# Load the raw question-pair tables (train first, then test) from the
# current working directory.

train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Preview the first rows of the training table to check the loaded columns
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


### Data Preprocessing

In [4]:
# Cleaning the questions

from nltk.tokenize import RegexpTokenizer


# Lazily filled cache so the stop-word list and the stemmer are built once
# instead of once per question.
_CLEAN_TEXT_CACHE = {}


def clean_text(row):
    """Normalise a single question string for the similarity features.

    The text is lower-cased, split on whitespace, stripped of English
    stop words, Porter-stemmed and de-duplicated.

    Parameters
    ----------
    row : str
        The raw question text (one cell of the frame, despite the name).

    Returns
    -------
    str
        The unique stems joined by single spaces, in order of first
        appearance in the question.

    Notes
    -----
    De-duplicating through ``set()`` made the word order depend on string
    hash randomisation, so the cleaned text (and every order-sensitive
    feature derived from it) could differ between interpreter runs.
    Keeping first-occurrence order makes the output reproducible.
    """
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stops'] = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_CACHE['stemmer'] = PorterStemmer()
    stops = _CLEAN_TEXT_CACHE['stops']
    stemmer = _CLEAN_TEXT_CACHE['stemmer']

    seen = set()
    unique_stems = []
    for word in row.lower().split():
        if word in stops:
            continue
        stem = stemmer.stem(word)
        # Keep only the first occurrence of each stem
        if stem not in seen:
            seen.add(stem)
            unique_stems.append(stem)

    # Joining the stemmed words back into one string
    return " ".join(unique_stems)


In [5]:
# Replace missing questions with the placeholder token 'xpx' so that the
# cleaning step always receives a string.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and
# does not update the frame under pandas Copy-on-Write.

train_data['question1'] = train_data['question1'].fillna('xpx')
train_data['question2'] = train_data['question2'].fillna('xpx')

In [None]:
# Build the cleaned text column for each question of the training pairs

train_data['question1_cleaned'] = train_data['question1'].apply(clean_text)
train_data['question2_cleaned'] = train_data['question2'].apply(clean_text)

In [62]:
# Replace missing test questions with the placeholder token 'xpx'.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and
# does not update the frame under pandas Copy-on-Write.

test_data['question1'] = test_data['question1'].fillna('xpx')
test_data['question2'] = test_data['question2'].fillna('xpx')

In [67]:
# Build the cleaned text column for each question of the test pairs

test_data['question1_cleaned'] = test_data['question1'].apply(clean_text)
test_data['question2_cleaned'] = test_data['question2'].apply(clean_text)

In [72]:
# Persist both cleaned tables (index column omitted)

for cleaned_frame, csv_name in ((train_data, 'cleaned_train.csv'), (test_data, 'cleaned_test.csv')):
    cleaned_frame.to_csv(csv_name, index=False)

## Feature Engineering

### Cosine Similarity

In [13]:
# Computing the cosine similarity metric

import re, math
from collections import Counter

# Token pattern: one or more consecutive word characters
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    """Cosine similarity of two sparse vectors given as key -> count mappings.

    Returns 0.0 when either vector has zero magnitude (for example an empty
    mapping); otherwise the dot product over the shared keys divided by the
    product of the two Euclidean norms.
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for key in shared_keys:
        dot_product += vec1[key] * vec2[key]

    squared_norm1 = sum(vec1[key] ** 2 for key in vec1.keys())
    squared_norm2 = sum(vec2[key] ** 2 for key in vec2.keys())
    magnitude = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # Guard against division by zero for empty / all-zero vectors
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude

def text_to_vector(text):
    """Count the tokens of ``text`` as matched by the module-level WORD pattern."""
    return Counter(WORD.findall(text))

def _pair_cosine(frame, row):
    """Cosine similarity of the cleaned question pair at index label ``row`` of ``frame``."""
    # One scalar lookup per column; frame.loc[row][col] built the complete
    # row Series twice for every call.
    vector1 = text_to_vector(frame.loc[row, 'question1_cleaned'])
    vector2 = text_to_vector(frame.loc[row, 'question2_cleaned'])
    return get_cosine(vector1, vector2)


def get_similarity(row):
    """Cosine similarity of the cleaned question pair of a training row.

    ``row`` is the index label of the row in the global ``train_data``
    (callers pass the ``id`` column value).
    """
    return _pair_cosine(train_data, row)


def get_similarity_test(row):
    """Cosine similarity of the cleaned question pair of a test row.

    ``row`` is the index label of the row in the global ``test_data``
    (callers pass the ``test_id`` column value).
    """
    return _pair_cosine(test_data, row)

In [14]:
# Cosine similarity of each cleaned question pair, looked up by row id

train_data['cosine_similarity'] = train_data['id'].apply(get_similarity)
test_data['cosine_similarity'] = test_data['test_id'].apply(get_similarity_test)

### Shared Words

In [15]:
# Feature - Shared Words

def shared_words(row):
    """Fraction of words, counted over both questions, that occur in the other question.

    Parameters
    ----------
    row : mapping
        Must provide the string values ``'question1_cleaned'`` and
        ``'question2_cleaned'`` (a DataFrame row or a dict).

    Returns
    -------
    float
        (shared words in q1 + shared words in q2) / (len(q1) + len(q2)),
        or 0.0 when both cleaned questions are empty (previously this case
        raised ZeroDivisionError).
    """
    q1 = row['question1_cleaned'].split()
    q2 = row['question2_cleaned'].split()

    total_words = len(q1) + len(q2)
    if total_words == 0:
        return 0.0

    # Sets are used only for the membership test; repeated words in a
    # question are still counted once per occurrence, as before.
    q1_vocab, q2_vocab = set(q1), set(q2)
    shared_in_q1 = sum(1 for w in q1 if w in q2_vocab)
    shared_in_q2 = sum(1 for w in q2 if w in q1_vocab)
    return float(shared_in_q1 + shared_in_q2) / float(total_words)

In [16]:
# Computing the Shared Words
# shared_words looks its inputs up by column name, so each row must arrive as
# a labelled Series. raw=True hands the function a bare ndarray, on which the
# name lookup fails, so it is not used here.
train_data['shared_words'] = train_data.apply(shared_words, axis = 1)
test_data['shared_words'] = test_data.apply(shared_words, axis = 1)

### Tf-Idf Vectoriser

In [11]:
def _stack_questions(frame):
    """Return question1 followed by question2 of ``frame`` as one Series of str."""
    stacked = frame['question1'].tolist() + frame['question2'].tolist()
    return pd.Series(stacked).astype(str)


train_qs = _stack_questions(train_data)
test_qs = _stack_questions(test_data)

In [12]:
from collections import Counter

def get_weight(count, eps=5000, min_count=2):
    """Inverse-frequency weight of a word that occurs ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0 (a word that
    appears only once is likely a typo). Otherwise the weight is
    1 / (count + eps); ``eps`` is a smoothing constant that makes the effect
    of extremely rare words smaller.
    """
    return 0 if count < min_count else float(1) / float(count + eps)

# Smoothing constant for the word weights. It is passed to get_weight
# explicitly; before, this variable was assigned but never used, so editing
# it had no effect on the weights.
eps = 5000
# Word frequencies over all (lower-cased, whitespace-split) training questions
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count, eps=eps) for word, count in counts.items()}

In [13]:
def tfidf(row, word_weights=None):
    """Weighted share of words that the two cleaned questions have in common.

    Parameters
    ----------
    row : mapping
        Must provide ``'question1_cleaned'`` and ``'question2_cleaned'``;
        the values are passed through ``str()`` before tokenising.
    word_weights : dict, optional
        Word -> weight table. Defaults to the module-level ``weights``.
        Words missing from the table weigh 0.

    Returns
    -------
    float
        Sum of the weights of the shared words (counted once per question)
        divided by the sum of the weights of all distinct words of both
        questions. Returns 0.0 when that total is zero, instead of the NaN
        (plus a runtime warning) that the 0/0 division used to produce.
    """
    if word_weights is None:
        word_weights = weights

    # dict.fromkeys de-duplicates the words of each question
    q1words = dict.fromkeys(str(row['question1_cleaned']).lower().split(), 1)
    q2words = dict.fromkeys(str(row['question2_cleaned']).lower().split(), 1)

    shared_weights = ([word_weights.get(w, 0) for w in q1words if w in q2words]
                      + [word_weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = ([word_weights.get(w, 0) for w in q1words]
                     + [word_weights.get(w, 0) for w in q2words])

    total = np.sum(total_weights)
    if total == 0:
        return 0.0
    return np.sum(shared_weights) / total

In [14]:
# Weighted shared-word share for every training pair
train_data['tfidf_count'] = train_data.apply(tfidf, axis=1)

In [15]:
# Weighted shared-word share for every test pair
test_data['tfidf_count'] = test_data.apply(tfidf, axis=1)

In [16]:
# Pairs without any weighted word produce NaN; treat them as "nothing shared".
# Fill with the number 0.0 (the string '0' would turn the column into a mix
# of floats and text) and assign the result back, because fillna(inplace=True)
# on a column selection does not update the frame under pandas Copy-on-Write.
train_data['tfidf_count'] = train_data['tfidf_count'].fillna(0.0)
test_data['tfidf_count'] = test_data['tfidf_count'].fillna(0.0)

In [41]:
# Character length and word count of each cleaned question - TRAIN

train_data['length_question1'] = train_data['question1_cleaned'].apply(len)
train_data['length_question2'] = train_data['question2_cleaned'].apply(len)
train_data['word_count_question1'] = train_data['question1_cleaned'].apply(lambda text: len(text.split()))
train_data['word_count_question2'] = train_data['question2_cleaned'].apply(lambda text: len(text.split()))

In [48]:
# Character length and word count of each cleaned question - TEST

test_data['length_question1'] = test_data['question1_cleaned'].apply(len)
test_data['length_question2'] = test_data['question2_cleaned'].apply(len)
test_data['word_count_question1'] = test_data['question1_cleaned'].apply(lambda text: len(text.split()))
test_data['word_count_question2'] = test_data['question2_cleaned'].apply(lambda text: len(text.split()))

# Absolute length / word-count gap between the two questions - TRAIN and TEST
train_data['length_difference'] = (train_data['length_question1'] - train_data['length_question2']).abs()
train_data['word_count_difference'] = (train_data['word_count_question1'] - train_data['word_count_question2']).abs()
test_data['length_difference'] = (test_data['length_question1'] - test_data['length_question2']).abs()
test_data['word_count_difference'] = (test_data['word_count_question1'] - test_data['word_count_question2']).abs()

In [55]:
# Computing some fuzzy features - Train

# fuzz_ratio : number of matches / total number of characters in both string
# fuzz_partial_ratio : finds the smallest of the two string, searches it in the other
# fuzz_token_sort_ratio : sorts the strings first, and then finds the ratio as fuzz_ratio
# fuzz_token_set_ratio : operates on the same principles as a set

# One column per fuzzywuzzy scorer, named 'fuzz_' + scorer name, created in
# the order listed here.
for scorer_name in ('ratio', 'QRatio', 'WRatio', 'partial_ratio',
                    'token_set_ratio', 'token_sort_ratio',
                    'partial_token_set_ratio', 'partial_token_sort_ratio'):
    scorer = getattr(fuzz, scorer_name)
    train_data['fuzz_' + scorer_name] = train_data.apply(
        lambda x: scorer(str(x['question1_cleaned']), str(x['question2_cleaned'])),
        axis=1)

In [None]:
# Computing some fuzzy features - Test

# fuzz_ratio : number of matches / total number of characters in both string
# fuzz_partial_ratio : finds the smallest of the two string, searches it in the other
# fuzz_token_sort_ratio : sorts the strings first, and then finds the ratio as fuzz_ratio
# fuzz_token_set_ratio : operates on the same principles as a set

# Only four scorers are computed for the test set. WRatio, partial_ratio,
# partial_token_set_ratio and partial_token_sort_ratio are disabled here.
for scorer_name in ('ratio', 'QRatio', 'token_set_ratio', 'token_sort_ratio'):
    scorer = getattr(fuzz, scorer_name)
    test_data['fuzz_' + scorer_name] = test_data.apply(
        lambda x: scorer(str(x['question1_cleaned']), str(x['question2_cleaned'])),
        axis=1)

In [18]:
# Saving the data together with the engineered feature columns
for feature_frame, csv_name in ((train_data, 'train_features.csv'), (test_data, 'test_features.csv')):
    feature_frame.to_csv(csv_name, index=False)

## Data Modelling

In [45]:
# Select the model inputs (six engineered features) and the training label

columns = ['tfidf_count','cosine_similarity','shared_words','fuzz_token_sort_ratio','fuzz_token_set_ratio','length_difference']
X_train, Y_train = train_data[columns], train_data['is_duplicate']
X_test = test_data[columns]

# Report the shape of each part: X_train, Y_train, X_test
for data_part in (X_train, Y_train, X_test):
    print(data_part.shape)

(404288, 6)
(404288,)
(2345796, 6)


### Result Analysis

In [2]:
# Reload the saved feature tables (train first, then test)
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train_features.csv', 'test_features.csv'))

In [8]:
# First column of the training table as a NumPy array.
# train_data itself stays a DataFrame: rebinding it to the raw ndarray made
# the later column-name selections (train_data[columns]) fail.
x = train_data.values[0::,0]

### ML Algorithm Implementation

In [9]:
# PCA Implementation

from sklearn.decomposition import PCA
pca = PCA(n_components = 4)
# Fit once, then project. The extra fit_transform call that used to sit
# between these two lines refitted the same model and threw its result away.
pca.fit(X_train)
pca.transform(X_train)

array([[ -3.72460088e+01,   1.89564539e+00,  -3.80471719e+00,
         -1.26492716e-01],
       [ -7.46456940e-01,   2.79511786e+01,   2.04883464e+00,
          3.37795216e-02],
       [  5.19126508e+00,  -8.30273658e+00,  -4.76960753e+00,
          2.24868661e-01],
       ..., 
       [ -2.18565015e+01,   3.36766753e+00,   1.46634857e+01,
         -3.55902187e-02],
       [  5.64572104e+01,  -1.22711976e+01,  -1.24564507e+00,
          8.96017836e-02],
       [ -4.47139893e+01,  -4.64045422e+00,  -8.10850365e+00,
         -1.56904042e-01]])

In [17]:
# Label first, then the six features, so that column 0 of `train` is the
# target and the remaining columns are the predictors. That is the layout
# the slices below assume. With the label listed last, y was tfidf_count
# and is_duplicate leaked into X.
columns = ['is_duplicate','tfidf_count','cosine_similarity','shared_words','fuzz_token_sort_ratio','fuzz_token_set_ratio','length_difference']

# X_train is deliberately not rebound here: it must stay the label-free
# feature frame that the cross-validation cells and X_test are built around.
train = train_data[columns].values

X = train[0::, 1::]
y = train[0::, 0]

In [46]:
# Ensembling
# Compares several classifiers by their mean K-fold log loss.
import matplotlib.pyplot as plt
import seaborn as sns

# Imported here so the cell no longer relies on a later cell having been run
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(n_neighbors = 30),
    DecisionTreeClassifier(max_depth=100, min_samples_leaf=6),
    RandomForestClassifier(max_depth = 20, n_estimators = 150),
    MultinomialNB(),
    LinearDiscriminantAnalysis(),
    LogisticRegression()]

# NOTE: the "Accuracy" column actually holds the mean log loss (lower is better)
log_cols = ["Classifier", "Accuracy"]

# Single source of truth for the fold count: it configures the splitter and
# is the divisor of the average below (the sum over 4 folds used to be
# divided by a hard-coded 3.0).
N_FOLDS = 4
cv = cross_validation.KFold(len(X_train), n_folds=N_FOLDS)

# classifier name -> log loss summed over the folds (averaged after the loop)
acc_dict = {}

for train_index, test_index in cv:
    for clf in classifiers:
        name = clf.__class__.__name__
        # KFold yields positional indices, hence .iloc rather than .loc
        clf.fit(X_train.iloc[train_index], Y_train.iloc[train_index])
        train_predictions = clf.predict_proba(X_train.iloc[test_index])
        acc = log_loss(Y_train.iloc[test_index], train_predictions,eps=1e-15)
        acc_dict[name] = acc_dict.get(name, 0.0) + acc
        print(acc_dict)

for name in acc_dict:
    acc_dict[name] = acc_dict[name] / float(N_FOLDS)

# Build the summary table in one go instead of appending row by row
log = pd.DataFrame([[name, mean_loss] for name, mean_loss in acc_dict.items()],
                   columns=log_cols)

{'KNeighborsClassifier': 0.56742982654125174}
{'DecisionTreeClassifier': 2.6550827027120696, 'KNeighborsClassifier': 0.56742982654125174}
{'RandomForestClassifier': 0.50787270705249188, 'DecisionTreeClassifier': 2.6550827027120696, 'KNeighborsClassifier': 0.56742982654125174}
{'RandomForestClassifier': 0.50787270705249188, 'MultinomialNB': 2.5644549149400513, 'DecisionTreeClassifier': 2.6550827027120696, 'KNeighborsClassifier': 0.56742982654125174}
{'RandomForestClassifier': 0.50787270705249188, 'MultinomialNB': 2.5644549149400513, 'DecisionTreeClassifier': 2.6550827027120696, 'LinearDiscriminantAnalysis': 0.56168599145938081, 'KNeighborsClassifier': 0.56742982654125174}
{'LogisticRegression': 0.55952732307627429, 'RandomForestClassifier': 0.50787270705249188, 'KNeighborsClassifier': 0.56742982654125174, 'LinearDiscriminantAnalysis': 0.56168599145938081, 'DecisionTreeClassifier': 2.6550827027120696, 'MultinomialNB': 2.5644549149400513}
{'LogisticRegression': 0.55952732307627429, 'Rando

In [47]:
# Display the per-classifier scores accumulated by the comparison loop
acc_dict

{'DecisionTreeClassifier': 2.6826256230122838,
 'KNeighborsClassifier': 0.5704165974472809,
 'LinearDiscriminantAnalysis': 0.56037564211001645,
 'LogisticRegression': 0.55827613798395315,
 'MultinomialNB': 2.5599378846570455,
 'RandomForestClassifier': 0.5079191011739036}

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble over k-nearest-neighbour base estimators, with both
# max_samples and max_features set to 0.5.
bagging = BaggingClassifier(
    KNeighborsClassifier(),
    max_samples=0.5,
    max_features=0.5,
)

In [29]:
# predict_proba is only available on an SVC built with probability=True;
# the default SVC() raises when it is called.
candidate_classifier = SVC(probability=True)
candidate_classifier.fit(train[0::, 1::], train[0::, 0])
result = candidate_classifier.predict_proba(X_test)

Unnamed: 0,Classifier,Accuracy
0,LogisticRegression,11.623961
0,RandomForestClassifier,9.898493
0,GaussianNB,11.816099
0,LinearDiscriminantAnalysis,11.578505
0,DecisionTreeClassifier,10.84169
0,KNeighborsClassifier,10.827243


In [27]:
from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

In [29]:
# Feature selection - Log Loss : 0.58
# NOTE: this experiment is wrapped in a string literal, so none of it is
# executed; the cell only echoes the text back.
'''from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=3).fit_transform(X_train, Y_train)
X_new'''

'from sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import chi2\nX_new = SelectKBest(chi2, k=3).fit_transform(X_train, Y_train)\nX_new'

In [36]:
from sklearn.grid_search import GridSearchCV

# Exhaustive search over tree depth and minimum leaf size for a decision tree
parameters = dict(max_depth=[50, 100, 150], min_samples_leaf=[2, 4, 6])
model = DecisionTreeClassifier()

# fit() returns the fitted search object itself
clf = GridSearchCV(model, parameters).fit(X_train, Y_train)
clf.best_params_

{'max_depth': 100, 'min_samples_leaf': 6}

In [20]:
# Random Forest : Log Loss - 0.50
# 3-fold cross-validation; prints the training indices, the predicted class
# probabilities and the log loss of every fold.
cv = cross_validation.KFold(len(X_train), n_folds=3)
model = RandomForestClassifier(n_estimators=150,max_depth=20)

for traincv, testcv in cv:
    print(traincv)
    # KFold yields positional indices, so select rows with .iloc; .loc only
    # gave the same rows while the frame had its default 0..n-1 index.
    model.fit(X_train.iloc[traincv], Y_train.iloc[traincv])
    rf_predict = model.predict_proba(X_train.iloc[testcv])
    print(rf_predict)
    print(log_loss(Y_train.iloc[testcv], rf_predict, eps=1e-15))

[134763 134764 134765 ..., 404285 404286 404287]
[[  4.82756061e-01   5.17243939e-01]
 [  4.98886217e-01   5.01113783e-01]
 [  6.03582095e-01   3.96417905e-01]
 ..., 
 [  9.99967480e-01   3.25203252e-05]
 [  9.78987417e-01   2.10125829e-02]
 [  4.49412856e-01   5.50587144e-01]]
0.50740266307
[     0      1      2 ..., 404285 404286 404287]
[[ 0.44687268  0.55312732]
 [ 0.77030796  0.22969204]
 [ 0.9905966   0.0094034 ]
 ..., 
 [ 0.2927395   0.7072605 ]
 [ 1.          0.        ]
 [ 0.63508604  0.36491396]]
0.510313637871
[     0      1      2 ..., 269523 269524 269525]
[[ 0.28957091  0.71042909]
 [ 0.81535107  0.18464893]
 [ 0.98445238  0.01554762]
 ..., 
 [ 0.72660816  0.27339184]
 [ 1.          0.        ]
 [ 0.28957091  0.71042909]]
0.506395749031


In [None]:
# SVM - Log Loss : 
from sklearn.svm import SVR
cv = cross_validation.KFold(len(X_train), n_folds=3)
# Log loss needs class probabilities. SVR is a regressor and has no
# predict_proba, so the loop failed on the first fold. Use the classifier
# with probability estimates switched on instead.
model = SVC(probability=True)

for traincv, testcv in cv:
    print(traincv)
    # KFold yields positional indices, hence .iloc
    model.fit(X_train.iloc[traincv], Y_train.iloc[traincv])
    svm_predict = model.predict_proba(X_train.iloc[testcv])
    print(svm_predict)
    print(log_loss(Y_train.iloc[testcv], svm_predict, eps=1e-15))

[134763 134764 134765 ..., 404285 404286 404287]


In [27]:
# KNN Algorithm - Log Loss : 0.56
# NOTE: this experiment is wrapped in a string literal, so none of it is
# executed; the cell only echoes the text back.
# NOTE(review): the text refers to `neighbors`, which is not imported in any
# cell shown here - confirm before re-enabling.
'''cv = cross_validation.KFold(len(X_train), n_folds=3)
model = neighbors.KNeighborsClassifier(n_neighbors = 30)

for traincv, testcv in cv:
    print(traincv)
    model.fit(X_train.loc[traincv], Y_train.loc[traincv])
    rf_predict = model.predict_proba(X_train.loc[testcv]) 
    print(rf_predict)
    print(log_loss(Y_train.loc[testcv], rf_predict, eps=1e-15))'''

'cv = cross_validation.KFold(len(X_train), n_folds=3)\nmodel = neighbors.KNeighborsClassifier(n_neighbors = 30)\n\nfor traincv, testcv in cv:\n    print(traincv)\n    model.fit(X_train.loc[traincv], Y_train.loc[traincv])\n    rf_predict = model.predict_proba(X_train.loc[testcv]) \n    print(rf_predict)\n    print(log_loss(Y_train.loc[testcv], rf_predict, eps=1e-15))'

In [31]:
# Preparing the Submission File 

# Train the submission model explicitly on the full training set. Before,
# this cell predicted with whatever `model` happened to be bound last: in
# notebook order that is the SVM cell's model, and otherwise a forest fitted
# on only two of the three cross-validation folds.
# Hyper-parameters match the random forest evaluated in the CV cell.
model = RandomForestClassifier(n_estimators=150, max_depth=20)
model.fit(X_train, Y_train)
pred = model.predict_proba(X_test)

sub_df = pd.DataFrame()
sub_df['test_id'] = test_data['test_id']
# Second probability column = predicted probability of class 1 (duplicate)
sub_df['is_duplicate'] = pred[:,1]
sub_df.to_csv('quora_submission.csv',index=False)