In [31]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
##########################################

In [32]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/train.csv')

#Drop irrelevant features
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
##########################################

In [33]:
##########################################
# Feature Engineering Method 1: 
# Creating new input variables to improve ML algorithm performance
##########################################
#Feature: Length of Question
#Apply length function to every data.question1 & data.question2 observation
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))

#Feature: Difference in length between the Questions
#Substract len_q1 from len_q2
data['len_diff'] = data.len_q1 - data.len_q2

#Feature: Character count of Question
#Strip whitespace in data.question1 & data.question2 and apply the length function
data['len_char_q1'] = data.question1.apply(lambda x: len(str(x).replace(' ', '')))
data['len_char_q2'] = data.question2.apply(lambda x: len(str(x).replace(' ', '')))

#Feature: Word count of Question
#Call split function on every data.question1 & data.question2 observation and apply the length function
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))

#Feature: Common words between the Questions
#Intersection of data.question1 and data.question2
#Set function is applied so repeated words in a question is omitted from the final common word count
#Axis=1 to calculate the means column-wise (-->) rather than the default of Axis=0 to calculate the means row-wise(v)
data['len_common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
##########################################

In [34]:
##########################################
# Feature Engineering Method 2:
# Create Bag Of Words Model with Tfidf Normalization
##########################################
from sklearn.feature_extraction.text import TfidfVectorizer

#Obtain the complete vocabulary for the entire dataset
questions_combined = list(data.ix[:,'question1']) + list(data.ix[:,'question2'].values.astype(str))
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(questions_combined)

#86153 unique vocabulary words question1 & question2 combined
complete_vocab = vectorizer.get_feature_names()

#Generate tfidf values for question1 and question2 based on the complete vocabulary of the dataset
vectorizer_q1 = TfidfVectorizer(vocabulary = complete_vocab)
vectorizer_q2 = TfidfVectorizer(vocabulary = complete_vocab)
tfidf_question1 = vectorizer_q1.fit_transform(data.question1)
tfidf_question2 = vectorizer_q2.fit_transform(data.question2.values.astype(str))

#Substract the difference of the tfidf weight matricies for the two questions
#Will be 0 if the words are weighted the same in both questions (similar significance contribution)
diff_idf = tfidf_question1 - tfidf_question2
##########################################

In [50]:
##########################################
# Feature Engineering Method 3:
# Doc2Vec Model
##########################################
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

from sklearn.linear_model import LogisticRegression
##########################################

In [35]:
##########################################
# Split the dataset into training and testing datasets
##########################################
#Loads the library required for splitting the dataset
from sklearn.model_selection import train_test_split

#Method 1 Features
# features = data.ix[:,'len_q1':]

#Method 2 Features
features = diff_idf
y = data.ix[:,'is_duplicate']

X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2)
##########################################

In [36]:
##########################################
# Random Forest Classifier
##########################################
#Loads required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#Create a Random Forest Classifer (clf by convention = 'classifier')
clf = RandomForestClassifier(n_jobs=2) #n_jobs = # of jobs in run in parallel for fit and predict

#Train the Random Forest Classifier
clf.fit(X_train, y_train)
##########################################

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [37]:
##########################################
# Apply Random Forest Classifer on the testing split of the dataset
##########################################
#Predicts the outcome variable of the testing split of the dataset
test_prediction = clf.predict(X_test)

#Prediction probability for the value of the outcome variable (0 or 1)
test_prediction_proba = clf.predict_proba(X_test)
##########################################

In [38]:
##########################################
# Apply Logloss function to Test Dataset Output
##########################################
from sklearn.metrics import log_loss

log_loss(np.array(y_test), test_prediction_proba)
##########################################

0.70306077046282545

In [5]:
#Preview newly added features to the dataset
# pd.options.display.max_colwidth = 100
# data.head()

In [6]:
# data.tail()

In [7]:
#Features of the training dataset
# data.ix[:,'len_q1':].head()

In [8]:
# data.ix[:,'len_q1':].tail()

In [9]:
#Outcome variable of the training dataset
# data.ix[:,'is_duplicate'].head()

In [10]:
# data.ix[:,'is_duplicate'].tail()

In [21]:
#Preview Prediction
# test_prediction[0:10]

In [23]:
#Preview the Prediction Probability [0, 1]
# test_prediction_proba[0:10]

In [20]:
#Displays Results in a Confusion Matrix
#Anything on the diagonal was classified correctly and anything off the diagonal was classified incorrectly
# pd.crosstab(y_test, test_prediction, rownames=['Actual Similarity'], colnames=['Predicted Similarity'])

In [19]:
#Displays a list of features that were the most important in affecting the accuracy of the classification
# important_features = list(zip(X_train, clf.feature_importances_))
# important_features

In [22]:
#Displays the accuracy score of the Random Forest Classifier on the test split
# clf_accuracy = accuracy_score(y_test, test_prediction)
# clf_accuracy

In [None]:
######################################################################################################################

In [39]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
data_test = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/test.csv')
##########################################

In [40]:
##########################################
# Feature Engineering Method 1: 
# Creating new input variables to improve ML algorithm performance
##########################################
#Feature: Length of Question
#Apply length function to every data.question1 & data.question2 observation
data_test['len_q1'] = data_test.question1.apply(lambda x: len(str(x)))
data_test['len_q2'] = data_test.question2.apply(lambda x: len(str(x)))

#Feature: Difference in length between the Questions
#Substract len_q1 from len_q2
data_test['len_diff'] = data_test.len_q1 - data_test.len_q2

#Feature: Character count of Question
#Strip whitespace in data.question1 & data.question2 and apply the length function
data_test['len_char_q1'] = data_test.question1.apply(lambda x: len(str(x).replace(' ', '')))
data_test['len_char_q2'] = data_test.question2.apply(lambda x: len(str(x).replace(' ', '')))

#Feature: Word count of Question
#Call split function on every data.question1 & data.question2 observation and apply the length function
data_test['len_word_q1'] = data_test.question1.apply(lambda x: len(str(x).split()))
data_test['len_word_q2'] = data_test.question2.apply(lambda x: len(str(x).split()))

#Feature: Common words between the Questions
#Intersection of data.question1 and data.question2
#Set function is applied so repeated words in a question is omitted from the final common word count
#Axis=1 to calculate the means column-wise (-->) rather than the default of Axis=0 to calculate the means row-wise(v)
data_test['len_common_words'] = data_test.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
##########################################

In [41]:
##########################################
# Feature Engineering Method 2:
# Create Bag Of Words Model with Tfidf Normalization
##########################################
from sklearn.feature_extraction.text import TfidfVectorizer

#Generate tfidf values for question1 and question2 based on the complete vocabulary of the dataset
vectorizer_q1 = TfidfVectorizer(vocabulary = complete_vocab)
vectorizer_q2 = TfidfVectorizer(vocabulary = complete_vocab)
tfidf_question1 = vectorizer_q1.fit_transform(data_test.question1.values.astype(str))
tfidf_question2 = vectorizer_q2.fit_transform(data_test.question2.values.astype(str))

#Substract the difference of the tfidf weight matricies for the two questions
#Will be 0 if the words are weighted the same in both questions (similar significance contribution)
diff_idf = tfidf_question1 - tfidf_question2
##########################################

In [42]:
##########################################
# Apply Random Forest Classifer on the Test Dataset
##########################################
#Features of the test dataset
#Method 1 Features
#data_test_features = data_test.ix[:,'len_q1':]

#Method 2 Features
data_test_features = diff_idf

#Predicts the outcome variable of the Test Dataset
test_prediction = clf.predict(data_test_features)

#Prediction probability for the value of the outcome variable (0 or 1)
test_prediction_proba = clf.predict_proba(data_test_features)
##########################################

In [48]:
##########################################
# Create Submission File
##########################################
submission = pd.DataFrame()
submission['test_id'] = data_test.test_id
submission['is_duplicate'] = test_prediction

submission.to_csv('/Users/Priscilla/Desktop/QuoraDataset/submission.csv', index = False)
##########################################

In [49]:
len(submission)

2345796