In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
import pickle
##########################################

In [2]:
##########################################
# Read the Quora train/test CSV files
##########################################
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Cast both question columns to str in each frame; a missing question
# therefore becomes the literal text 'nan'.
for frame in (data, test_data):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].astype(str)

# The id columns of the training frame are not used as features.
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
##########################################

In [3]:
##########################################
# Build the word vocabulary from the TRAINING questions only
# (question1 and question2 of `data`); test_data is not part of the loop below.
##########################################

vocab_terms = set()
for df in [data]:
    for i in range(1, 3):
        vectorizer = TfidfVectorizer()
        # Only the learned vocabulary is needed here, so fit() is enough;
        # the tf-idf matrix itself would be thrown away.
        vectorizer.fit(df['question'+str(i)])
        vocab_terms.update(vectorizer.get_feature_names())
# Sort so the term order (and with it the feature-column order used later)
# is identical on every run instead of following set iteration order.
complete_vocab_list = sorted(vocab_terms)

In [8]:
#Print out dictionary to be used in other classifier models
# Each term is reduced to its ASCII characters. Distinct terms can collapse to
# the same ASCII text, so keep only the first occurrence: complete_vocab_list is
# later passed as a fixed TfidfVectorizer vocabulary, which rejects repeated terms.
ascii_terms = []
seen_terms = set()
for term in complete_vocab_list:
    ascii_term = term.encode('ascii', 'ignore')
    if ascii_term not in seen_terms:
        seen_terms.add(ascii_term)
        ascii_terms.append(ascii_term)
# Rebinding is kept on purpose: later cells read complete_vocab_list.
complete_vocab_list = ascii_terms

# The with-block guarantees the file is flushed and closed.
with open('word_dictionary_training_set.txt', 'wb') as word_dict:
    for item in complete_vocab_list:
        # One term per line; the terms are already encoded byte strings.
        word_dict.write(item + b"\n")

In [5]:
# One document per training pair: question1 followed by question2.
# The space keeps the last word of question1 and the first word of question2
# from being glued together into a single token.
total_questions = data['question1'] + ' ' + data['question2']
# Keep only text entries; a float here would be a NaN left over from a missing value.
total_questions = [x for x in total_questions if not isinstance(x, float)]

In [9]:
##########################################
# Create a tfidf matrix using the complete vocab list and then classify according to the matrix
##########################################
# The vocabulary is supplied up front, so fit() only has the idf weights left to learn.
vectorizer = TfidfVectorizer(stop_words = 'english', vocabulary = complete_vocab_list)
vectorizer.fit(total_questions)
# q1/q2: training question columns; q3/q4: test question columns.
q1_idf = vectorizer.transform(data['question1'])
q2_idf = vectorizer.transform(data['question2'])
q3_idf = vectorizer.transform(test_data['question1'])
q4_idf = vectorizer.transform(test_data['question2'])

#Generate the features from the training data
# Signed difference, so the value depends on which question is listed first.
features_idf = q1_idf - q2_idf
# Label-based column selection; replaces the deprecated .ix indexer.
y = data.loc[:, 'is_duplicate']

In [15]:
# #Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(criterion='entropy', max_features = 300)
#Create a cross_validation accuracy score
scores = cross_val_score(rf_classifier, features_idf, y, cv=5)
# Mean of the 5 fold scores. scores is a numpy array, so .mean() replaces the
# hand-rolled reduce (not a builtin on Python 3, and its lambda reused the name y).
rf_accuracy = scores.mean()

In [16]:
# Display the mean cross-validated score of the random forest
print(rf_accuracy)

0.784454223877


In [17]:
#Fit the random forest classifier to the training data
# (features_idf: question1 - question2 tf-idf difference; y: is_duplicate labels).
# The call is the cell's last expression, so the fitted estimator's repr is shown as output.
rf_classifier.fit(features_idf, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=300, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [18]:
#Export tf-idf difference, entropy based Random Forest Classifier
filename = 'tfidf_rf_entropy.sav'
# Use a with-block so the pickle file is flushed and closed once the dump finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)

In [19]:
# Build the test features the same way as the training ones (question1 - question2
# tf-idf difference) and collect the predicted class labels in a plain list.
# A later cell swaps these labels for values derived from the cross-validated accuracy.
test_features_idf = q3_idf - q4_idf
test_prediction = list(rf_classifier.predict(test_features_idf))

In [30]:
# Turn each predicted label into a score: rf_accuracy for a non-zero label,
# 1 - rf_accuracy for a zero label.
btest_prediction = []
for label in test_prediction:
    btest_prediction.append(rf_accuracy if label != 0 else 1 - rf_accuracy)

In [31]:
# Assemble the submission table: one row per test_id, with the score derived
# from the RF label and the cross-validated accuracy as is_duplicate.
submission_columns = {'test_id':test_data['test_id'], 'is_duplicate': btest_prediction}
test_results_df = pd.DataFrame(submission_columns)

In [69]:
# Write test_results_df to the submission CSV, leaving out the row index.
# NOTE(review): the execution counts show the null-question override cell further
# down (In [68]) was run before this export (In [69]); on a top-to-bottom run this
# line executes first, so the CSV would not contain those overrides.
test_results_df.to_csv('submission_tfidf_rf.csv', index = False)

In [None]:
# #Create a linear SVM Classifier
svm_classifier = svm.SVC(kernel = 'linear')
#Create a cross_validation accuracy score
scores = cross_val_score(svm_classifier, features_idf, y, cv=5)
# Mean of the 5 fold scores. scores is a numpy array, so .mean() replaces the
# hand-rolled reduce (not a builtin on Python 3, and its lambda reused the name y).
svm_accuracy = scores.mean()

In [None]:
#Export tf-idf difference, linear SVM model
# cross_val_score fits copies of the estimator, so svm_classifier itself still
# has to be fitted on the full training data before it is worth saving.
svm_classifier.fit(features_idf, y)
filename = 'tfidf_linear_svm.sav'
# Dump the SVM (the original dumped rf_classifier into this file by mistake),
# and close the file when done.
with open(filename, 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)

In [68]:
#For test questions with a null value, we know that if not both are null then P(duplicate) = 0
# Missing questions were cast to the text 'nan' when the data was loaded.
# NOTE(review): a question whose real text is exactly "nan" is indistinguishable here.
q1_missing = test_data['question1'] == 'nan'
q2_missing = test_data['question2'] == 'nan'
# Exactly one of the two questions missing (XOR), as the rule above states;
# pairs where both are missing keep the classifier's score.
confident_results = list(test_data.loc[q1_missing ^ q2_missing, 'test_id'])
# Select rows by their test_id value rather than assuming the index label equals
# the test_id, and assign all of them in a single vectorized step.
override_rows = test_results_df['test_id'].isin(confident_results)
test_results_df.loc[override_rows, 'is_duplicate'] = 0
#Look at the result
if confident_results:  # nothing to show when no pair has a single missing question
    first_override = test_results_df.loc[test_results_df['test_id'] == confident_results[0], 'is_duplicate']
    print(first_override.iloc[0])

0.0
