In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import naive_bayes
import pickle

In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
data['question1'] = data['question1'].astype(str)
data['question2'] = data['question2'].astype(str)

test_data = pd.read_csv('test.csv')
test_data['question1'] = test_data['question1'].astype(str)
test_data['question2'] = test_data['question2'].astype(str)

#Drop irrelevant features
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
##########################################

In [3]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])

In [5]:
#Read in the word dictionary
complete_word_list = open('word_dictionary_training_set.txt', 'r')
word_list = complete_word_list.read().split('\n')
word_list = list(set(word_list))
word_list = [x for x in word_list if x != '']
total_questions = data['question1'] + data['question2']
total_questions = [x for x in total_questions if type(x) != float]

In [6]:
#Fit a 50 topic LDA model to a vectorized version of the questions
vectorizer = CountVectorizer(stop_words='english', vocabulary = word_list)
tf = vectorizer.fit_transform(total_questions)
lda_fit = LatentDirichletAllocation(n_topics=50).fit(tf)



In [8]:
#Export tf-idf difference, entropy based Random Forest Classifier
filename = 'lda_fit.sav'
pickle.dump(lda_fit, open(filename, 'wb'))

In [None]:
#View the top words in the LDA representation
no_top_words = 20
tf_feature_names = vectorizer.get_feature_names()
display_topics(lda_fit, tf_feature_names, no_top_words)

In [None]:
#Train a Random Forest Classifier on the LDA input matrix
q1_train = lda_fit.transform(data['question1'])
q2_train = lda_fit.transform(data['question2'])
training_input = q1_train - q2_train
X_train, X_test, y_train, y_test = train_test_split(training_input, data['is_duplicate'], test_size=0.2)
#Create a decision tree classifier object
lda_svm_classifier = svm.SVC(kernel='linear')
#Train the Decision Forest Classifier
lda_svm_classifier.fit(X_train, y_train)
#predict on the test set
test_prediction = lda_classifier.predict(X_test)
print accuracy_score(y_test, test_prediction)

In [383]:
#Print the Accuracy
print accuracy_score(y_test, test_prediction)
#Print the Precision
print precision_score(y_test, test_prediction, average='weighted')
#Print the confusion matrix
print confusion_matrix(y_test, test_prediction)

0.473526473526
0.444969595819
[[ 97   1   0  19   6]
 [ 53   3   0  39  14]
 [ 36   1   0  79  19]
 [ 21   0   0 181  83]
 [ 20   0   0 136 193]]


In [377]:
#Train a linear support vector machine on the tf-idf input matrix using the H&L dictionary
tf_vectorizer = TfidfVectorizer(vocabulary = word_list)
tf_features = tf_vectorizer.fit_transform(ohio_reviews)
X_train, X_test, y_train, y_test = train_test_split(tf_features, ohio_stars, test_size=0.2)
#Create a decision tree classifier object
tf_classifier = svm.SVC(kernel='linear')
#Train the Decision Forest Classifier
tf_classifier.fit(X_train, y_train)
#Predict on the test set
test_prediction = tf_classifier.predict(X_test)

In [378]:
#Print the Accuracy
print accuracy_score(y_test, test_prediction)
#Print the Precision
print precision_score(y_test, test_prediction, average='weighted')
#Print the confusion matrix
print confusion_matrix(y_test, test_prediction)

0.508491508492
0.489525644744
[[ 80  20   4  12   5]
 [ 27  27  17  26   6]
 [ 19  15  27  61  26]
 [ 10   2  20 122 128]
 [  8   2   2  82 253]]
