In [24]:
"""module docstring"""

# imports
import sys
import gensim
import math
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
#from sklearn.preprocessing import Imputer

# import os
# import csv
import re
from nltk.corpus import stopwords
import numpy as np




# exception classes

# interface functions

# classes

In [3]:
# internal functions & classes

In [4]:
# extracts word vectors from a question

def question2words(question, stops):
    """
    Tokenise a raw question into its meaningful lowercase words.

    :param question: single question string
    :param stops: collection of stop words to drop (tested with ``in``)
    :return: list of lowercase words, in their original order, that are at
        least two characters long and not contained in ``stops``
    """
    # Every non-letter becomes a space, so "C" and "C++" both reduce to "c"
    scrubbed = re.sub("[^a-zA-Z]", " ", question).lower()

    kept = []
    for token in scrubbed.split():
        # Skip stop words and single-character leftovers
        if token in stops or len(token) < 2:
            continue
        kept.append(token)
    return kept

def qwords2vector(words, model, index2word_set, num_features):
    """
    Average the embedding vectors of the in-vocabulary words of a question.

    :param words: iterable of word tokens
    :param model: mapping-like object; ``model[word]`` yields the word's vector
    :param index2word_set: container of the words that ``model`` can look up
    :param num_features: length of the vectors
    :return: element-wise mean of the vectors of the known words, or an
        all-zero float32 vector of length ``num_features`` when no word of
        the question is known
    """
    total = np.zeros((num_features,), dtype="float32")
    known = 0.

    # Sum the vectors of the known words; words outside the vocabulary are
    # ignored and do not count towards the average
    for token in words:
        if token not in index2word_set:
            continue
        known += 1.
        total = total + model[token]

    # Nothing to average: hand back the zero vector instead of dividing by 0
    if known == 0:
        return total

    return total / known

def qwords2vectorOfvectors(words, model, index2word_set, num_features):
    """
    Look up the embedding vector of every in-vocabulary word of a question.

    Unlike an average, the individual vectors are kept separate.

    :param words: iterable of word tokens
    :param model: mapping-like object; ``model[word]`` yields the word's vector
    :param index2word_set: container of the words that ``model`` can look up
    :param num_features: not used by this function
    :return: list with one vector per known word, in question order
        (empty when no word is known)
    """
    vectors = []
    for token in words:
        if token in index2word_set:
            vectors.append(model[token])
    return vectors

def cos_dis(x,y):
    """
    Sum of the absolute element-wise differences of ``x`` and ``y``, divided
    by the product of their Euclidean norms.

    NOTE(review): despite the name, no dot product is involved, so this is
    not the cosine of the angle between the vectors -- confirm the intended
    metric before relying on it.

    :param x: sequence of numbers
    :param y: sequence of numbers; elements are paired with ``zip``, so
        anything beyond the shorter input is ignored in the difference sum
    :return: the scaled difference, or 0.0 when either input has zero norm
    """
    norm_x = math.sqrt(sum(v * v for v in x))
    norm_y = math.sqrt(sum(v * v for v in y))
    scale = norm_x * norm_y

    # A zero vector would make the division undefined
    if scale == 0:
        return 0.0

    l1_gap = sum(abs(a - b) for a, b in zip(x, y))
    return l1_gap / scale

def argmax(lst):
    """
    Position of the largest element of ``lst``.

    :param lst: non-empty list (must support ``max`` and ``.index``)
    :return: index of the first occurrence of the maximum value
    """
    largest = max(lst)
    return lst.index(largest)

def find_best_matched(word, question):
    """
    Index of the entry of ``question`` with the highest ``cos_dis`` score
    against ``word``.

    NOTE(review): ``cos_dis`` is built from absolute differences, so the
    highest score looks like the *least* similar entry -- confirm whether
    the maximum or the minimum is the intended "best match".

    :param word: vector compared against every entry
    :param question: iterable of vectors
    :return: index of the first entry that reaches the largest score
    """
    scores = []
    for candidate in question:
        scores.append(cos_dis(word, candidate))
    return argmax(scores)

def abs_dist(x,y):
    """
    Element-wise absolute difference of two sequences.

    :param x: sequence of numbers
    :param y: sequence of numbers
    :return: list of ``abs(x[i] - y[i])``; elements are paired with ``zip``,
        so the result is as long as the shorter input
    """
    gaps = []
    for left, right in zip(x, y):
        gaps.append(abs(left - right))
    return gaps

In [5]:
# Main

## embedding model
# Load the pre-trained word vectors from a binary word2vec-format file
model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [6]:
## Load data ##
# Relative paths of the training and test CSV files
train_address = "../Data/train.csv"
test_address = "../Data/test.csv"

# Only the training set is read; loading the test set is disabled below
train_data = pd.read_csv(train_address)
#test_data = pd.read_csv(test_address)

In [7]:
# Show the column names of the training frame
print(list(train_data.columns))
#print(list(test_data))

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']


In [8]:
## separate input and result

# columns: ['test_id', 'question1', 'question2', 'is_duplicate']
#test_id = test_data['test_id']
#test_question1 = test_data['question1']
#test_question2 = test_data['question2']

# columns: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
# Pull every training column out into its own variable
(train_id, train_qid1, train_qid2,
 train_question1, train_question2, train_is_duplicate) = (
    train_data[column]
    for column in ('id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate')
)

In [9]:
## clean the input
# In Python, searching a set is much faster than searching
#   a list, so convert the stop words to a set
stops = set(stopwords.words("english"))

# Missing questions are NaN in the frame, and str(NaN) is "nan", which would
# survive cleaning as a bogus word. Replace missing values with an empty
# string first, so that a missing question yields an empty word list.
clean_train_question1 = [question2words(str(x),stops) for x in train_question1.fillna("")]
clean_train_question2 = [question2words(str(x),stops) for x in train_question2.fillna("")]

#clean_test_question1 = [question2words(str(x),stops) for x in test_question1]
#clean_test_question2 = [question2words(str(x),stops) for x in test_question2]

In [11]:
# The model exposes its vocabulary as a list of words. Convert it to a set,
# for speed of the per-word membership tests.
# gensim 4 renamed `index2word` to `index_to_key` (the old attribute no longer
# works there), so prefer the new name and fall back to the old one.
index2word_set = set(
    model.index_to_key if hasattr(model, "index_to_key") else model.index2word
)
# Length of one word vector -- presumably 300 for the "negative300" model file
num_features = 300


## convert to vector
# One averaged vector per question
vectors_train_question1 = [qwords2vector(x, model, index2word_set, num_features) for x in clean_train_question1]
vectors_train_question2 = [qwords2vector(x, model, index2word_set, num_features) for x in clean_train_question2]

#vectors_test_question1 = [qwords2vector(x, model, index2word_set, num_features) for x in clean_test_question1]
#vectors_test_question2 = [qwords2vector(x, model, index2word_set, num_features) for x in clean_test_question2]

In [12]:
## compute the distance of question 1 and question 2
# One row per question pair: element-wise absolute difference of the two
# question vectors
train_distance_q1_q2 = list(map(abs_dist, vectors_train_question1, vectors_train_question2))
#test_distance_q1_q2 = [abs_dist(x,y) for x,y in zip(vectors_test_question1,vectors_test_question2)]

In [13]:
# Model inputs are the per-pair distance rows; the target is the
# is_duplicate column of the training data
train_features = train_distance_q1_q2
train_result = train_is_duplicate

In [14]:
# Hold out 20% of the question pairs for evaluation; the seed is fixed so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    train_result,
    test_size=0.2,
    random_state=4322,
)

In [16]:

#test_features = test_distance_q1_q2

# Fit a random forest to the training data, using 100 trees.
# random_state is pinned so that re-running the notebook rebuilds the same
# forest (and therefore the same scores) instead of a different one each time.
forest = RandomForestClassifier(n_estimators = 100, random_state = 4322)


print("Fitting a random forest to labeled training data...")
forest = forest.fit(X_train, y_train)

Fitting a random forest to labeled training data...


In [28]:
## output ##
# Target file layout:
#test_id,is_duplicate
#0,0.5
#1,0.4
#2,0.9

# Class probabilities for the held-out rows, one column per class
result_prob_tst = forest.predict_proba(X=X_test)
#output generation



test_roc_auc =  0.76416204888
test_log_loss =  0.76416204888


In [39]:
# ROC curve and the area under it, scored on column 1 of the predicted
# probabilities (presumably the is_duplicate == 1 class -- confirm against
# forest.classes_)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, result_prob_tst[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)

# Log loss of the same probability column against the held-out labels
lglss = log_loss(y_test, result_prob_tst[:, 1])

print("test_roc_auc = ",str(roc_auc))
print("test_log_loss = ",str(lglss))

test_roc_auc =  0.76416204888
test_log_loss =  0.560872901297


In [58]:
# Number of held-out rows that received a prediction
length = len(result_prob_tst[:, 1])
print(length)

# One predicted probability (column 1 of predict_proba) per held-out row.
# The 0-based row index is written out under the header "test_id", so the
# file starts with `test_id,is_duplicate` rather than an unnamed first column.
output_result = pd.DataFrame({'is_duplicate': result_prob_tst[:, 1]})
output_result.to_csv("output_1.csv", index_label="test_id")

80858
