In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
##########################################

In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
#Read the CSV and discard the identifier columns (id, qid1, qid2) in one chained expression
data = (
    pd.read_csv('/stfm/research5/m1pll00/quora/dataset/train.csv')
    .drop(['id', 'qid1', 'qid2'], axis=1)
)
##########################################

In [None]:
##########################################
# Feature Engineering Method 1: 
# Creating new input variables to improve ML algorithm performance
##########################################
def add_length_features(frame):
    """Add the hand-crafted length and word-overlap features of a question pair.

    The new columns are written into ``frame`` itself, always in the same
    order, and the same object is returned.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``question1`` and ``question2`` columns. Every value is
        converted with ``str`` first, so a missing value is measured as the
        text 'nan'.

    Returns
    -------
    pandas.DataFrame
        ``frame`` with the columns len_q1, len_q2, len_diff, len_char_q1,
        len_char_q2, len_word_q1, len_word_q2 and len_common_words added.
    """
    #Stringify once so every feature below works on the same text
    q1 = frame['question1'].apply(str)
    q2 = frame['question2'].apply(str)

    #Feature: Length of Question (all characters, spaces included)
    frame['len_q1'] = q1.apply(len)
    frame['len_q2'] = q2.apply(len)

    #Feature: Difference in length between the Questions (len_q1 minus len_q2)
    frame['len_diff'] = frame['len_q1'] - frame['len_q2']

    #Feature: Character count of Question (space characters removed)
    frame['len_char_q1'] = q1.apply(lambda s: len(s.replace(' ', '')))
    frame['len_char_q2'] = q2.apply(lambda s: len(s.replace(' ', '')))

    #Feature: Word count of Question (whitespace-separated tokens)
    frame['len_word_q1'] = q1.apply(lambda s: len(s.split()))
    frame['len_word_q2'] = q2.apply(lambda s: len(s.split()))

    #Feature: Common words between the Questions
    #Sets are used so a word repeated inside one question is only counted once.
    #Walking the two columns in parallel avoids building a Series for every row,
    #which is what DataFrame.apply(axis=1) does.
    frame['len_common_words'] = [
        len(set(a.lower().split()) & set(b.lower().split()))
        for a, b in zip(q1, q2)
    ]
    return frame

#Add the features to the training data
data = add_length_features(data)
##########################################

In [None]:
##########################################
# Feature Engineering Method 2:
# Create Bag Of Words Model with Tfidf Normalization
##########################################
from sklearn.feature_extraction.text import TfidfVectorizer

#Obtain the complete vocabulary for the entire dataset
#question1 is stacked on top of question2 so that ONE vectorizer learns one
#vocabulary and one set of IDF weights shared by both questions.
#Plain column indexing is used instead of the deprecated DataFrame.ix accessor.
question1_text = data['question1'].values.astype('str')
question2_text = data['question2'].values.astype('str')
questions_combined = list(question1_text) + list(question2_text)
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(questions_combined)

#86153 unique vocabulary words question1 & question2 combined
#vocabulary_ maps term -> column index; sorting by that index lists the terms in column order
complete_vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

#The rows of tfidf follow questions_combined: the first block is question1, the second block is question2.
#Slicing reuses the matrix that was just computed and guarantees both questions share the same IDF
#weights (fitting a separate vectorizer per question gives each its own weights).
n_questions = len(question1_text)
tfidf_question1 = tfidf[:n_questions]
tfidf_question2 = tfidf[n_questions:]

#Names kept so any code that still refers to the per-question vectorizers resolves
vectorizer_q1 = vectorizer_q2 = vectorizer

#Subtract the tfidf weight matrices of the two questions
#An entry is 0 when a word carries the same weight in both questions
diff_idf = tfidf_question1 - tfidf_question2
##########################################

In [3]:
##########################################
# Feature Engineering Method 3:
# Word2Vec Model
##########################################
#Training own word2vec model based on the training data we have
#Do not remove numbers or stop words so the algorithm can have a broader context of the sentence to produce higher quality vectors
import re
from nltk.corpus import stopwords
import nltk.data

#Tokenizer for sentence splitting
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#Function to prep question1 and question2 for word2vec model
#Word2vec expects a list of lists as input (single sentences each as a list of words)
def question_to_wordlist(text, remove_stopwords = False):
    """Convert one question (or sentence) into a list of lower-case words.

    Every character that is not an ASCII letter is replaced by a space before
    splitting, so digits and punctuation never appear in the result.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (e.g. a missing value in a pandas
        column) are treated as empty text.
    remove_stopwords : bool, optional
        When True, the NLTK English stop words are dropped from the result.

    Returns
    -------
    list of str
        The remaining words in their original order.
    """
    #re.sub cannot handle missing values; NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = re.sub("[^a-zA-Z]", " ", text).lower().split()

    if remove_stopwords:
        #Build the stop word set once and keep it on the function instead of
        #rebuilding it for every question
        stops = getattr(question_to_wordlist, "_stops", None)
        if stops is None:
            stops = set(stopwords.words("english"))
            question_to_wordlist._stops = stops
        words = [w for w in words if w not in stops]
    return words

def question_to_sentences(text, tokenizer, remove_stopwords = False):
    """Split a question into sentences and turn each sentence into a word list.

    ``tokenizer.tokenize`` is applied to the stripped text; every non-empty
    piece it returns is passed to ``question_to_wordlist``. The result is a
    list of word lists, one per sentence.
    """
    pieces = tokenizer.tokenize(text.strip())
    return [question_to_wordlist(piece, remove_stopwords)
            for piece in pieces if len(piece) > 0]
##########################################

In [None]:
#Import Encoding Detection library
import chardet

#Prep data for word2vec
sentences = []

print("Parsing sentences from training set...")
#Converting question1 to sentences for word2vec model
#Iterating over the column directly avoids a label lookup per row
for question in data['question1']:
    #Skip missing questions
    if pd.isnull(question):
        continue
    try:
        sentences += question_to_sentences(question, tokenizer)
    except Exception:
        #Fallback: treat the value as a byte string, detect its encoding and retry on the decoded text.
        #encoding is reset first so the message below never shows the value of an earlier row.
        encoding = None
        try:
            encoding = chardet.detect(question)['encoding']
            sentences += question_to_sentences(question.decode(encoding), tokenizer)
        except Exception as err:
            print("Skipping question1 entry (detected encoding: %s): %s" % (encoding, err))

In [None]:
#Converting question2 to sentences for word2vec model
#Iterating over the column directly avoids a label lookup per row
for question in data['question2']:
    #Skip missing questions
    if pd.isnull(question):
        continue
    try:
        sentences += question_to_sentences(question, tokenizer)
    except Exception:
        #Fallback: treat the value as a byte string, detect its encoding and retry on the decoded text.
        #encoding is reset first so the message below never shows the value of an earlier row.
        encoding = None
        try:
            encoding = chardet.detect(question)['encoding']
            sentences += question_to_sentences(question.decode(encoding), tokenizer)
        except Exception as err:
            print("Skipping question2 entry (detected encoding: %s): %s" % (encoding, err))

In [4]:
#Library for printing output messages
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

#Set parameters for word2vec model
#Word vector dimensionality
num_features = 300
#Minimum word count (min times a word has to appear to be meaningful, should be between 0-100 depending on dataset size)
min_word_count = 10
#Number of threads to run in parallel (only useful if have cython installed)
num_workers = 4
#Context window size (how many words apart from current one can affect the meaning of the current word)
context = 5
#Downsample setting for frequent words (words that appear with higher frequency will be randomly down-sampled)
downsampling = 1e-3

In [None]:
#Library for training word2vec model
from gensim.models import word2vec

#Training word2vec model on the parsed sentences with the parameters set above
print("Training word2vec model...")
model = word2vec.Word2Vec(
    sentences,
    workers = num_workers,
    size = num_features,
    min_count = min_word_count,
    window = context,
    sample = downsampling
)

In [None]:
#File name the trained model is stored under
model_name = "300features_10minwords_5context"

#No further training is planned, so init_sims is called to make the model memory-efficient
model.init_sims(replace = True)

#Save the model
model.save(model_name)

In [5]:
#Reload the saved word2vec model and display the shape of its weight matrix:
#(# words in model's vocab, size of feature vector)
#Library for training word2vec model
from gensim.models import word2vec, KeyedVectors

#NOTE(review): this file is written by model.save() on the trained Word2Vec model, and the
#cells below read model.wv, so the loaded object is presumably the full Word2Vec model -
#confirm whether word2vec.Word2Vec.load is the intended loader here.
model = KeyedVectors.load("300features_10minwords_5context")
model.wv.syn0.shape
# model["flower"]

(20224, 300)

In [6]:
#Average word vectors for each question
def questionFeatureVec(question, model, num_features):
    """Average the word vectors of the in-vocabulary words of one question.

    Parameters
    ----------
    question : iterable of str
        The words of the question.
    model : object
        Must expose ``model.wv.index2word`` (the vocabulary words) and
        support ``model[word]`` lookups returning a vector.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features``: the mean of the vectors of the
        words found in the vocabulary, or all zeros when no word is found
        (dividing by zero would fill the vector with NaN).
    """
    #Building the vocabulary set costs one pass over the whole vocabulary, so it is
    #cached per model object instead of being rebuilt for every question.
    #The size comparison rebuilds the set if the model's vocabulary has changed.
    cached = getattr(questionFeatureVec, "_vocab_cache", None)
    if (cached is None or cached[0] is not model
            or len(cached[1]) != len(model.wv.index2word)):
        cached = (model, set(model.wv.index2word))
        questionFeatureVec._vocab_cache = cached
    vocabulary = cached[1]

    featureVec = np.zeros((num_features,), dtype = "float32")
    num_words = 0

    #Add up the vectors of every word of the question that the model knows
    for word in question:
        if word in vocabulary:
            num_words += 1
            featureVec = np.add(featureVec, model[word])

    #No known word: keep the zero vector rather than dividing by zero
    if num_words == 0:
        return featureVec

    #Divide the total by the number of known words to get the average
    return np.divide(featureVec, num_words)

#Create average word vector for entire dataset
def makeFeatureVec(questions, model, num_features):
    """Build the matrix of averaged word vectors for a list of questions.

    Row i of the returned float32 array of shape
    (len(questions), num_features) is questionFeatureVec applied to
    questions[i].
    """
    dataFeatureVec = np.zeros((len(questions), num_features), dtype = "float32")

    #Fill one row per question with its averaged word vector
    for row, question in enumerate(questions):
        dataFeatureVec[row] = questionFeatureVec(question, model, num_features)
    return dataFeatureVec

In [7]:
#Turn every question1 entry into a list of words (stop words removed)
#as input for the averaged word vectors
clean_question1 = [
    question_to_wordlist(question, remove_stopwords = True)
    for question in data.question1
]

In [8]:
#One row of averaged word vectors per cleaned question1 entry
vecs_question1 = makeFeatureVec(clean_question1, model, num_features)



In [None]:
#Turn every question2 entry into a list of words (stop words removed)
#as input for the averaged word vectors
clean_question2 = [
    question_to_wordlist(question, remove_stopwords = True)
    for question in data.question2
]

In [None]:
#One row of averaged word vectors per cleaned question2 entry
vecs_question2 = makeFeatureVec(clean_question2, model, num_features)

In [None]:
#Place the question1 and question2 vectors side by side so each row describes one question pair
#(a plain Python list of the two matrices would be read by the classifier as 2 samples, not one per pair)
vecs_merge = np.hstack((vecs_question1, vecs_question2))
#Number of rows - should equal the number of labels
len(vecs_merge)

In [None]:
#Number of rows in the is_duplicate target column
len(data['is_duplicate'])

In [None]:
#Fit a Random Forest Classifier to the dataset - Using 100 trees
from sklearn.ensemble import RandomForestClassifier

#A fixed random_state makes repeated runs build the same forest
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
forest = forest.fit(vecs_merge, data['is_duplicate'])

In [None]:
# model.wv.index2word

In [None]:
# model.doesnt_match("man woman child kitchen".split())

In [None]:
# model.doesnt_match("france england germany berlin".split())

In [None]:
# model.doesnt_match("paris berlin london austria".split())

In [None]:
# model.most_similar("man")

In [None]:
# model.most_similar("queen")

In [None]:
# model.most_similar("awful")

In [None]:
# import chardet
# result = chardet.detect(data['question1'][11077])['encoding']
# print(result)
# print(data['question1'][11077].decode('Windows-1254'))

In [None]:
##########################################
# Split the dataset into training and testing datasets
##########################################
#Loads the library required for splitting the dataset
from sklearn.model_selection import train_test_split

#Method 1 Features
# features = data.loc[:,'len_q1':]

#Method 2 Features
features = diff_idf
#Plain column indexing replaces the deprecated DataFrame.ix accessor
y = data['is_duplicate']

#80/20 split; a fixed random_state puts the same rows in each part on every run
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)
##########################################

In [None]:
##########################################
# Random Forest Classifier
##########################################
#Loads required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#Create a Random Forest Classifier (clf by convention = 'classifier')
#n_jobs = number of jobs to run in parallel for fit and predict
#A fixed random_state makes repeated runs build the same forest
clf = RandomForestClassifier(n_jobs=2, random_state=42)

#Train the Random Forest Classifier
clf.fit(X_train, y_train)
##########################################

In [None]:
##########################################
# Apply Random Forest Classifier on the testing split of the dataset
##########################################
#Class probabilities for every row of the held-out split (one column per class)
test_prediction_proba = clf.predict_proba(X_test)

#Predicted class label for every row of the held-out split
test_prediction = clf.predict(X_test)
##########################################

In [None]:
##########################################
# Apply Logloss function to Test Dataset Output
##########################################
from sklearn.metrics import log_loss

#Score the predicted class probabilities against the true labels of the held-out split
log_loss(
    np.array(y_test),
    test_prediction_proba
)
##########################################

In [None]:
#Preview newly added features to the dataset
# pd.options.display.max_colwidth = 100
# data.head()

In [None]:
# data.tail()

In [None]:
#Features of the training dataset
# data.ix[:,'len_q1':].head()

In [None]:
# data.ix[:,'len_q1':].tail()

In [None]:
#Outcome variable of the training dataset
# data.ix[:,'is_duplicate'].head()

In [None]:
# data.ix[:,'is_duplicate'].tail()

In [None]:
#Preview Prediction
# test_prediction[0:10]

In [None]:
#Preview the Prediction Probability [0, 1]
# test_prediction_proba[0:10]

In [None]:
#Displays Results in a Confusion Matrix
#Anything on the diagonal was classified correctly and anything off the diagonal was classified incorrectly
# pd.crosstab(y_test, test_prediction, rownames=['Actual Similarity'], colnames=['Predicted Similarity'])

In [None]:
#Displays a list of features that were the most important in affecting the accuracy of the classification
# important_features = list(zip(X_train, clf.feature_importances_))
# important_features

In [None]:
#Displays the accuracy score of the Random Forest Classifier on the test split
# clf_accuracy = accuracy_score(y_test, test_prediction)
# clf_accuracy

In [None]:
######################################################################################################################

In [None]:
##########################################
# Loads in Quora Test Dataset
##########################################
#Test Dataset
#NOTE(review): this is an absolute path under /Users/Priscilla, while the training CSV is read
#from /stfm/research5/... - confirm both files are reachable from the machine running the notebook.
data_test = pd.read_csv('/Users/Priscilla/Desktop/QuoraDataset/test.csv')
##########################################

In [None]:
##########################################
# Feature Engineering Method 1: 
# Creating new input variables to improve ML algorithm performance
##########################################
#(column suffix, source column) for the two questions of a pair
question_columns = (('q1', 'question1'), ('q2', 'question2'))

#Feature: Length of Question
#Number of characters of the stringified question
for tag, column in question_columns:
    data_test['len_' + tag] = data_test[column].apply(lambda q: len(str(q)))

#Feature: Difference in length between the Questions (len_q1 minus len_q2)
data_test['len_diff'] = data_test['len_q1'] - data_test['len_q2']

#Feature: Character count of Question
#Space characters are removed before counting
for tag, column in question_columns:
    data_test['len_char_' + tag] = data_test[column].apply(lambda q: len(str(q).replace(' ', '')))

#Feature: Word count of Question
#Number of whitespace-separated tokens
for tag, column in question_columns:
    data_test['len_word_' + tag] = data_test[column].apply(lambda q: len(str(q).split()))

#Feature: Common words between the Questions
#Sets are used so a word repeated inside one question is only counted once
def _shared_word_count(row):
    words_q1 = set(str(row['question1']).lower().split())
    words_q2 = set(str(row['question2']).lower().split())
    return len(words_q1.intersection(words_q2))

#axis=1 hands the function one row (one question pair) at a time
data_test['len_common_words'] = data_test.apply(_shared_word_count, axis=1)
##########################################

In [None]:
##########################################
# Feature Engineering Method 2:
# Create Bag Of Words Model with Tfidf Normalization
##########################################
from sklearn.feature_extraction.text import TfidfVectorizer

#Generate tfidf values for question1 and question2 of the test set.
#The vectorizer fitted on the training questions is reused with transform() only:
#calling fit_transform here would re-learn the IDF weights from the test data, producing
#features on a different scale from the ones the classifier was trained on.
#Its columns are in the same order as complete_vocab, which was taken from this vectorizer.
tfidf_question1 = vectorizer.transform(data_test.question1.values.astype(str))
tfidf_question2 = vectorizer.transform(data_test.question2.values.astype(str))

#Names kept so any code that still refers to the per-question vectorizers resolves
vectorizer_q1 = vectorizer_q2 = vectorizer

#Subtract the tfidf weight matrices of the two questions
#An entry is 0 when a word carries the same weight in both questions
#NOTE: this overwrites the training-set diff_idf / tfidf_question1 / tfidf_question2
diff_idf = tfidf_question1 - tfidf_question2
##########################################

In [None]:
##########################################
# Apply Random Forest Classifier on the Test Dataset
##########################################
#Features of the test dataset
#Method 1 Features
#data_test_features = data_test.ix[:,'len_q1':]

#Method 2 Features
data_test_features = diff_idf

#Class probabilities for every row of the Test Dataset (one column per class)
test_prediction_proba = clf.predict_proba(data_test_features)

#Predicted class label for every row of the Test Dataset
test_prediction = clf.predict(data_test_features)
##########################################

In [None]:
##########################################
# Create Submission File
##########################################
#Two-column frame: the test ids next to the predicted labels (column order fixed explicitly)
#NOTE(review): the hold-out evaluation above scores predict_proba output with log_loss, while
#this file stores the hard labels from predict - confirm which one the submission should contain.
submission = pd.DataFrame(
    {'test_id': data_test.test_id, 'is_duplicate': test_prediction},
    columns=['test_id', 'is_duplicate']
)

submission.to_csv('/Users/Priscilla/Desktop/QuoraDataset/submission.csv', index = False)
##########################################

In [None]:
#Number of rows in the submission frame
len(submission)