In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, KFold
from sklearn import linear_model
import matplotlib.pyplot as plt 
%matplotlib inline

from gensim.models import KeyedVectors

In [3]:
#Import the data 
column_names = ['qid', 'question_text', 'target']
df = pd.read_csv('./train.csv', names=column_names, header=None, na_values='?', low_memory=False)

path_to_embeddings = "./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
embeddings_index = KeyedVectors.load_word2vec_format(path_to_embeddings, binary=True)

Y = np.array(df['target'][1:2001])
X = np.array(df['question_text'][1:2001])

X_words, X_test_words, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [14]:
df.head(6)

Unnamed: 0,qid,question_text,target
0,qid,question_text,target
1,00002165364db923c7e6,How did Quebec nationalists see their province...,0
2,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
3,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
4,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
5,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
def fix_matrix(resulting_array, max_row_length): 
    if resulting_array.size == 0: 
        resulting_array = np.reshape(resulting_array, (0, max_row_length))
    else: 
        diff = max_row_length - resulting_array.shape[1]
        if diff % 2 == 1: 
            left = diff // 2 + 1 
        else: 
            left = diff // 2 
        right = diff // 2 
        
        resulting_array = np.pad(resulting_array, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    return resulting_array

In [5]:
def fix_row_length(array, max_row_length):
    if len(array.shape) < 2:
        array_len = array.shape[0] #array has to be a numpy array
    else: 
        array_len = array.shape[1] #array has to be a numpy array
    
    
    if array_len < max_row_length: 
        diff = max_row_length - array_len
        if diff % 2 == 1: 
            left = diff // 2 + 1 
        else: 
            left = diff // 2 
        right = diff // 2 
        if len(array.shape) < 2:
            array = np.pad(array, (left,right), 'constant', constant_values=(0,0))
        else:
            array = np.pad(array, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    return array

In [6]:
# extract the embeddings
# create a vector with the embeddings values 
# put all the embeddings together to create a row in the matrix 
# check if the lenghth of the concatenated embeddings array is bigger than the current length of each row in np.array
# if it's shorter, pass it on to the fix_length function

def vectorize_questions(words):
    max_row_length = 0
    resulting_array = np.array([])

    for sentence in words: 
        stripped_sentence = sentence.replace("?", "")
        words = stripped_sentence.split()
        sentence_vector = []
        for word in words: 
            if word in embeddings_index: 
                sentence_vector.extend(embeddings_index[word])

        #turn it into a np_array
        sentence_vector = np.array(sentence_vector)
        sentence_vector = np.reshape(sentence_vector, (1, sentence_vector.shape[0]))

        if sentence_vector.shape[1] > max_row_length: 
            max_row_length = sentence_vector.shape[1]
            resulting_array = fix_matrix(resulting_array, max_row_length)
        elif sentence_vector.shape[1] < max_row_length:     
            sentence_vector=fix_row_length(sentence_vector, max_row_length)

        resulting_array = np.append(resulting_array, sentence_vector, axis=0)
    return resulting_array


In [7]:
#Pre-process training set and test set
X_train = vectorize_questions(X_words)
X_test = vectorize_questions(X_test_words)
print(X_train.shape)
print(X_test.shape)

#fix shape difference

diff = abs(X_train.shape[1] - X_test.shape[1])
if diff % 2 == 1: 
    left = diff // 2 + 1 
else: 
    left = diff // 2 
right = diff // 2 

if (X_train.shape < X_test.shape): 
    X_train = np.pad(X_train, ((0,0),(left,right)), 'constant', constant_values=(0,0))
else:
    X_test = np.pad(X_test, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    
print(X_train.shape)
print(X_test.shape)

(1500, 13200)
(500, 11400)
(1500, 13200)
(500, 13200)


In [8]:
logreg = linear_model.LogisticRegression(C=.001, penalty='l1')
logreg.fit(X_train, Y_train)



LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
yhat =  logreg.predict(X_test)
accuracy = np.mean(np.equal(yhat, Y_test).astype(int))
print(accuracy)

0.942


In [10]:
from sklearn import svm
svc = svm.SVC(probability=False,  kernel="rbf", C=2.8, gamma=.0073,verbose=10)

In [11]:
svc.fit(X_train,Y_train)

[LibSVM]

SVC(C=2.8, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0073, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=10)

In [12]:
yhat_svm = svc.predict(X_test)
acc = np.mean(np.equal(yhat_svm, Y_test).astype(int))
print('Accuaracy = {0:f}'.format(acc))

Accuaracy = 0.942000
