In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, KFold
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt 
%matplotlib inline

from gensim.models import KeyedVectors

In [9]:
from sklearn import metrics

In [2]:
#Import the data 
column_names = ['qid', 'question_text', 'target']
df = pd.read_csv('./train.csv', names=column_names, header=None, na_values='?', low_memory=False)

path_to_embeddings = "./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
embeddings_index = KeyedVectors.load_word2vec_format(path_to_embeddings, binary=True)


In [20]:

Y = np.array(df['target'][1:5001])
X = np.array(df['question_text'][1:5001])

X_words, X_test_words, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [3]:
def fix_matrix(resulting_array, max_row_length): 
    if resulting_array.size == 0: 
        resulting_array = np.reshape(resulting_array, (0, max_row_length))
    else: 
        diff = max_row_length - resulting_array.shape[1]
        if diff % 2 == 1: 
            left = diff // 2 + 1 
        else: 
            left = diff // 2 
        right = diff // 2 
        
        resulting_array = np.pad(resulting_array, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    return resulting_array

In [4]:
def fix_row_length(array, max_row_length):
    if len(array.shape) < 2:
        array_len = array.shape[0] #array has to be a numpy array
    else: 
        array_len = array.shape[1] #array has to be a numpy array
    
    
    if array_len < max_row_length: 
        diff = max_row_length - array_len
        if diff % 2 == 1: 
            left = diff // 2 + 1 
        else: 
            left = diff // 2 
        right = diff // 2 
        if len(array.shape) < 2:
            array = np.pad(array, (left,right), 'constant', constant_values=(0,0))
        else:
            array = np.pad(array, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    return array

In [7]:
# extract the embeddings
# create a vector with the embeddings values 
# put all the embeddings together to create a row in the matrix 
# check if the lenghth of the concatenated embeddings array is bigger than the current length of each row in np.array
# if it's shorter, pass it on to the fix_length function

def vectorize_questions(words):
    max_row_length = 0
    resulting_array = np.array([])

    for sentence in words: 
        stripped_sentence = sentence.replace("?", "")
        words = stripped_sentence.split()
        sentence_vector = []
        for word in words: 
            if word in embeddings_index: 
                sentence_vector.extend(embeddings_index[word])

        #turn it into a np_array
        sentence_vector = np.array(sentence_vector)
        sentence_vector = np.reshape(sentence_vector, (1, sentence_vector.shape[0]))

        if sentence_vector.shape[1] > max_row_length: 
            max_row_length = sentence_vector.shape[1]
            resulting_array = fix_matrix(resulting_array, max_row_length)
        elif sentence_vector.shape[1] < max_row_length:     
            sentence_vector=fix_row_length(sentence_vector, max_row_length)

        resulting_array = np.append(resulting_array, sentence_vector, axis=0)
    return resulting_array


In [21]:
#Pre-process training set and test set
X_train = vectorize_questions(X_words)
X_test = vectorize_questions(X_test_words)
print(X_train.shape)
print(X_test.shape)

#fix shape difference

diff = abs(X_train.shape[1] - X_test.shape[1])
if diff % 2 == 1: 
    left = diff // 2 + 1 
else: 
    left = diff // 2 
right = diff // 2 

if (X_train.shape < X_test.shape): 
    X_train = np.pad(X_train, ((0,0),(left,right)), 'constant', constant_values=(0,0))
else:
    X_test = np.pad(X_test, ((0,0),(left,right)), 'constant', constant_values=(0,0))
    
print(X_train.shape)
print(X_test.shape)

(3750, 13200)
(1250, 11700)
(3750, 13200)
(1250, 13200)


In [22]:
n_clusters = 2
kmeans_model = KMeans(n_clusters=n_clusters, n_init=100)
yhat = kmeans_model.fit_predict(X_train)
# metrics.accuracy_score(Y_train, yhat)

In [23]:
accuracy = np.mean(np.equal(yhat, Y_train).astype(int))
print(accuracy)

0.0
