In [1]:
import gensim
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
import time

In [146]:
from sklearn.svm import SVC
from sklearn import tree

In [149]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
train_data = pd.read_csv("train.csv")
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) 

In [45]:
def get_word2vec(description):
    description = description_list[0]
    description = description.lower()
    description = re.sub('[^a-zA-Z]', ' ', description )  
    description = re.sub(r'\s+', ' ', description)
    description_tokens = nltk.sent_tokenize(description)
    all_words = [nltk.word_tokenize(sent) for sent in description_tokens]
    for i in range(len(all_words)):  
        all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]
    all_words = all_words[0]
    avg_word2vec_words = np.zeros(300)
    for word in all_words:
        avg_word2vec_words += model[word]
        
    return avg_word2vec_words
    

In [136]:
def convert_text_col_word2vec(column):
    col_vector_list = []
    for description in column:
        description_vector = get_word2vec(description)
        col_vector_list.append(description_vector)
    return np.array(col_vector_list)

In [164]:
def _run_nn(train_feature_matrix, train_label_matrix, test_feature_matrix):
    nn_clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(3000), random_state=1)
    nn_train_feature_matrix = train_feature_matrix.astype(np.float64)
    nn_test_feature_matrix = test_feature_matrix.astype(np.float64)
    nn_clf.fit(nn_train_feature_matrix, train_label_matrix)
    nn_predictions = nn_clf.predict(nn_test_feature_matrix)
    return nn_predictions

In [141]:
def _run_svm(train_feature_matrix, train_label_matrix, test_feature_matrix):
    clf = SVC(gamma='auto')
    clf.fit(train_feature_matrix, train_label_matrix)
    predicted_labels = clf.predict(test_feature_matrix)
    return predicted_labels

In [142]:
def _run_dtree(train_feature_matrix, train_label_matrix, test_feature_matrix):
    dt_clf = tree.DecisionTreeClassifier()
    dt_clf = dt_clf.fit(train_feature_matrix, train_label_matrix)
    dt_predictions = dt_clf.predict(test_feature_matrix)
    return dt_predictions

In [None]:
def calculate_macro_f1_score(predictions, true_labels):
    true_positives = [0 for i in range(11)]
    false_positives = [0 for i in range(11)]
    false_negatives = [0 for i in range(11)]

    if len(predictions) != len(true_labels):
        print("bug in code, length of predictions should match length of true_labels")
        return None
    for i in range(len(predictions)):
        if predictions[i] == true_labels[i]:
            true_positives[predictions[i]] += 1
        else:
            false_positives[predictions[i]] += 1
            false_negatives[true_labels[i]] += 1

    total_classes = 0
    total_f1 = 0
    for i in range(11):
        if true_positives[i]==0 and false_positives[i]==0:
            continue
        elif true_positives[i]==0 and false_negatives[i]==0:
            continue
        prec = true_positives[i]*1.0/(true_positives[i] + false_positives[i])
        recall = true_positives[i]*1.0/(true_positives[i]+false_negatives[i])
        f1=0
        if prec+recall != 0:
            f1 = 2*prec*recall/(prec+recall)
            total_classes += 1
            total_f1 += f1
    return total_f1/total_classes

In [None]:
name_vector_column = convert_text_col_word2vec(train_data['Name'])
description_vector_column = convert_text_col_word2vec(train_data['Description'])

In [116]:
delete_columns = ['Name','Description','PetID']
dataset = train_data.drop(delete_columns, axis=1)

In [104]:
dataset = dataset.join(pd.DataFrame(name_vector_column), rsuffix='_name')

In [105]:
dataset = dataset.join(pd.DataFrame(description_vector_column), rsuffix='_description')

In [98]:
one_hot_columns = ['Breed1','Breed2','Type','Gender','Color1','Color2','Color3','Vaccinated','Dewormed','Sterilized',
                   'Health','MaturitySize','State','RescuerID']

In [119]:
for column in one_hot_columns:
    dataset=pd.get_dummies(dataset, columns=[column])

In [127]:
def split_dataset(t_frac, random_state, dataset):
    testset=dataset.sample(frac=t_frac,random_state=random_state)
    trainset=dataset.drop(testset.index)
    testset.to_csv("testSet.csv", index = False)
    trainset.to_csv("trainingSet.csv", index = False)
    return trainset, testset

In [128]:
trainset, validationset = split_dataset(0.3, 47, dataset)

In [129]:
def get_features_labels(dataset):
    labels = dataset['AdoptionSpeed']
    features = dataset.drop(['AdoptionSpeed'], axis=1)
    return features, labels

In [131]:
train_features_matrix, train_label_matrix = get_features_labels(trainset)
validation_features, validation_labels = get_features_labels(validationset)

In [143]:
len(validation_features)

4498

In [144]:
def get_predictions(model_name, train_features_matrix, train_label_matrix, test_features):
    predicted_labels = np.zeros(len(test_features))
    if model_name == 'nn':
        predicted_labels = _run_nn(train_features_matrix, train_label_matrix, test_features)
    elif model_name == 'svm':
        predicted_labels = _run_svm(train_features_matrix, train_label_matrix, test_features)
    elif model_name == 'dtree':
        predicted_labels = _run_dtree(train_features_matrix, train_label_matrix, test_features)
    return predicted_labels

In [159]:
models = ['svm','dtree','nn']
predicted_labels_dict = {}

for model in models:
    start_time = time.time()
    predicted_labels = get_predictions(model, train_features_matrix, train_label_matrix, validation_features)
    predicted_labels_dict[model]=predicted_labels
    end_time = time.time() - start_time
    print (model, "time taken", end_time)
    print (model, "micro f1 score", f1_score(predicted_labels, validation_labels.values,average='micro'))
    print (model, "accuracy score", accuracy_score(predicted_labels, validation_labels.values))
    print (model, "macro f1 score", calculate_macro_f1_score(predicted_labels, validation_labels.values))

svm time taken 1455.3647270202637
svm micro f1 score 0.3330369052912405
svm accuracy score 0.3330369052912405
svm macro f1 score 21.756816451122297
dtree time taken 2.8952572345733643
dtree micro f1 score 0.3750555802578924
dtree accuracy score 0.3750555802578924
dtree macro f1 score 32.05676590219869




nn time taken 253.57829523086548
nn micro f1 score 0.40929301911960875
nn accuracy score 0.4092930191196087
nn macro f1 score 37.295981169032785


In [165]:
models = ['nn']
predicted_labels_dict = {}

for model in models:
    start_time = time.time()
    predicted_labels = get_predictions(model, train_features_matrix, train_label_matrix, validation_features)
    predicted_labels_dict[model]=predicted_labels
    end_time = time.time() - start_time
    print (model, "time taken", end_time)
    print (model, "micro f1 score", f1_score(predicted_labels, validation_labels.values,average='micro'))
    print (model, "accuracy score", accuracy_score(predicted_labels, validation_labels.values))
    print (model, "macro f1 score", calculate_macro_f1_score(predicted_labels, validation_labels.values))



nn time taken 5728.024206161499
nn micro f1 score 0.4255224544241885
nn accuracy score 0.4255224544241885
nn macro f1 score 38.12221005040227


In [163]:
train_features_matrix.shape

(10495, 5967)