In [2]:
import re
import os
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import sparse
import pickle

import seaborn as sns
import matplotlib.pyplot  as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import xgboost

import logging
# `import logging` alone does not guarantee that the `logging.config` submodule
# is loaded; import it explicitly so `fileConfig` is available on a fresh kernel.
import logging.config

# Configure logging from the config file and fetch the logger used by this notebook.
logging.config.fileConfig('logger.conf')
logger = logging.getLogger('simpleLogger')
# usage -> logger.info('some log comment')

import data_lake_helper as dl_helper

In [3]:
# Data lake handle used by every load in this notebook; the version is pinned here.
DATA_LAKE_VERSION = 'v2'
data_lake = dl_helper.DataLake(version=DATA_LAKE_VERSION)

In [70]:
def load_feature(feature):
    """Load ``<feature>.pkl`` from the data lake and store it in ``df[feature]``.

    Side effect: mutates the notebook-level ``df`` in place. Returns ``None``.

    feature -- name used both as the key in ``df`` and as the stem of the
               pickle file name.
    """
    pickle_name = feature + '.pkl'
    df[feature] = data_lake.load_obj(pickle_name)

In [71]:
# Load the cleaned dataset and split it on the value of its `path` column.
df = data_lake.load_obj('df-cleaned.pkl')

is_train_row = df['path'] == 'dataset/train_set/'
is_test_row = df['path'] == 'dataset/test_set/'

df_train_table = df[is_train_row]
# The test-set rows are used as the validation split further down.
df_test_table = df[is_test_row]

In [72]:
# Raw category labels for the training and validation splits.
train_y = df_train_table['category'].tolist()
valid_y = df_test_table['category'].tolist()

# Label-encode the target variable.
# The encoder is fitted on the training labels only and then reused for the
# validation labels, so both splits share a single class -> integer mapping.
# Re-fitting on the validation labels would silently produce a different
# mapping whenever the two splits do not contain exactly the same categories.
encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
# Raises ValueError if the validation split holds a category unseen in training.
valid_y = encoder.transform(valid_y)

In [73]:
# Pre-computed feature matrices, loaded from the data lake.
# Each feature family has a validation matrix (xvalid_*) and a training matrix (xtrain_*).

# --- Count vectors (plus the pickled `count_vect` object) ---
xvalid_count = data_lake.load_npz('xvalid_count.npz')
xtrain_count = data_lake.load_npz('xtrain_count.npz')
count_vect = data_lake.load_obj('count_vect.pkl')

# --- Word-level TF-IDF ---
xvalid_tfidf = data_lake.load_npz('xvalid_tfidf.npz')
xtrain_tfidf = data_lake.load_npz('xtrain_tfidf.npz')

# --- N-gram-level TF-IDF ---
xvalid_tfidf_ngram = data_lake.load_npz('xvalid_tfidf_ngram.npz')
xtrain_tfidf_ngram = data_lake.load_npz('xtrain_tfidf_ngram.npz')

# --- Character-level TF-IDF ---
xvalid_tfidf_ngram_chars = data_lake.load_npz('xvalid_tfidf_ngram_chars.npz')
xtrain_tfidf_ngram_chars = data_lake.load_npz('xtrain_tfidf_ngram_chars.npz')
    

In [74]:
# Sorted list of the distinct values in the `category` column of the full frame.
# train_model pairs these names, in order, with its per-class recall values.
category_values = df['category'].unique().tolist()
letter_types = sorted(category_values)

In [8]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_y, is_neural_net=False, class_names=None):
    """Fit ``classifier``, score it on the validation data and log a short report.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``
    feature_vector_train : training features passed to ``fit``
    label : training targets passed to ``fit``
    feature_vector_valid : validation features passed to ``predict``
    valid_y : true targets for the validation features
    is_neural_net : bool, default False
        When True, the raw predictions are reduced with ``argmax`` over the
        last axis before scoring.
    class_names : sequence, optional
        Names paired, in order, with the per-class recall values. Defaults to
        the notebook-level ``letter_types``.
        NOTE(review): assumes the names are ordered like the encoded labels
        (sorted) -- confirm against how the labels were encoded.

    Returns
    -------
    str
        The report (classifier repr, per-class recall, accuracy); the same
        text is also written to ``logger`` at INFO level.

    Raises
    ------
    ValueError
        If the number of class names differs from the number of per-class
        recall values.
    """
    # Fit on the training data, then predict the validation labels.
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(valid_y, predictions)

    # average=None yields one recall value per class instead of a single aggregate.
    items_recall = recall_score(valid_y, predictions, average=None)

    # Resolved only here so an explicit argument avoids touching the global at all.
    names = letter_types if class_names is None else class_names

    if len(names) != len(items_recall):
        raise ValueError('len(letter_types) != len(items_recall) ' + str(len(names)) + ' != ' + str(len(items_recall)))

    # One "(name, recall)" tuple repr per class, concatenated without separators.
    recall_info = "".join(str(item) for item in zip(names, items_recall))

    msg = "\n" + str(classifier) + "\n" + "items_recall " + recall_info + "\n" + "accuracy_score " + str(accuracy) + "\n"
    logger.info(msg)

    return msg

In [26]:
# Extreme Gradient Boosting (XGBoost) on three feature sets:
# count vectors, word-level TF-IDF and character-level TF-IDF.
xgb_runs = [
    ("Xgb, Count Vectors: ", xtrain_count, xvalid_count),
    ("Xgb, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("Xgb, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
]

for run_label, run_train_x, run_valid_x in xgb_runs:
    # A fresh classifier per run; matrices are converted to CSC right before training.
    # Note: train_model returns the full report string, not a bare accuracy number.
    accuracy = train_model(xgboost.XGBClassifier(), run_train_x.tocsc(), train_y, run_valid_x.tocsc(), valid_y)
    print(run_label, accuracy)

KeyboardInterrupt: 