In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC #(setting multi_class=”crammer_singer”)

import pandas as pd
import pickle
import time

# Inputs: the training set and the new (unlabelled) queries to categorise.
input_data_filepath = '../data/raw/trainSet.csv'
new_data_filepath = '../data/raw/candidateTestSet.csv'

# Pickled artifacts written after training: the classifier and its vectorizer.
model_filepath = '../models/final_model_object.pckl'
preprocessor_filepath = '../models/preprocessor_object.pckl'

# Destination CSV for the new queries together with their predicted category.
output_prediction_filepath = '../output/candidateTestSet_with_categories.csv'


def train_model(training_data_raw):
    """
    Fit the query-classification pipeline on labelled training data.

    The raw queries are vectorised with a `TfidfVectorizer` and a one-vs-rest
    `LinearSVC` is trained on the resulting matrix, using the discovered
    hyperparameters.

    Parameters
    ----------
    training_data_raw : DataFrame-like
        Must be indexable by 'query' (the raw text documents) and by
        'category' (the target labels).

    Returns
    -------
    tuple
        `(tfidf, clf_svc)`: the fitted vectorizer and the fitted classifier,
        in that order. The vectorizer must be reused to transform any data
        that is later passed to the classifier.
    """
    tfidf = TfidfVectorizer(
        # Must be the *string* 'english' to select scikit-learn's built-in
        # English stop-word list. A set such as {'english'} is interpreted as a
        # custom stop list that only removes the literal token "english".
        stop_words='english',
        strip_accents='ascii',
        ngram_range=(1, 1),  # unigrams only; bigrams/trigrams not used
        min_df=8,
        max_df=0.80,
        binary=True,  # count term occurrence in each query only once
    )
    clf_svc = LinearSVC(
        dual=False,
        tol=1e-3,
        multi_class='ovr',
        max_iter=2500,
    )

    X = tfidf.fit_transform(training_data_raw['query'])
    y = training_data_raw['category']

    clf_svc.fit(X, y)

    return tfidf, clf_svc


def predictions_from_model(model, preprocessor, new_data_raw):
    """
    Transform raw inputs with `preprocessor` and predict with `model`.

    `new_data_raw` is passed to `preprocessor.transform`, and the result is
    passed to `model.predict`; whatever `predict` returns is returned as-is.

    If any step raises, a one-line diagnostic naming the model is printed and
    the same exception is re-raised to the caller.
    """
    try:
        predict = model.predict
        features = preprocessor.transform(new_data_raw)
        predicted = predict(features)
    except Exception as failure:
        print('error computing model predictions using model', model)
        raise failure
    return predicted


In [2]:
# Load the training set (the file is read as header-less, two columns) and
# store the labels as a categorical column built from their string form.
df_train = pd.read_csv(
    input_data_filepath,
    header=None,
    names=['query', 'category'],
).assign(
    category=lambda frame: pd.Categorical(frame['category'].astype(str)),
)

In [3]:
# Fit the vectorizer and classifier on the training frame, then pickle each
# one to its configured location.
preprocessor, trained_model = train_model(df_train)

for artifact, destination in (
    (trained_model, model_filepath),
    (preprocessor, preprocessor_filepath),
):
    with open(destination, 'wb') as handle:
        pickle.dump(artifact, handle)

In [5]:
# Read the unlabelled queries (header-less, single column), predict a category
# for each one, and attach the predictions as a new column.
df_new = pd.read_csv(new_data_filepath, names=['query'], header=None)
temp = predictions_from_model(
    model=trained_model,
    preprocessor=preprocessor,
    new_data_raw=df_new['query'],
)
df_new['category_predicted'] = temp

In [6]:
# Sanity check: print the row count of df_new next to the type and length of
# the prediction array so a size mismatch is easy to spot.
print('predictions:', len(df_new), type(temp), len(temp))

predictions: 67424 <class 'numpy.ndarray'> 67424


In [7]:
# Write the queries and their predicted categories to CSV, omitting the row index.
df_new.to_csv(output_prediction_filepath, index=False)

In [8]:
# Final cell: print a completion marker.
print('done')

done
