In [1]:
import numpy as np
import json

import sklearn.feature_extraction.text as skl_txt
import sklearn.linear_model as skl_lm
import sklearn.neighbors as skl_nei
import sklearn.ensemble as skl_en
import sklearn.naive_bayes as skl_nb
import sklearn.svm as skl_svm

import pickle

In [2]:
from preprocessor import Preprocessor as Preprocessor

[nltk_data] Downloading package wordnet to /Users/Najj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
data_dir = "../data/data_cleaned/labelised/"

path_hutto = data_dir + "hutto.txt"
path_airline = data_dir + "airline.txt"
path_kaggle = data_dir + "kaggle.txt"
path_michigan = data_dir + "michigan.txt"
path_rtneg = data_dir + "rt-polarity-neg.txt"
path_rtpos = data_dir + "rt-polarity-pos.txt"

data_files = {
    "kaggle" : path_kaggle, 
             }

ngrams = (1,1)
binn = False
idf = True

pre = Preprocessor()
vectorizer = None

In [4]:
def parse_file(file_path):
    f = open(file_path, "r")
    file_data = json.loads(f.readlines()[0])
    f.close()
    return np.array(file_data)

def preprocess(text_data):
    for i, tweet in enumerate(text_data):
        t = pre.default_processing(tweet)
        text_data[i] = t

def vectorize_data(text_data, ngrams, binn, idf):
    if (binn == True):
        idf = False
    vectorizer = skl_txt.TfidfVectorizer(use_idf = idf, binary = binn, ngram_range = ngrams)
    print(vectorizer)
    vectorizer.fit(text_data) 
    return vectorizer.transform(text_data)

def parse_data(file_data):
    data, labels = file_data[:, 1], np.array(file_data[:, 0], dtype='int')
    preprocess(data)
    data = vectorize_data(data, ngrams, binn, idf)
    return partition_data(data, labels)

def partition_data(data, labels, ratio = 0.7):
    N = int(ratio * data.shape[0])
    idx = np.random.permutation(data.shape[0])
    train_data = data[idx[:N]]
    train_labels = labels[idx[:N]]
    test_data = data[idx[N:]]
    test_labels = labels[idx[N:]]
    return train_data, train_labels, test_data, test_labels

def get_data(files):
    data = {}
    for key in files:
        file_data = parse_file(files[key])
        tr_data, tr_labels, te_data, te_labels = parse_data(file_data)
        partitioned_data = {
            'train_data': tr_data,
            'train_labels' : tr_labels,
            'test_data': te_data,
            'test_labels' : te_labels
        }
        data[key] = partitioned_data
    return data

In [5]:
def train(clfs, data):
    for clf in clfs:
        train_data = data['train_data']
        train_labels = data['train_labels']
        clf[0].fit(train_data, train_labels)

In [6]:
clfs = np.array([
    [skl_svm.LinearSVC(random_state=0), 'SVM'],
#    [skl_lm.LogisticRegression(), 'Max Entropy'],
#    [skl_nb.MultinomialNB(alpha=.01), "Multinomial NB"],
#    [skl_nb.BernoulliNB(alpha=.01), "BernoulliNB"],
#    [skl_lm.RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"],
#    [skl_lm.Perceptron(max_iter=50), "Perceptron"], 
#    [skl_lm.PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"],
#    [skl_nei.KNeighborsClassifier(n_neighbors=10), "kNN"],
#    [skl_en.RandomForestClassifier(n_estimators=100), "Random Forest"]
])

all_data = get_data(data_files)

for dataset_label in all_data:
    train(clfs, all_data[dataset_label])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)


In [7]:
with open("../data/objects/vectorizers/tfidf", "wb") as file:
    file.write(pickle.dumps(vectorizer))

In [24]:
for clf in clfs:
    with open("../data/objects/trained_classifiers/" + clf[1], "wb") as file:
        file.write(pickle.dumps(clf[0]))

In [30]:
for clf in clfs:
    with open("../data/objects/trained_classifiers/" + clf[1], "rb") as file:
        classifier = pickle.loads(b''.join(file.readlines()))

In [31]:
print(classifier)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)


In [8]:
with open("../data/objects/vectorizers/tfidf", "rb") as file:
    tmp = pickle.loads(b''.join(file.readlines()))

In [9]:
print(tmp)

None
