In [172]:
import operator
from typing import Callable, List

import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score
import pandas as pd

from raw_dataset import RawDataset
from processed_dataset import ProcessedDataset
from split_dataset import SplitDataset
from nn_toolkit.vocab import Vocab, VocabBuilder

%matplotlib notebook
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
# Data pipeline: wrap the raw dataset, run processing, then build the split.
raw = RawDataset()
processed = ProcessedDataset(raw)
# NOTE(review): presumably process() must run before the split is built —
# confirm against processed_dataset / split_dataset.
processed.process()
# Later cells read train_df, val_df and test_df from this object.
split_ds = SplitDataset(processed)

In [130]:
# Vocabulary built from the `tokens` column of the training split only.
vocab = VocabBuilder(
    max_size=20000,
    min_count=5,
).from_df(split_ds.train_df, 'tokens')

In [131]:
# TF-IDF over a fixed vocabulary. Both the tokenizer and the preprocessor are
# identity functions, so each document is passed through unchanged
# (presumably the `tokens` column already holds token sequences — confirm).
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    vocabulary=vocab.token_to_int,
)

In [132]:
# Fit the vectorizer on the training split only; the validation and test
# splits are transformed with the already-fitted vectorizer.
X_train = vectorizer.fit_transform(split_ds.train_df.tokens)
X_val = vectorizer.transform(split_ds.val_df.tokens)
# Previously read `processed_ds.test_df`, but `processed_ds` is never defined
# in this notebook (NameError on a fresh kernel). The test split is taken from
# `split_ds`, the same place sklearn_test_predictions reads it from.
X_test = vectorizer.transform(split_ds.test_df.tokens)

In [136]:
# Label arrays for the training and validation splits.
y_train = split_ds.train_df['target'].values
y_val = split_ds.val_df['target'].values

In [159]:
def plot_roc_curve(y_true, y_prob):
    """Plot the ROC curve for a set of scores and show the AUC in the title.

    Args:
        y_true: Ground-truth labels, passed to ``roc_curve`` / ``roc_auc_score``.
        y_prob: Predicted scores for the same samples.

    Draws on the current pyplot figure and calls ``plt.show()``; returns None.
    """
    score = roc_auc_score(y_true, y_prob)
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)

    # ROC curve itself.
    plt.plot(false_pos_rate, true_pos_rate)

    # Annotations: AUC (as a percentage) in the title, plus axis labels.
    plt.title(f'ROC AUC Score: {100 * score: 0.2f}')
    plt.xlabel('FPR')
    plt.ylabel('TPR')

    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    diagonal = np.linspace(0, 1)
    plt.plot(diagonal, diagonal, 'k--')
    plt.show()

In [178]:
def sklearn_test_predictions(split_ds: SplitDataset, transforms, clf):
    """Fit ``clf`` on train + validation data and predict the test split.

    Args:
        split_ds: Object exposing ``train_df``, ``val_df`` and ``test_df``.
            ``tokens`` and ``target`` are read from the first two; ``tokens``
            and ``id`` are read from ``test_df``.
        transforms: Feature extractor providing ``fit_transform`` and
            ``transform``. It is re-fit here on the concatenated
            train + validation tokens (the passed object itself is fitted).
        clf: Classifier providing ``fit`` and ``predict``; the passed object
            itself is fitted.

    Returns:
        A new DataFrame with columns ``id`` and ``target`` (the predicted
        labels), indexed like ``split_ds.test_df``.
    """
    train_df = pd.concat([split_ds.train_df, split_ds.val_df])
    test_df = split_ds.test_df

    X_train = transforms.fit_transform(train_df.tokens)
    X_test = transforms.transform(test_df.tokens)
    y_train = train_df.target.values

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Build the result as a fresh frame instead of writing a `target` column
    # into split_ds.test_df: the caller's data stays untouched, so repeated
    # calls with different classifiers do not overwrite each other's state.
    return test_df[['id']].assign(target=y_pred)

In [180]:
# Logistic-regression submission: fit on train + val, predict test, write CSV.
submission_df = sklearn_test_predictions(split_ds, vectorizer, LogisticRegression())
submission_df.to_csv('../data/submissions/sklearn_lr.csv', index=False)

In [181]:
# Random-forest submission: fit on train + val, predict test, write CSV.
# The forest is seeded so that re-running this cell reproduces the same
# submission file; an unseeded forest gives different predictions per run.
submission_df = sklearn_test_predictions(
    split_ds,
    vectorizer,
    RandomForestClassifier(random_state=0)
)
submission_df.to_csv('../data/submissions/sklearn_rf.csv', index=False)