In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
from tqdm import tqdm

from spacy.en.language_data import STOP_WORDS
nlp = spacy.load('en')

from sklearn.feature_extraction.text import CountVectorizer,  TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import xgboost as xgb

from spooky import *



In [3]:
# Root folder for the competition data; the two working sub-folders are
# created beneath it if they do not exist yet.
PATH = 'data/spooky'

for subdir in ('models', 'tmp'):
    os.makedirs(f'{PATH}/{subdir}', exist_ok=True)

In [4]:
# read the raw training and test CSVs that live under PATH
train_df, test_df = (pd.read_csv(f'{PATH}/{name}.csv') for name in ('train', 'test'))

print(f'Training size: {len(train_df)} | Test size: {len(test_df)}')
train_df.head()

Training size: 19579 | Test size: 8392


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


## Feature Engineering

Add probabilities and predictions learned from sentiment analysis with a pre-trained language model

In [None]:
# FIXME(review): this statement is truncated -- `pd.re` is not a public pandas
# API, so the cell cannot run as written. Presumably a `pd.read_csv(...)` of the
# language-model results was intended; confirm the file path before completing it.
train_lm_results_df = pd.re

Add new columns that represent the text fragments with stopwords removed, lemmatized, and only non-stopwords lemmatized

For each document, add word count, % unique words, % stop words, % punctuation, % of nouns, % of adjectives, % of proper names, % of numbers, % of symbols

In [None]:
def _ratio(count, total):
    """Return ``count / total`` as a float, or 0.0 when ``total`` is zero."""
    return count / total if total else 0.0


def _tally(values):
    """Count how many times each value occurs and return the counts as a dict."""
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1
    return counts


# (output column, spaCy entity label) pairs, listed in output-column order.
_ENTITY_PCT_COLS = (
    ('named_entity_person_pct', 'PERSON'),
    ('named_entity_norp_pct', 'NORP'),
    ('named_entity_facility_pct', 'FACILITY'),
    ('named_entity_org_pct', 'ORG'),
    ('named_entity_gpe_pct', 'GPE'),
    ('named_entity_non_gpe_loc_pct', 'LOC'),
    ('named_entity_product_pct', 'PRODUCT'),
    ('named_entity_event_pct', 'EVENT'),
    ('named_entity_woa_pct', 'WORK_OF_ART'),
    ('named_entity_lang_pct', 'LANGUAGE'),
    ('named_entity_date_pct', 'DATE'),
    ('named_entity_time_pct', 'TIME'),
    ('named_entity_money_pct', 'MONEY'),
    ('named_entity_quantity_pct', 'QUANTITY'),
)


def _text_features(text):
    """Compute the engineered feature columns for a single text fragment.

    The text is parsed once with the notebook-level ``nlp`` pipeline; every
    ratio is guarded so that a fragment with no words (or no tokens at all)
    yields 0.0 instead of raising ZeroDivisionError.
    """
    tokens = nlp(text)
    ents = tokens.ents
    words = [t for t in tokens if not t.is_punct]
    n_tokens, n_words = len(tokens), len(words)

    # One pass over the tokens / entities instead of one pass per column.
    pos_counts = _tally(t.pos_ for t in tokens)
    ent_counts = _tally(e.label_ for e in ents)

    cols = OrderedDict()

    # text variants: stop words removed, lemmatized, and both
    cols['cleaned_text'] = ' '.join(t.text for t in tokens if not t.is_stop)
    cols['lemmatized_text'] = ' '.join(t.lemma_ for t in tokens)
    cols['cleaned_lemmatized_text'] = ' '.join(t.lemma_ for t in tokens if not t.is_stop)

    # character and word counts
    cols['char_count'] = len(text)
    cols['word_count'] = n_words

    # Token-type ratios. All are relative to the number of words except
    # punctuation_pct, which is relative to the number of tokens.
    cols['u_word_pct'] = _ratio(len({w.lemma_ for w in words}), n_words)
    cols['stopwords_pct'] = _ratio(sum(1 for w in words if w.is_stop), n_words)
    cols['punctuation_pct'] = _ratio(n_tokens - n_words, n_tokens)
    cols['symbol_pct'] = _ratio(pos_counts.get('SYM', 0), n_words)
    cols['number_pct'] = _ratio(pos_counts.get('NUM', 0), n_words)
    cols['alpha_pct'] = _ratio(sum(1 for t in tokens if t.is_alpha), n_words)

    cols['noun_pct'] = _ratio(pos_counts.get('NOUN', 0), n_words)
    cols['verb_pct'] = _ratio(pos_counts.get('VERB', 0), n_words)
    cols['adj_pct'] = _ratio(pos_counts.get('ADJ', 0), n_words)
    cols['proper_name_pct'] = _ratio(pos_counts.get('PROPN', 0), n_words)

    # named-entity ratios (overall, then per entity label)
    cols['named_entity_pct'] = _ratio(len(ents), n_words)
    for col_name, label in _ENTITY_PCT_COLS:
        cols[col_name] = _ratio(ent_counts.get(label, 0), n_words)

    return cols


def add_cols(df):
    """Build the engineered text-feature columns for every row of ``df``.

    :param df: DataFrame with a ``text`` column of strings.
    :return: DataFrame with one row per row of ``df`` and the same index (so it
        can be concatenated column-wise with ``df``), holding the cleaned /
        lemmatized text variants plus count and ratio features. An empty ``df``
        yields an empty, column-less frame.
    """
    rows = [_text_features(text) for text in df['text']]
    if not rows:
        return pd.DataFrame(index=df.index)
    return pd.DataFrame(rows, columns=list(rows[0].keys()), index=df.index)

In [None]:
# Build the engineered features, then attach them column-wise. Feature columns
# left over from an earlier run of this cell are dropped first, so re-running
# the cell replaces them instead of creating duplicate columns.
feature_df = add_cols(train_df)
train_df = pd.concat([train_df.drop(columns=feature_df.columns, errors='ignore'), feature_df], axis=1)

In [None]:
train_df.head()

In [None]:
train_df.iloc[0].text

In [None]:
# train_df[train_df.named_entity_person_pct > 0.0]

In [None]:
# train_df['cleaned_text'] = train_df.text.apply(
#     lambda txt: ' '.join([ word.text for word in nlp(txt) if (not word.is_stop) ]))

# train_df['lemmatized_text'] = train_df.text.apply(
#     lambda txt: ' '.join([ word.lemma_ for word in nlp(txt) ]))

# train_df['cleaned_lemmatized_text'] = train_df.cleaned_text.apply(
#     lambda txt: ' '.join([ word.lemma_ for word in nlp(txt) ]))

In [None]:
# # counts
# def char_count(row):
#     return len(row['text'])

# def word_count(row):
#     tokens = [ token.text for token in nlp(row['text']) if (not token.is_punct) ]
#     return len(tokens)

# def u_word_pct(row):
#     tokens = [ token.lemma_ for token in nlp(row['text']) if (not token.is_punct) ]
#     return len(list(set(tokens))) / row['word_count']

# # ratio of token types
# def stopwords_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.is_stop) ]
#     return len(tokens) / row['word_count']

# def punctuation_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.is_punct) ]
#     return len(tokens) / row['word_count']

# def noun_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'NOUN') ]
#     return len(tokens) / row['word_count']

# def verb_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'VERB') ]
#     return len(tokens) / row['word_count']

# def adj_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'ADJ') ]
#     return len(tokens) / row['word_count']

# def proper_name_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'PROPN') ]
#     return len(tokens) / row['word_count']

# def symbol_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'SYM') ]
#     return len(tokens) / row['word_count']

# def number_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.pos_ == 'NUM') ]
#     return len(tokens) / row['word_count']

# def alpha_pct(row):
#     tokens = [ token for token in nlp(row['text']) if (token.is_alpha) ]
#     return len(tokens) / row['word_count']

# # ratio of named entity types and specific named entities
# def named_entity_pct(row):
#     return len(nlp(row['text']).ents) / row['word_count']

# def named_entity_person_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'PERSON')]
#     return len(ents) / row['word_count']

# def named_entity_norp_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'NORP')]
#     return len(ents) / row['word_count']

# def named_entity_facility_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'FACILITY')]
#     return len(ents) / row['word_count']

# def named_entity_org_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'ORG')]
#     return len(ents) / row['word_count']

# def named_entity_gpe_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'GPE')]
#     return len(ents) / row['word_count']

# def named_entity_non_gpe_loc_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'LOC')]
#     return len(ents) / row['word_count']

# def named_entity_product_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'PRODUCT')]
#     return len(ents) / row['word_count']

# def named_entity_event_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'EVENT')]
#     return len(ents) / row['word_count']

# def named_entity_woa_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'WORK_OF_ART')]
#     return len(ents) / row['word_count']

# def named_entity_lang_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'LANGUAGE')]
#     return len(ents) / row['word_count']

# def named_entity_date_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'DATE')]
#     return len(ents) / row['word_count']

# def named_entity_time_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'TIME')]
#     return len(ents) / row['word_count']

# def named_entity_money_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'MONEY')]
#     return len(ents) / row['word_count']

# def named_entity_quantity_pct(row):
#     ents = [ ent for ent in nlp(row['text']).ents if (ent.label_ == 'QUANTITY')]
#     return len(ents) / row['word_count']

In [None]:
# # char and word counts
# train_df['char_count'] = train_df.apply(lambda r: char_count(r), axis=1)
# train_df['word_count'] = train_df.apply(lambda r: word_count(r), axis=1)

# # unqique, stop, and punctuation percentages
# train_df['u_word_pct'] = train_df.apply(lambda r: u_word_pct(r), axis=1)
# train_df['stopwords_pct'] = train_df.apply(lambda r: stopwords_pct(r), axis=1)
# train_df['punctuation_pct'] = train_df.apply(lambda r: punctuation_pct(r), axis=1)

# # noun, verb, adj, proper name percentages
# train_df['noun_pct'] = train_df.apply(lambda r: noun_pct(r), axis=1)
# train_df['verb_pct'] = train_df.apply(lambda r: verb_pct(r), axis=1)
# train_df['adj_pct'] = train_df.apply(lambda r: adj_pct(r), axis=1)
# train_df['proper_name_pct'] = train_df.apply(lambda r: proper_name_pct(r), axis=1)

# # alpha, number, symbol percentages
# train_df['alpha_pct'] = train_df.apply(lambda r: alpha_pct(r), axis=1)
# train_df['number_pct'] = train_df.apply(lambda r: number_pct(r), axis=1)
# train_df['symbol_pct'] = train_df.apply(lambda r: symbol_pct(r), axis=1)

# # named entity percentages
# train_df['named_entity_pct'] = train_df.apply(lambda r: named_entity_pct(r), axis=1)
# train_df['named_entity_person_pct'] = train_df.apply(lambda r: named_entity_person_pct(r), axis=1)
# train_df['named_entity_norp_pct'] = train_df.apply(lambda r: named_entity_norp_pct(r), axis=1)
# train_df['named_entity_facility_pct'] = train_df.apply(lambda r: named_entity_facility_pct(r), axis=1)
# train_df['named_entity_org_pct'] = train_df.apply(lambda r: named_entity_org_pct(r), axis=1)
# train_df['named_entity_gpe_pct'] = train_df.apply(lambda r: named_entity_gpe_pct(r), axis=1)
# train_df['named_entity_non_gpe_loc_pct'] = train_df.apply(lambda r: named_entity_non_gpe_loc_pct(r), axis=1)
# train_df['named_entity_product_pct'] = train_df.apply(lambda r: named_entity_product_pct(r), axis=1)
# train_df['named_entity_event_pct'] = train_df.apply(lambda r: named_entity_event_pct(r), axis=1)
# train_df['named_entity_woa_pct'] = train_df.apply(lambda r: named_entity_woa_pct(r), axis=1)
# train_df['named_entity_lang_pct'] = train_df.apply(lambda r: named_entity_lang_pct(r), axis=1)
# train_df['named_entity_date_pct'] = train_df.apply(lambda r: named_entity_date_pct(r), axis=1)
# train_df['named_entity_time_pct'] = train_df.apply(lambda r: named_entity_time_pct(r), axis=1)
# train_df['named_entity_money_pct'] = train_df.apply(lambda r: named_entity_money_pct(r), axis=1)
# train_df['named_entity_quantity_pct'] = train_df.apply(lambda r: named_entity_quantity_pct(r), axis=1)

## Define our multi-class logloss function

In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like with the actual target classes, either as a 1-D
        sequence of integer class ids or as an already one-hot encoded matrix
    :param predicted: Array-like matrix with class predictions, one probability
        per class (rows = samples, columns = classes)
    :param eps: Probabilities are clipped to [eps, 1 - eps] so log() stays finite
    :return: Mean negative log-likelihood over the rows
    """
    # Accept lists / Series as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * log_likelihood

## Prepare data

In [None]:
# encode the author labels as integer class ids
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train_df['author'].values)

In [None]:
# hold out 10% of the cleaned + lemmatized texts for validation (stratified by author)
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_df.cleaned_lemmatized_text.values,
    y,
    stratify=y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

In [None]:
X_train.shape, X_valid.shape

## Basic Models

### CountVectorizer

**CountVectorizer** class to count how many times each term shows up in each document

Parameters:
- **min_df** (min. document frequency): The minimum number of documents a term must be present in (an integer, or a float representing the proportion of documents)

- **max_df** (max. document frequency): The maximum number of documents a term can be found in (int or float, see above)

Words that are too infrequent or too frequent lack predictive power.

See: http://www.ultravioletanalytics.com/2016/11/18/tf-idf-basics-with-pandas-scikit-learn/


In [None]:
# Count word uni-, bi- and tri-grams. scikit-learn documents `stop_words` as
# 'english', a list, or None, so spaCy's STOP_WORDS set is passed as a list.
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3),
                      stop_words=sorted(STOP_WORDS))

# The vocabulary is learned from the training and validation texts together.
ctv.fit(list(X_train) + list(X_valid))
X_train_ctv = ctv.transform(X_train)
X_valid_ctv = ctv.transform(X_valid)

len(ctv.vocabulary_)

In [None]:
from itertools import islice
list(islice(ctv.vocabulary_.items(), 5))

Transform the document into a **“bag of words”** representation which essentially is just a separate column for each term containing the count within each document. 

The **sparsity** of this representation tells us how few of the entries in the count matrix are nonzero. (The cell below prints the percentage of entries that are nonzero, so a lower figure means a sparser matrix.) The more sparse the data is the more challenging it will be to model, but that’s a discussion for another day.

In [None]:
# vectorize the full training corpus with the fitted CountVectorizer
ctv_counts = ctv.transform(train_df.cleaned_lemmatized_text)
n_docs, n_terms = ctv_counts.shape

print('sparse matrix shape:', ctv_counts.shape)
print('nonzero count:', ctv_counts.nnz)
# NOTE: the figure printed here is the share of non-zero cells (lower = sparser)
nonzero_pct = 100.0 * ctv_counts.nnz / (n_docs * n_terms)
print('sparsity: %.2f%%' % nonzero_pct)

In [None]:
# the ten terms with the highest total count across all documents
term_totals = np.asarray(ctv_counts.sum(axis=0)).ravel()
occ = term_totals.tolist()
counts_df = pd.DataFrame({'term': ctv.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(10)

Now that we’ve got term counts for each document we can use the TfidfTransformer to calculate the weights for each term in each document.

In [None]:
# learn the IDF weights from the raw counts, then re-weight those same counts
transformer = TfidfTransformer().fit(ctv_counts)
transformed_weights = transformer.transform(ctv_counts)
transformed_weights

In [None]:
# the ten terms with the highest mean tf-idf weight over all documents
mean_weights = np.asarray(transformed_weights.mean(axis=0)).ravel()
weights = mean_weights.tolist()
weights_df = pd.DataFrame({'term': ctv.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(10)

Fit Logistic Regression on word counts

In [None]:
# logistic regression on the raw n-gram counts
clf = LogisticRegression(C=1.0).fit(X_train_ctv, y_train)

# class probabilities for the validation split, scored with multiclass_logloss
preds = clf.predict_proba(X_valid_ctv)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

Fit Naive Bayes on word counts

In [None]:
# multinomial naive Bayes on the raw n-gram counts
clf = MultinomialNB().fit(X_train_ctv, y_train)

# class probabilities for the validation split, scored with multiclass_logloss
preds = clf.predict_proba(X_valid_ctv)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

### TfidfVectorizer

**TF-IDF (Term Frequency - Inverse Document Frequency)**: A technique for determining what each document, in a set of documents, is about.

**TF** (term frequency) = The proportion of occurrences of a specific term to the total number of terms in a document

**IDF** (inverse document frequency) = The inverse of the proportion of documents that contain a specific word/phrase

*The general idea is that if a specific phrase appears a lot of times in a given document, but it doesn’t appear in many other documents, then we have a good idea that the phrase is important in distinguishing that document from all the others.*

For each term, we will have a separate feature (e.g., if there are 10k terms we will have 10k features), the value will be the tf-idf weight of that term in the document.

Note: You want to use stopwords, stemming/lemmatization *first* to narrow down your corpus to the *important* words.

In [None]:
# Always start with these features. They work (almost) every time!
# The boolean flags are passed as real booleans and the spaCy stop-word set as a
# list, which is what scikit-learn documents for these parameters.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words=sorted(STOP_WORDS))

# Fitting TF-IDF on the training AND validation texts: the vocabulary and IDF
# weights see the validation documents (but never their labels).
tfv.fit(list(X_train) + list(X_valid))
X_train_tfv = tfv.transform(X_train)
X_valid_tfv = tfv.transform(X_valid)

X_train_tfv.shape, X_valid_tfv.shape

In [None]:
# the five terms with the highest mean tf-idf weight in the training split
mean_weights = np.asarray(X_train_tfv.mean(axis=0)).ravel()
weights = mean_weights.tolist()
weights_df = pd.DataFrame({'term': tfv.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(5)

Fit simple logistic regression on TFIDF

In [None]:
# logistic regression on the tf-idf features
clf = LogisticRegression(C=1.0).fit(X_train_tfv, y_train)

# class probabilities for the validation split, scored with multiclass_logloss
preds = clf.predict_proba(X_valid_tfv)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

Fit Naive Bayes on TF-IDF

In [None]:
# multinomial naive Bayes on the tf-idf features
clf = MultinomialNB().fit(X_train_tfv, y_train)

# class probabilities for the validation split, scored with multiclass_logloss
preds = clf.predict_proba(X_valid_tfv)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

### SVM

Since SVMs take a lot of time, we will reduce the number of features from the TF-IDF using Singular Value Decomposition before applying SVM.

Also, note that before applying SVMs, we must standardize the data.

In [None]:
# apply SVD, I chose 120 components (120-200 components are good enough for SVM model)
# TruncatedSVD's default solver is randomized, so the seed is pinned (same value
# as the train/validation split) to make the components reproducible across runs.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(X_train_tfv)
X_train_svd = svd.transform(X_train_tfv)
X_valid_svd = svd.transform(X_valid_tfv)

# Standardize the SVD output for the SVM. The scaled copies get their own names
# so the unscaled X_*_svd arrays stay available for the models further down.
scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scl = scl.transform(X_train_svd)
X_valid_svd_scl = scl.transform(X_valid_svd)

In [None]:
# Fitting a simple SVM
# probability=True is needed for predict_proba; the probability calibration
# shuffles the data internally, so random_state is pinned for reproducible scores.
clf = SVC(C=1.0, probability=True, random_state=42)
clf.fit(X_train_svd_scl, y_train)

preds = clf.predict_proba(X_valid_svd_scl)

print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

### XGBoost

In [None]:
# gradient-boosted trees on the raw n-gram counts (matrices converted to CSC)
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(X_train_ctv.tocsc(), y_train)

preds = clf.predict_proba(X_valid_ctv.tocsc())
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

In [None]:
# gradient-boosted trees on the tf-idf features (matrices converted to CSC)
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(X_train_tfv.tocsc(), y_train)

preds = clf.predict_proba(X_valid_tfv.tocsc())
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

In [None]:
# gradient-boosted trees on the (unscaled) SVD components
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(X_train_svd, y_train)

preds = clf.predict_proba(X_valid_svd)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

In [None]:
# same SVD features, but with XGBoost left at its default hyper-parameters
clf = xgb.XGBClassifier(nthread=10).fit(X_train_svd, y_train)

preds = clf.predict_proba(X_valid_svd)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

## Let's try using more of the engineered dataset

### CountVectorizer

In [None]:
# Keep only the numeric engineered features (ids, labels and raw/processed text are dropped).
cols_to_drop = ['id', 'author', 'text', 'cleaned_text', 'lemmatized_text', 'cleaned_lemmatized_text']
# `.values` is used instead of the deprecated DataFrame.as_matrix(); it returns the same ndarray.
train = train_df.drop(columns=cols_to_drop).values

# fit CountVectorizer on ENTIRE training dataset
# (spaCy's STOP_WORDS set is passed as a list, the form scikit-learn documents for `stop_words`)
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3),
                      stop_words=sorted(STOP_WORDS), min_df=0.0001)
ctv.fit(train_df.cleaned_lemmatized_text.values)
train_ctv = ctv.transform(train_df.cleaned_lemmatized_text.values)
print(train_ctv.shape)

In [None]:
# !!! NOTE: train_ctv is a sparse matrix, so it is densified with .toarray()
# before being stacked next to the engineered feature columns.
train = np.hstack([train, train_ctv.toarray()])
print(train.shape)

# stratified 90/10 train/validation split of the combined feature matrix
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train,
    y,
    stratify=y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# multinomial naive Bayes on engineered features + n-gram counts
clf = MultinomialNB().fit(X_train, y_train)

preds = clf.predict_proba(X_valid)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

In [None]:
# logistic regression on the split produced in the previous cell
clf = LogisticRegression(C=1.0).fit(X_train, y_train)

preds = clf.predict_proba(X_valid)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

### TF-IDF

In [None]:
# Keep only the numeric engineered features (ids, labels and raw/processed text are dropped).
cols_to_drop = ['id', 'author', 'text', 'cleaned_text', 'lemmatized_text', 'cleaned_lemmatized_text']
# `.values` is used instead of the deprecated DataFrame.as_matrix(); it returns the same ndarray.
train = train_df.drop(columns=cols_to_drop).values

# Boolean flags are passed as real booleans and the spaCy stop-word set as a
# list, which is what scikit-learn documents for these parameters.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words=sorted(STOP_WORDS))

tfv.fit(train_df.cleaned_lemmatized_text.values)
# Transform with the TF-IDF vectorizer fitted on the line above. (The
# CountVectorizer `ctv` was used here by mistake, which silently fed raw counts
# into the "TF-IDF" experiments.)
train_tfv = tfv.transform(train_df.cleaned_lemmatized_text.values)
print(train_tfv.shape)

In [None]:
# !!! NOTE: train_tfv is a sparse matrix, so it is densified with .toarray()
# before being stacked next to the engineered feature columns.
train = np.hstack([train, train_tfv.toarray()])
print(train.shape)

# stratified 90/10 train/validation split of the combined feature matrix
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train,
    y,
    stratify=y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# multinomial naive Bayes on engineered features + vectorized text
clf = MultinomialNB().fit(X_train, y_train)

preds = clf.predict_proba(X_valid)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

In [None]:
# logistic regression on the split produced in the previous cell
clf = LogisticRegression(C=1.0).fit(X_train, y_train)

preds = clf.predict_proba(X_valid)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))

### XGBoost

In [None]:
# default-parameter XGBoost on the most recent X_train / X_valid split
# (engineered features stacked with the vectorized text from the cells above)
clf = xgb.XGBClassifier(nthread=10).fit(X_train, y_train)

preds = clf.predict_proba(X_valid)
print("logloss: %0.3f " % multiclass_logloss(y_valid, preds))