In [1]:
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv('./data/train.csv')
# `dropna` returns a new frame; without the assignment the rows with missing
# values would silently stay in `df`.
df = df.dropna(axis=0)
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [12]:
# Use the sentence id as the row index (rebinding instead of mutating in place).
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


## Feature engineering

In [16]:
import re
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain set of English stop words; later cells use it for membership tests.
stopwords = set(stopwords.words('english'))

In [28]:
def add_text_features(df, stop_words=None):
    """
    Add a cleaned text column and simple length/count features to `df`.

    The frame is modified in place and is also returned, so both
    `add_text_features(df)` and `df = add_text_features(df)` work.

    Columns added:
    `clean_text`: `text` lowercased, punctuation removed, runs of whitespace
        collapsed to one space and leading/trailing whitespace stripped
    `len_chars`: number of characters in `clean_text`
    `len_words`: number of whitespace-separated tokens in `clean_text`
    `len_non_stopwords`: number of tokens that are not in `stop_words`
    `len_avg_word`: mean length of the non-stop-word tokens (0.0 if there are none)
    `len_commas`: number of commas in the original `text`

    Parameters
    ----------
    df : DataFrame with a string column `text`.
    stop_words : optional collection of lowercase words to ignore. Defaults to
        the notebook-level `stopwords` set.
    """
    if stop_words is None:
        stop_words = stopwords

    def clean(raw: str) -> str:
        no_punct = re.sub(r'[^\w\s]', '', raw.lower())
        # Removing punctuation can leave doubled spaces ("a - b" -> "a  b");
        # collapse them so they do not inflate the character and word counts.
        return re.sub(r'\s+', ' ', no_punct).strip()

    def content_words(cleaned: str) -> list:
        # str.split() with no argument never yields empty tokens.
        return [t for t in cleaned.split() if t not in stop_words]

    def avg_word_length(cleaned: str) -> float:
        lengths = [len(t) for t in content_words(cleaned)]
        return float(np.mean(lengths)) if lengths else 0.0

    df['clean_text'] = df['text'].map(clean)
    df['len_chars'] = df['clean_text'].map(len)
    df['len_words'] = df['clean_text'].map(lambda x: len(x.split()))
    df['len_non_stopwords'] = df['clean_text'].map(lambda x: len(content_words(x)))
    df['len_avg_word'] = df['clean_text'].map(avg_word_length)
    df['len_commas'] = df['text'].map(lambda x: x.count(','))

    return df

In [32]:
# add_text_features writes the new columns onto the frame it is given and
# returns that same frame.
df = add_text_features(df)
df.head()

Unnamed: 0_level_0,text,author,clean_text,len_chars,len_words,len_non_stopwords,len_avg_word,len_commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2


In [34]:
from sklearn.model_selection import train_test_split

In [36]:
# Every column except the raw text and the label is a model input;
# the numeric subset additionally leaves out the cleaned text column.
features = [col for col in df.columns.values if col not in ('id', 'text', 'author')]
features_numeric = [col for col in features if col != 'clean_text']
target = 'author'

In [37]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.3,
    random_state=42,
)

In [38]:
X_train.head()

Unnamed: 0_level_0,clean_text,len_chars,len_words,len_non_stopwords,len_avg_word,len_commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id25464,i said in one of my letters my dear margaret t...,232,49,19,5.263158,4
id21332,he explained to me although i have forgotten t...,811,148,65,6.6,15
id10528,i listened in extremity of horror,33,6,3,7.666667,0
id09996,i came upon them suddenly in the leafy june an...,169,32,17,5.764706,3
id23845,as for the twentieth time or more arthur munro...,187,33,17,6.470588,1


In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that pulls a single entry out of its input by key.

    `transform` returns `X[key]`, i.e. the input indexed with the bare key.
    """
    def __init__(self, key):
        # Key (column name) looked up in `transform`.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X` indexed by the configured key."""
        column = self.key
        return X[column]

class NumberSelector(TextSelector):
    """
    Same as TextSelector, but indexes the input with a one-element list
    (`X[[key]]`) instead of the bare key.
    """
    def transform(self, X):
        """Return `X` indexed by `[self.key]`."""
        columns = [self.key]
        return X[columns]

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: take the cleaned text column and turn it into TF-IDF weights.
text = Pipeline(steps=[
    ('selector', TextSelector(key='clean_text')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
])
text.fit_transform(X_train)

<13705x21869 sparse matrix of type '<class 'numpy.float64'>'
	with 154700 stored elements in Compressed Sparse Row format>

In [51]:
from sklearn.preprocessing import StandardScaler

# Numeric branch for the character count: select the column, then standardise it.
len_chars = Pipeline([
    ('selector', NumberSelector(key='len_chars')),
    ('standard', StandardScaler()),
])
# The pipeline defined above is `len_chars`; the previous call used the
# undefined name `length`, which raises NameError on a fresh kernel.
len_chars.fit_transform(X_train)

array([[ 0.78818875],
       [ 6.06778497],
       [-1.0263875 ],
       ...,
       [-0.45192366],
       [-0.14189556],
       [-0.38809435]])

In [52]:
def make_scaled_numeric_pipeline(key):
    """
    Build the pipeline shared by the numeric features below:
    select the column `key` with NumberSelector, then apply StandardScaler.
    """
    return Pipeline([
        ('selector', NumberSelector(key)),
        ('scaler', StandardScaler()),
    ])


# One pipeline per numeric feature; only the selected column differs.
len_words = make_scaled_numeric_pipeline('len_words')
len_non_stopwords = make_scaled_numeric_pipeline('len_non_stopwords')
len_avg_word = make_scaled_numeric_pipeline('len_avg_word')
len_commas = make_scaled_numeric_pipeline('len_commas')

To make a pipeline from all of our pipelines, we do the same thing, but now we use a FeatureUnion to join the feature processing pipelines.

The feature union itself is not a pipeline, it's just a union, so you need one more step to make it usable: pass it to a Pipeline, using the same structure as before — a list of tuples in the simple (name, object) format. As you can see, the more complex things get, the more of a pipeline-ception we have going on!

In [53]:
from sklearn.pipeline import FeatureUnion

In [54]:
# Concatenate the outputs of the text branch and of every numeric branch.
feats = FeatureUnion(transformer_list=[
    ('text', text),
    ('len_chars', len_chars),
    ('len_words', len_words),
    ('len_non_stopwords', len_non_stopwords),
    ('len_avg_word', len_avg_word),
    ('len_commas', len_commas),
])

In [55]:
# Bound to its own name: reusing `features` here would overwrite the list of
# column names that the train/test split cell indexes `df` with, so re-running
# that cell afterwards would fail.
features_pipeline = Pipeline([
    ('features', feats)
])
features_pipeline.fit_transform(X_train)

<13705x21874 sparse matrix of type '<class 'numpy.float64'>'
	with 223225 stored elements in Compressed Sparse Row format>

In [56]:
from sklearn.ensemble import RandomForestClassifier

# Baseline model: the combined features feeding a random forest with a fixed seed.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# `fit` returns the pipeline itself, so prediction can be chained onto it.
preds = pipeline.fit(X_train, y_train).predict(X_test)

# Share of held-out rows whose predicted author matches the true one.
np.mean(preds == y_test)

0.6821586653047327

In [57]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text', 'features__len_chars', 'features__len_words', 'features__len_non_stopwords', 'features__len_avg_word', 'features__len_commas', 'features__text__memory', 'features__text__steps', 'features__text__verbose', 'features__text__selector', 'features__text__tfidf', 'features__text__selector__key', 'features__text__tfidf__analyzer', 'features__text__tfidf__binary', 'features__text__tfidf__decode_error', 'features__text__tfidf__dtype', 'features__text__tfidf__encoding', 'features__text__tfidf__input', 'features__text__tfidf__lowercase', 'features__text__tfidf__max_df', 'features__text__tfidf__max_features', 'features__text__tfidf__min_df', 'features__text__tfidf__ngram_range', 'features__text__tfidf__norm', 'features__text__tfidf__preprocessor', 'features__text__tfidf__smooth_idf', 'features__text__tfidf__stop

In [58]:
from sklearn.model_selection import GridSearchCV

# Grid over nested step parameters, addressed as <step>__<substep>__<param>.
hyperparameters = {
    'features__text__tfidf__max_df': [.9, .95],
    'features__text__tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__max_depth': [50, 70],
    'classifier__min_samples_leaf': [1, 2],
}

# 5-fold cross-validated search over the whole pipeline.
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('text',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('selector',
                                                                                         TextSelector(key='clean_text')),
                                                                                        ('tfidf',
                                                                                         TfidfVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                 

In [59]:
clf.best_params_

{'classifier__max_depth': 70,
 'classifier__min_samples_leaf': 2,
 'features__text__tfidf__max_df': 0.9,
 'features__text__tfidf__ngram_range': (1, 1)}

In [60]:
# This only displays the `refit` setting; it does not trigger any fitting.
# NOTE(review): with refit=True GridSearchCV is documented to refit the best
# parameter combination on the full training data during `fit` - confirm
# against the scikit-learn docs.
clf.refit

True

In [61]:
# Hard labels and class probabilities on the held-out set.
preds, probs = clf.predict(X_test), clf.predict_proba(X_test)

# Held-out accuracy of the tuned model.
np.mean(preds == y_test)

0.6607082056520259

In [62]:
submission = pd.read_csv('./data/test.csv')
# Bind the featurised frame to the name that is used below. The previous code
# assigned it to `submissions` and only worked because add_text_features also
# mutates its argument.
submission = add_text_features(submission)

# One probability column per author, in the class order the fitted search exposes.
# Rows are given the submission's index so the concat below lines them up.
predictions = pd.DataFrame(
    data=clf.predict_proba(submission),
    columns=clf.classes_,
    index=submission.index,
)

result = pd.concat([submission[['id']], predictions], axis=1).set_index('id')

In [63]:
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.304545,0.26007,0.435385
id24541,0.578144,0.197817,0.22404
id00134,0.337125,0.449496,0.213379
id27757,0.531984,0.231957,0.236059
id04081,0.544827,0.187871,0.267301


In [64]:
# Write the per-author probabilities to disk; `id` is the frame's index and is
# written as the first column.
result.to_csv('./data/submission.csv')