In [95]:
import pandas as pd

In [96]:
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')

In [97]:
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [98]:
train = train.dropna()

In [99]:
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [101]:
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [102]:
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [103]:
pred = pipe.predict(test['description'])

In [104]:
submission = pd.DataFrame({'id': test['id'], 'category': pred})
submission['category'] = submission['category'].astype('int64')

In [105]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [106]:
submission.to_csv('data/submission.csv', index=False)

In [107]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (20, 100, 400)
}

In [108]:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [109]:
gs.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [110]:
gs.best_score_

0.8974151857835219

In [111]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [112]:
embeddings = [nlp(doc).vector for doc in train['description']]

In [113]:
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [114]:
rfc.fit(embeddings, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [115]:
rfc.score(embeddings, train['category'])

0.9899030694668821

In [117]:
test_embeddings = [nlp(doc).vector for doc in test['description']]

pred = rfc.predict(test_embeddings)

submission = pd.DataFrame({'id': test['id'], 'category': pred})
submission['category'] = submission['category'].astype('int64')

submission.to_csv('data/submission.csv', index=False)

In [42]:
doc = train['description'][10]

In [43]:
doc

"The complete package: uncut, unfiltered, full-flavored, richly textured (almost chewy), and very complex. Notes of toffee-coated nuts, vanilla fudge, polished leather, cedar-tinged tobacco, barrel char, cocoa powder, and a hint of fig, wrapped up with a firm oak grip on the finish. Worth every penny of the premium price being charged for this commemorative release. Editor's Choice."

In [44]:
doc = nlp(doc)

In [45]:
for chunk in doc.noun_chunks:
    if (chunk.is_stopword == False) and (chunk.is_punct == False):
    print(chunk.lemma_)

the complete package
note
toffee - coat nut
vanilla fudge
polished leather
cedar - tinge tobacco
barrel char
cocoa powder
a hint
fig
a firm oak grip
the finish
the premium price
this commemorative release
editor 's choice


In [86]:
def tokenize(doc):
    
    d = nlp(doc)
    tokens = []
    
    for token in d:
        if (token.is_stop == False) and (token.pos_ == 'ADJ'):
            tokens.append(token.lemma_)
    
    return tokens

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer=tokenize, max_df=.9, min_df=.1)

In [88]:
vect.fit(train['description'])

CountVectorizer(analyzer=<function tokenize at 0x1a43a10e18>, binary=False,
                decode_error='strict', dtype=<class 'numpy.int64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=0.9,
                max_features=None, min_df=0.1, ngram_range=(1, 1),
                preprocessor=None, stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)

In [89]:
vect.get_feature_names()

['creamy',
 'dark',
 'dry',
 'fresh',
 'good',
 'light',
 'long',
 'new',
 'old',
 'rich',
 'ripe',
 'soft',
 'spicy',
 'sweet']

In [90]:
len(vect.get_feature_names())

14

In [91]:
X = vect.transform(train['description'])

In [93]:
rfc.fit(X, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [94]:
rfc.score(X, train['category'])

0.7322294022617124