In [1]:
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
import pandas as pd
df = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [10]:
df.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [78]:
y = df['category']
X = df.drop(columns = ['category','id'])

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

## Text Feature Extraction & Classification Pieplines (Learn)


In [13]:
# Create Pipeline Components

vect = TfidfVectorizer(stop_words='english')
rfc = RandomForestClassifier()

In [14]:
# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect), 
                 # Classifier
                 ('clf', rfc)
                ])

In [15]:
parameters = {
    'vect__max_df': ( 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,1000),
    'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X.description, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   42.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (500, 1000), 'clf__n_estimators': (5, 10), 'clf__max_depth': (15, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
grid_search.best_score_

0.8534416086620263

In [21]:
pred = grid_search.predict(test['description'])
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [22]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [24]:
submission.to_csv('./submission1.csv', index=False)

## Next Section: Latent Semantic Indexing 

In [31]:
from sklearn.decomposition import TruncatedSVD

#singular value decomposition component
svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=20)

In [42]:
params = { 
    'lsi__svd__n_components': [10,100,250],
    'lsi__vect__max_df':[.9, .95, 1.0]
}

In [43]:
# LSI
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

In [44]:
# Fit
grid_search = GridSearchCV(pipe,params, cv=5, n_jobs=4, verbose=1)
grid_search.fit(X.description, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'lsi__svd__n_components': [10, 100, 250], 'lsi__vect__max_df': [0.9, 0.95, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [45]:
grid_search.best_score_

0.8901778808971385

In [46]:
# Predictions on test sample
pred = grid_search.predict(test['description'])

In [74]:
submission2 = pd.DataFrame({'id': test['id'], 'category':pred})
submission2['category'] = submission2['category'].astype('int64')

In [75]:
submission2.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [76]:
submission2.to_csv('./submission2.csv', index=False)

## Next Section: Word Embeddings with Spacy (Learn)


In [80]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [81]:
doc = nlp("Two bananas in pyjamas")

In [82]:
bananas_vector = doc.vector
print(len(bananas_vector))

300


In [83]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [84]:
X = get_word_vectors(X.description)
X_test = get_word_vectors(test.description)

len(X) == len(X_train.description)

False

In [85]:
rfc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [89]:
rfc.score(X, y)

0.9911059551430781

In [90]:
pred = rfc.predict(X_test)

In [91]:
submission3 = pd.DataFrame({'id': test['id'], 'category':pred})
submission3['category'] = submission3['category'].astype('int64')

In [92]:
submission3.head()

Unnamed: 0,id,category
0,955,2
1,3532,1
2,1390,1
3,1024,1
4,1902,1


In [93]:
submission3.to_csv('./submission3.csv', index=False)

In [94]:
submission3.shape

(288, 2)