Lambda School Data Science

*Unit 4, Sprint 1, Module 3*

---

### Load Competition Data

In [46]:
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/michhottinger/DS-Unit-4-Sprint-1-NLP/master/module3-document-classification/train.csv.zip')
test = pd.read_csv('https://raw.githubusercontent.com/michhottinger/DS-Unit-4-Sprint-1-NLP/master/module3-document-classification/test.csv')

In [47]:
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.model_selection import train_test_split
X_train, y_train = train_test_split(train, test_size=0.20, random_state=42)

### Define Pipeline Components

In [49]:
# Create Pipeline Components

vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
rfc = RandomForestClassifier()


# Define the Pipeline

pipe = Pipeline([('vect', vect), ('clf', rfc)])

### Define Your Search Space
You're looking for both the best hyperparameters of your vectorizer and your classification model. 

In [50]:
parameters = {
    'vect__max_df': ( 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,1000),
    'clf__n_estimators':(5, 10, 15),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train.description, train.category)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (500, 1000), 'clf__n_estimators': (5, 10, 15), 'clf__max_depth': (15, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [51]:
grid_search.best_score_

0.8565351894818252

### Make a Submission File
*Note:* In a typical Kaggle competition, you are only allowed two submissions a day, so you only submit if you feel you cannot achieve higher test accuracy. For this compeition the max daily submissions are capped at **20**. Submit for each demo and for your assignment. 

In [52]:
# Predictions on test sample
pred = grid_search.predict(test['description'])

In [53]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [54]:
# Make Sure the Category is an Integer
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [55]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model
submission.to_csv('submission7.csv', index=False)

## Challenge

You're trying to achienve 90% Accuracy on your model.

## Latent Semantic Indexing (Learn)
<a id="p2"></a>

In [56]:
# Import

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(algorithm='randomized',
                   n_iter=7)

In [57]:
params = { 
    'lsi__svd__n_components': [10,100,250],
    'lsi__vect__max_df':[.9, .95, 1.0],
    'clf__n_estimators':[5,10,20]
}


In [58]:
# LSI
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

In [59]:
print(pipe)

Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])


In [60]:
# Fit
grid_search = GridSearchCV(pipe,params, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train.description, train.category)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 10.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'lsi__svd__n_components': [10, 100, 250], 'lsi__vect__max_df': [0.9, 0.95, 1.0], 'clf__n_estimators': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [61]:
grid_search.best_score_

0.8986852281515855

In [62]:
# Predictions on test sample
pred = grid_search.predict(test['description'])

In [63]:
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [64]:
# Make Sure the Category is an Integer
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [65]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model
submission.to_csv('submission8.csv', index=False)

# Word Embeddings with Spacy (Learn)
<a id="p3"></a>

# Overview

In [66]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [67]:
train_str = train.to_string()
doc = nlp(train_str)

In [68]:
train_vector = doc.vector
print(len(train_vector))

300


In [69]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [70]:
X = get_word_vectors(train.description)

len(X) == len(train.description)

True

In [71]:
rfc.fit(X, train.category)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [82]:
rfc.score(X, train.category)

0.9911059551430781

In [83]:
pred = rfc.predict(get_word_vectors(test.description))

In [74]:
#submission = pd.DataFrame({'id': test['id'], 'category':pred})
#submission['category'] = submission['category'].astype('int64')

#Make Sure the Category is an Integer
#submission.head()

In [75]:
# Save your Submission File
#Best to Use an Integer or Timestamp for different versions of your model
#submission.to_csv('submission3.csv', index=False)