## Martin Dionne

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.decomposition import PCA
#from sklearn.decomposition import SparsePCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import GradientBoostingClassifier

## 1: Face Recognition, but not evil this time

Using the faces dataset in:

```
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
```

If you use the `faces.target` and `faces.target_names` attributes, you can build a facial recognition algorithm.

Use sklearn **gridsearch** (or an equivalent, like random search) to optimize the model for accuracy. Try both a SVM-based classifier and a logistic regression based classifier (with a feature pipeline of your choice) to get the best model. You should have at least 80% accuracy.

In [2]:
from sklearn.datasets import fetch_lfw_people
#faces = fetch_lfw_people(min_faces_per_person=60)
faces = fetch_lfw_people(min_faces_per_person=60, resize=0.4, color=False)

X = faces.data
y = faces.target
labels = faces.target_names

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
#X_train.shape, X_test.shape

In [4]:
# SVM
pipe_svm = Pipeline([
    ('pca', PCA()),
    ('svm', SVC())
])

param_grid_svm = [{
    'pca__whiten': [True, False],
    'pca__n_components': [75, 100, 175],
    'svm__kernel': ['poly', 'rbf', 'sigmoid'],
    'svm__C': [5e4, 1e5, 5e5],
    'svm__gamma': [0.001, 0.005, 0.01]
}]

grid_svm = GridSearchCV(pipe_svm, param_grid_svm, scoring='accuracy', cv=5)
grid_svm.fit(X_train, y_train)
grid_svm.best_params_

{'pca__n_components': 75,
 'pca__whiten': True,
 'svm__C': 100000.0,
 'svm__gamma': 0.01,
 'svm__kernel': 'rbf'}

In [5]:
#model = grid.best_estimator_
#y_pred = model.fit(X_train, y_train).predict(X_test)

In [6]:
# use the best params by default
grid_svm_pred = grid_svm.predict(X_test)

print(confusion_matrix(y_test, grid_svm_pred))
print(classification_report(y_test, grid_svm_pred))

[[ 5  1  1  0  0  0  0]
 [ 0 41  0  3  1  0  0]
 [ 0  1 10  4  0  0  1]
 [ 2  0  1 75  0  0  0]
 [ 0  0  1  3 10  0  0]
 [ 0  0  0  1  0 11  0]
 [ 0  0  0  0  0  0 20]]
              precision    recall  f1-score   support

           0       0.71      0.71      0.71         7
           1       0.95      0.91      0.93        45
           2       0.77      0.62      0.69        16
           3       0.87      0.96      0.91        78
           4       0.91      0.71      0.80        14
           5       1.00      0.92      0.96        12
           6       0.95      1.00      0.98        20

    accuracy                           0.90       192
   macro avg       0.88      0.83      0.85       192
weighted avg       0.90      0.90      0.89       192



In [7]:
# Logistic Regression
pipe_lgr = Pipeline([
    ('pca', PCA()),
    ('lgr', LogisticRegression())
])

param_grid_lgr = [{
    'pca__whiten': [True, False],
    'pca__n_components': [100, 150, 200, 250],
    'lgr__fit_intercept': [True, False],
    'lgr__C': [0.00005, 0.00001, 0.0005, 0.0001]
}]

grid_lgr = GridSearchCV(pipe_lgr, param_grid_lgr, scoring='accuracy', cv=5)
grid_lgr.fit(X_train, y_train)
grid_lgr.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'lgr__C': 1e-05,
 'lgr__fit_intercept': True,
 'pca__n_components': 200,
 'pca__whiten': False}

In [11]:
# use the best params by default
grid_lgr_pred = grid_lgr.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, grid_lgr_pred))
print(classification_report(y_test, grid_lgr_pred))

[[ 4  0  2  0  1  0  0]
 [ 1 36  1  6  0  1  0]
 [ 2  1 11  1  0  0  1]
 [ 0  3  0 74  1  0  0]
 [ 0  0  0  2 11  1  0]
 [ 0  0  0  0  0 11  1]
 [ 0  0  0  2  1  0 17]]
              precision    recall  f1-score   support

           0       0.57      0.57      0.57         7
           1       0.90      0.80      0.85        45
           2       0.79      0.69      0.73        16
           3       0.87      0.95      0.91        78
           4       0.79      0.79      0.79        14
           5       0.85      0.92      0.88        12
           6       0.89      0.85      0.87        20

    accuracy                           0.85       192
   macro avg       0.81      0.79      0.80       192
weighted avg       0.85      0.85      0.85       192



# 2: Bag of Words, Bag of Popcorn

By this point, you are ready for the [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [14]:
df = pd.read_csv('data/labeledTrainData.tsv.zip', delimiter="\t", quoting=3)
y = df['sentiment']
X = df['review']
raw_review = df['review']

import string
X = df['review'].str.lower().str.replace('[{}]'.format(string.punctuation),'')
X = X.str.replace('<[^<]+?>', '', regex=True)
X = X.str.replace('[^a-zA-Z]', ' ', regex=True)
X = X.str.replace('  ', ' ', regex=True)

In [30]:
from bs4 import BeautifulSoup 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#review = BeautifulSoup(raw_review).get_text()    
review = raw_review.str.replace('<[^<]+?>', '', regex=True) 
letters_only = review.str.replace('[^a-zA-Z]', ' ', regex=True)
words = letters_only.str.lower().str.split()                             
stops = set(stopwords.words("english"))                  
words_list = words.apply(lambda x: [w for w in x if not w in stops])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [32]:
words_list

0        [stuff, going, moment, mj, started, listening,...
1        [classic, war, worlds, timothy, hines, enterta...
2        [film, starts, manager, nicholas, bell, giving...
3        [must, assumed, praised, film, greatest, filme...
4        [superbly, trashy, wondrously, unpretentious, ...
                               ...                        
24995    [seems, like, consideration, gone, imdb, revie...
24996    [believe, made, film, completely, unnecessary,...
24997    [guy, loser, get, girls, needs, build, picked,...
24998    [minute, documentary, bu, uel, made, early, on...
24999    [saw, movie, child, broke, heart, story, unfin...
Name: review, Length: 25000, dtype: object

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
X_test, X_holdout, y_test, y_holdout = train_test_split(X_test, y_test, test_size=0.50)

In [19]:
class DenseTransformer():

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [20]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('norm', Normalizer()),
    #('scale', StandardScaler()),
    ('dense', DenseTransformer()),
    #('pca', PCA()),
    #('pca', SparsePCA()),
    #('lr', LogisticRegression())
    ('cnb', ComplementNB())
    #('rfc', RandomForestClassifier())
    #('gbc', GradientBoostingClassifier())
])

param_grid = [{
    'tfidf__max_df': [0.92],
    'tfidf__min_df': [2],
    'tfidf__max_features': [14000, 20000, 30000],
    'tfidf__stop_words': ['english'],
    #'pca__n_components': [100, 150, 200, 250],
    #'lr__fit_intercept': [True, False],
    #'lr__C': [0.00001, 0.0001, 0.001, 0.01]
    'cnb__alpha': [0.8, 0.9, 1.1, 2, 4, 10, 100]
    #'rfc__n_estimators': [300, 400, 500]
    #'gbc__n_estimators': [300]

}]

grid = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=6)
grid.fit(X_train, y_train)
grid.best_params_

Fitting 5 folds for each of 21 candidates, totalling 105 fits
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   33.1s
[Parallel(n_jobs=6)]: Done 105 out of 105 | elapsed:  1.3min finished


{'cnb__alpha': 1.1,
 'tfidf__max_df': 0.92,
 'tfidf__max_features': 30000,
 'tfidf__min_df': 2,
 'tfidf__stop_words': 'english'}

In [21]:
# use the best params by default
grid_pred = grid.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, grid_pred))
print(classification_report(y_test, grid_pred))

[[1618  235]
 [ 308 1589]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1853
           1       0.87      0.84      0.85      1897

    accuracy                           0.86      3750
   macro avg       0.86      0.86      0.86      3750
weighted avg       0.86      0.86      0.86      3750



In [22]:
# run once everything is optimized
pred = grid.predict(X_holdout)
print(confusion_matrix(y_holdout, pred))
print(classification_report(y_holdout, pred))

[[1662  224]
 [ 296 1568]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1886
           1       0.88      0.84      0.86      1864

    accuracy                           0.86      3750
   macro avg       0.86      0.86      0.86      3750
weighted avg       0.86      0.86      0.86      3750

