## Martin Dionne

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
#from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
import lightgbm as lgbm

## 1: Face Recognition, but not evil this time

Using the faces dataset in:

```
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
```

If you use the `faces.target` and `faces.target_names` attributes, you can build a facial recognition algorithm.

Use sklearn **gridsearch** (or an equivalent, like random search) to optimize the model for accuracy. Try both a SVM-based classifier and a logistic regression based classifier (with a feature pipeline of your choice) to get the best model. You should have at least 80% accuracy.

In [2]:
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
#faces = fetch_lfw_people(min_faces_per_person=60, resize=0.4, color=False)

X = faces.data
y = faces.target
labels = faces.target_names

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
#X_train.shape, X_test.shape

In [4]:
# SVM
pipe_svc = Pipeline([
    ('pca', PCA()),
    ('svm', SVC())
])

param_grid_svc = [{
    'pca__whiten': [True, False],
    'pca__n_components': [50, 75, 100, 175],
    'svm__kernel': ['poly', 'rbf', 'sigmoid'],
    'svm__C': [5e4, 1e5, 5e5, 1e6],
    'svm__gamma': [0.001, 0.005, 0.01, 0.1]
}]

grid_svc = GridSearchCV(pipe_svc, param_grid_svc, 
                        scoring='accuracy',
                        cv=5,
                        n_jobs=5,
                        verbose=3)

grid_svc.fit(X_train, y_train)
print(grid_svc.best_params_)
print(grid_svc.best_score_)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    3.3s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:    9.1s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:   19.4s
[Parallel(n_jobs=5)]: Done 502 tasks      | elapsed:   33.7s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:   55.5s
[Parallel(n_jobs=5)]: Done 1142 tasks      | elapsed:  1.4min
[Parallel(n_jobs=5)]: Done 1558 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done 1920 out of 1920 | elapsed:  2.8min finished
{'pca__n_components': 75, 'pca__whiten': True, 'svm__C': 100000.0, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
0.8562211981566821


In [5]:
grid_svc_pred = grid_svc.predict(X_test)

print(confusion_matrix(y_test, grid_svc_pred))
print(classification_report(y_test, grid_svc_pred))

[[ 4  1  1  0  1  0  0]
 [ 0 41  0  3  1  0  0]
 [ 0  1 10  4  0  0  1]
 [ 2  0  1 75  0  0  0]
 [ 0  0  1  3 10  0  0]
 [ 0  0  0  1  0 11  0]
 [ 0  0  0  1  1  0 18]]
              precision    recall  f1-score   support

           0       0.67      0.57      0.62         7
           1       0.95      0.91      0.93        45
           2       0.77      0.62      0.69        16
           3       0.86      0.96      0.91        78
           4       0.77      0.71      0.74        14
           5       1.00      0.92      0.96        12
           6       0.95      0.90      0.92        20

    accuracy                           0.88       192
   macro avg       0.85      0.80      0.82       192
weighted avg       0.88      0.88      0.88       192



In [6]:
# Logistic Regression
pipe_lgr = Pipeline([
    ('pca', PCA()),
    ('lgr', LogisticRegression())
])

param_grid_lgr = [{
    'pca__whiten': [True, False],
    'pca__n_components': [100, 150, 250, 300],
    'lgr__fit_intercept': [True, False],
    'lgr__C': [1e-6, 5e-5, 1e-4, 0.0005]
}]

grid_lgr = GridSearchCV(pipe_lgr, param_grid_lgr, 
                        scoring='accuracy',
                        cv=5,
                        n_jobs=5,
                        verbose=3)

grid_lgr.fit(X_train, y_train)
print(grid_lgr.best_params_)
print(grid_svc.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    1.7s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:   10.4s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:   24.4s
[Parallel(n_jobs=5)]: Done 320 out of 320 | elapsed:   27.8s finished
{'lgr__C': 0.0005, 'lgr__fit_intercept': False, 'pca__n_components': 250, 'pca__whiten': False}
0.8562211981566821
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
grid_lgr_pred = grid_lgr.predict(X_test)

print(confusion_matrix(y_test, grid_lgr_pred))
print(classification_report(y_test, grid_lgr_pred))

[[ 4  1  1  0  1  0  0]
 [ 1 38  1  3  0  1  1]
 [ 2  3 10  1  0  0  0]
 [ 2  3  2 64  3  1  3]
 [ 0  0  0  2 12  0  0]
 [ 0  0  0  1  0 11  0]
 [ 0  1  0  0  1  1 17]]
              precision    recall  f1-score   support

           0       0.44      0.57      0.50         7
           1       0.83      0.84      0.84        45
           2       0.71      0.62      0.67        16
           3       0.90      0.82      0.86        78
           4       0.71      0.86      0.77        14
           5       0.79      0.92      0.85        12
           6       0.81      0.85      0.83        20

    accuracy                           0.81       192
   macro avg       0.74      0.78      0.76       192
weighted avg       0.82      0.81      0.81       192



# 2: Bag of Words, Bag of Popcorn

By this point, you are ready for the [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [8]:
df = pd.read_csv('data/labeledTrainData.tsv.zip', delimiter="\t", quoting=3)
y = df['sentiment']
X = df['review']
raw_review = df['review']

import string
X = X.str.replace('<[^<]+?>', '', regex=True)
#X = df['review'].str.lower().str.replace('[{}]'.format(string.punctuation),'')
#X = X.str.replace('[\W]+', ' ', regex=True).str.lower()
X = X.str.replace('[^a-zA-Z]', ' ', regex=True).str.lower()
X = X.str.replace('  ', ' ', regex=True)
#X[0]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
#X_test, X_holdout, y_test, y_holdout = train_test_split(X_test, y_test, test_size=0.50)

In [10]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin, BaseEstimator

class Lemmatizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y):
        return self

    def transform(self, X):
        return  X.apply(lambda text: " ".join([self.lemmatizer.lemmatize(word) for word in text.split()]))

In [11]:
import numpy as np
import spacy
nlp = spacy.load('en_core_web_md')
from sklearn.base import TransformerMixin, BaseEstimator

class Doc2Vec(BaseEstimator, TransformerMixin):
    def __init__(self, nlp, doc='doc'):
        self.doc = doc
        self.nlp = nlp

    def fit(self, X, y):
        return self

    def transform(self, X):
        if self.doc == 'word':
            # return a vector for each word
            # need more work
            vec = X.apply(lambda text: np.array([self.nlp(w).vector for w in text]) )
        elif self.doc == 'sentense':
            # return a vector for each sentense
            # need more work
            vec = X.apply(lambda text: np.array([self.nlp(s).vector for s in text.split('.')]) )
        else:
            # return a vector for the whole doc 
            vec = X.apply(lambda text: np.array(self.nlp(text).vector))
            #vec = np.array([self.nlp(text).vector for text in X])
        return  pd.DataFrame(vec.values.tolist(), index=vec.index)

In [12]:
# Fastest Model
pipe_lsvc = Pipeline([
    ('lemma', Lemmatizer()),
    ('tfidf', TfidfVectorizer()),
    #('norm', Normalizer()),
    #('d2v', Doc2Vec(nlp)),
    #('mnb', MultinomialNB())
    #('svc', SVC())
    ('lsvc', LinearSVC())
])

param_grid_lsvc = [{
    ##'tfidf__max_df': [0.92],
    ##'tfidf__min_df': [2],
    ##'tfidf__max_features': [6000],
    #'mnb__alpha': [0.5, 0.8, 1],
    ##'svc__kernel': ['rbf'],
    ##'svc__C': [0.75],
}]

grid_lsvc = GridSearchCV(pipe_lsvc, param_grid_lsvc, 
                    scoring='accuracy', 
                    cv=5,
                    n_jobs=5,
                    verbose=3)

grid_lsvc.fit(X_train, y_train)
print(grid_lsvc.best_params_)
print(grid_lsvc.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   33.3s remaining:   50.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   33.5s finished
{}
0.8918588235294116


In [13]:
grid_pred_lsvc = grid_lsvc.predict(X_test)

print(confusion_matrix(y_test, grid_pred_lsvc))
print(classification_report(y_test, grid_pred_lsvc))

[[1631  222]
 [ 161 1736]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1853
           1       0.89      0.92      0.90      1897

    accuracy                           0.90      3750
   macro avg       0.90      0.90      0.90      3750
weighted avg       0.90      0.90      0.90      3750



In [14]:
# Most Accurate Model
pipe_svm = Pipeline([
    ('lemma', Lemmatizer()),
    ('tfidf', TfidfVectorizer()),
    #('norm', Normalizer()),
    #('d2v', Doc2Vec(nlp)),
    #('pca', PCA())
    #('mnb', MultinomialNB())
    ('svm', SVC())
    #('lsvc', LinearSVC())
])

param_grid_svm  = [{
    #'tfidf__max_df': [0.92],
    #'tfidf__min_df': [2],
    #'tfidf__max_features': [6000],
    ##'pca__whiten': [True, False],
    ##'pca__n_components': [100, 200, 300],
    'svm__kernel': ['poly', 'rbf'],
    'svm__C': [5, 10, 15],
}]

grid_svm = GridSearchCV(pipe_svm, param_grid_svm, 
                    scoring='accuracy', 
                    cv=5,
                    n_jobs=5,
                    verbose=3)

grid_svm.fit(X_train, y_train)
print(grid_svm.best_params_)
print(grid_svm.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 122.4min finished
{'svm__C': 5, 'svm__kernel': 'rbf'}
0.8967529411764705


In [15]:
grid_pred_svm = grid_svm.predict(X_test)

print(confusion_matrix(y_test, grid_pred_svm))
print(classification_report(y_test, grid_pred_svm))

[[1645  208]
 [ 162 1735]]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1853
           1       0.89      0.91      0.90      1897

    accuracy                           0.90      3750
   macro avg       0.90      0.90      0.90      3750
weighted avg       0.90      0.90      0.90      3750



In [16]:
# Last try
pipe_lgbm = Pipeline([
    #('lemma', Lemmatizer()),
    #('tfidf', TfidfVectorizer()),
    ('d2v', Doc2Vec(nlp)),
    #('norm', Normalizer()),
    ('lbgm', lgbm.LGBMClassifier())
])

param_grid_lgbm = [{
    #'tfidf__max_df': [0.92],
    #'tfidf__min_df': [2],
    #'tfidf__max_features': [6000],
    'lbgm__boosting_type' : ['goss'], 
    'lbgm__num_leaves': [15, 25, 35],
    'lbgm__max_depth': [-1],
    'lbgm__n_estimators': [200, 300, 400]
}]

grid_lgbm = GridSearchCV(pipe_lgbm, param_grid_lgbm, 
                    scoring='accuracy', 
                    cv=5,
                    n_jobs=5,
                    verbose=3)

grid_lgbm.fit(X_train, y_train)
print(grid_lgbm.best_params_)
print(grid_lgbm.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed: 76.8min
[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed: 138.9min finished
{'lbgm__boosting_type': 'goss', 'lbgm__max_depth': -1, 'lbgm__n_estimators': 400, 'lbgm__num_leaves': 35}
0.8483294117647059


In [17]:
grid_pred_lgbm = grid_lgbm.predict(X_test)

print(confusion_matrix(y_test, grid_pred_lgbm))
print(classification_report(y_test, grid_pred_lgbm))

[[1549  304]
 [ 258 1639]]
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1853
           1       0.84      0.86      0.85      1897

    accuracy                           0.85      3750
   macro avg       0.85      0.85      0.85      3750
weighted avg       0.85      0.85      0.85      3750

