<a href="https://colab.research.google.com/github/nguyenthanhdat20130013/Machine-Learning/blob/main/Lab_8_20130013_NguyenThanhDat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task. 

*   **Deadline: 23:59, 17/4/2023**



# Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable
from sklearn.linear_model import LogisticRegression

#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [None]:
dataset1 = datasets.load_iris()
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
X = dataset1['data']
y = dataset1['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)            
svc = svm.SVC(kernel='linear')
grid_fr_class = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class.fit(X_train, y_train)
y_predict = grid_fr_class.predict(X_test)
#best hyperparameters
best_svm_s1 = grid_fr_class.best_score_
best_svm_p1 = grid_fr_class.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_svm_p1)
print("Best score: ", best_svm_s1)

accuracy_svm = accuracy_score(y_test, y_predict)
precision_svm = precision_score(y_test, y_predict, average= 'macro')
recall_svm = recall_score(y_test, y_predict, average= 'macro')
f1_svm = f1_score(y_test, y_predict, average= 'macro') 
cols = [("GridSearchCV for SVM",accuracy_svm,precision_svm,recall_svm,f1_svm)]
df1 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df1



The best hyperparameters using the following param_grid.
Best hyperparameters:  {'C': 1, 'gamma': 1, 'kernel': 'linear'}
Best score:  0.9666666666666666


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for SVM,1.0,1.0,1.0,1.0


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [None]:
dataset1_2 = datasets.load_iris()
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
 
X = dataset1_2['data']
y = dataset1_2['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 
knn = KNeighborsClassifier(n_neighbors=5)

grid_fr_class1 = GridSearchCV(estimator=knn, param_grid=grid_params, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class1.fit(X_train, y_train)
y_predict = grid_fr_class1.predict(X_test)
#best hyperparameters
best_knn_s1 = grid_fr_class1.best_score_
best_knn_p1 = grid_fr_class1.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_knn_p1)
print("Best score: ", best_knn_s1)

accuracy_knn = accuracy_score(y_test, y_predict)
precision_knn = precision_score(y_test, y_predict, average= 'macro')
recall_knn = recall_score(y_test, y_predict, average= 'macro')
f1_knn = f1_score(y_test, y_predict, average= 'macro') 
cols = [(" GridSearchCV for kNN",accuracy_knn,precision_knn,recall_knn,f1_knn)]
df1_2 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df1_2

The best hyperparameters using the following param_grid.
Best hyperparameters:  {'metric': 'minkowski', 'n_neighbors': 7, 'weights': 'uniform'}
Best score:  0.9666666666666666


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for kNN,1.0,1.0,1.0,1.0


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
dataset1_3 = datasets.load_iris()
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
 
X = dataset1_3['data']
y = dataset1_3['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2) 
clf = RandomForestClassifier(n_estimators=100)

grid_fr_class2 = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class2.fit(X_train, y_train)
y_predict = grid_fr_class2.predict(X_test)
#best hyperparameters
best_rf_s13 = grid_fr_class2.best_score_
best_rf_p13 = grid_fr_class2.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_rf_p13)
print("Best score: ", best_rf_s13)

accuracy_clf = accuracy_score(y_test, y_predict)
precision_clf = precision_score(y_test, y_predict, average= 'macro')
recall_clf = recall_score(y_test, y_predict, average= 'macro')
f1_clf = f1_score(y_test, y_predict, average= 'macro') 
cols = [("GridSearchCV for Random Forest",accuracy_clf,precision_clf,recall_clf,f1_clf)]
df1_3 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df1_3

The best hyperparameters using the following param_grid.
Best hyperparameters:  {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 6, 'n_estimators': 25}
Best score:  0.9666666666666666


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for Random Forest,0.933333,0.921296,0.921296,0.921296


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [None]:
table = PrettyTable(['Algorithm', 'Best score', 'Best hyperparameters'])
table.add_row(['SVM',best_svm_s1, best_svm_p1])
table.add_row(['kNN',best_knn_s1, best_knn_p1])
table.add_row(['Random Forest',best_rf_s13, best_rf_p13])
# Print the table
print(table)


+---------------+--------------------+-----------------------------------------------------------------------------------+
|   Algorithm   |     Best score     |                                Best hyperparameters                               |
+---------------+--------------------+-----------------------------------------------------------------------------------+
|      SVM      | 0.9666666666666666 |                      {'C': 1, 'gamma': 1, 'kernel': 'linear'}                     |
|      kNN      | 0.9666666666666666 |          {'metric': 'minkowski', 'n_neighbors': 7, 'weights': 'uniform'}          |
| Random Forest | 0.9666666666666666 | {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 6, 'n_estimators': 25} |
+---------------+--------------------+-----------------------------------------------------------------------------------+


In [None]:
table = PrettyTable()
table.field_names = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1']
# add rows to the table
table.add_row(["GridSearchCV for SVM", accuracy_svm,precision_svm,recall_svm,f1_svm ])
table.add_row(["GridSearchCV for kNN", accuracy_knn,precision_knn,recall_knn,f1_knn])
table.add_row(["GridSearchCV for Random Forest", round(accuracy_clf,4), round(precision_clf,4), round(recall_clf,4) ,round(f1_clf,4)])

# print the table
print(table)


+--------------------------------+----------+-----------+--------+--------+
|           Algorithm            | Accuracy | Precision | Recall |   f1   |
+--------------------------------+----------+-----------+--------+--------+
|      GridSearchCV for SVM      |   1.0    |    1.0    |  1.0   |  1.0   |
|      GridSearchCV for kNN      |   1.0    |    1.0    |  1.0   |  1.0   |
| GridSearchCV for Random Forest |  0.9333  |   0.9213  | 0.9213 | 0.9213 |
+--------------------------------+----------+-----------+--------+--------+


#Task 2. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [2]:
cancer = datasets.load_breast_cancer()
X = cancer['data']
y = cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

*   2.1. Apply **GridSearchCV** to **SVM** 


In [5]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)            
svc = svm.SVC(kernel='linear')
grid_fr_class = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class.fit(X_train, y_train)
y_predict = grid_fr_class.predict(X_test)
#best hyperparameters
best_svm_s2 = grid_fr_class.best_score_
best_svm_p2 = grid_fr_class.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_svm_p2)
print("Best score: ", best_svm_s2)
accuracy_svm = accuracy_score(y_test, y_predict)
precision_svm = precision_score(y_test, y_predict, average= 'macro')
recall_svm = recall_score(y_test, y_predict, average= 'macro')
f1_svm = f1_score(y_test, y_predict, average= 'macro') 

cols = [("GridSearchCV for SVM",accuracy_svm,precision_svm,recall_svm,f1_svm)]
df2_1 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df2_1


The best hyperparameters using the following param_grid.
Best hyperparameters:  {'C': 10, 'gamma': 1, 'kernel': 'linear'}
Best score:  0.9583091787439614


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for SVM,0.964912,0.958074,0.967257,0.962302


*   2.2. Apply **GridSearchCV** to **kNN** 

In [4]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
knn = KNeighborsClassifier(n_neighbors=5)

grid_fr_class1 = GridSearchCV(estimator=knn, param_grid=grid_params, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class1.fit(X_train, y_train)
y_predict = grid_fr_class1.predict(X_test)
#best hyperparameters
best_knn_s2 = grid_fr_class1.best_score_
best_knn_p2 = grid_fr_class1.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_knn_p2)
print("Best score: ", best_knn_s2)
accuracy_knn = accuracy_score(y_test, y_predict)
precision_knn = precision_score(y_test, y_predict, average= 'macro')
recall_knn = recall_score(y_test, y_predict, average= 'macro')
f1_knn = f1_score(y_test, y_predict, average= 'macro') 
cols = [(" GridSearchCV for kNN",accuracy_knn,precision_knn,recall_knn,f1_knn)]
df1_2 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df1_2

The best hyperparameters using the following param_grid.
Best hyperparameters:  {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best score:  0.9404830917874397


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for kNN,0.929825,0.926992,0.915897,0.921053


*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [7]:
# Create a LogisticRegression object
lr = LogisticRegression(max_iter=5000)
# Set up the hyperparameter grid
grid_params = {'penalty': ['l1', 'l2'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'solver': ['liblinear', 'saga']}

# Apply GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lr, param_grid=grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
#
y_predict = grid_search.predict(X_test)
#best hyperparameters
best_lr_s2 = grid_search.best_score_
best_lr_p2 = grid_search.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_lr_p2)
print("Best score: ", best_lr_s2)
accuracy_lr = accuracy_score(y_test, y_predict)
precision_lr = precision_score(y_test, y_predict, average= 'macro')
recall_lr = recall_score(y_test, y_predict, average= 'macro')
f1_lr = f1_score(y_test, y_predict, average= 'macro') 
cols = [("GridSearchCV to LogisticRegression",accuracy_lr, precision_lr, recall_lr, f1_lr)]
df2_3 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df2_3


The best hyperparameters using the following param_grid.
Best hyperparameters:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best score:  0.9582417582417582


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV to LogisticRegression,0.964912,0.967179,0.956565,0.961486


*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [8]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
clf = RandomForestClassifier(n_estimators=100)

grid_fr_class2 = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv =10, refit=True,return_train_score=True)
grid_fr_class2.fit(X_train, y_train)
y_predict = grid_fr_class2.predict(X_test)
#best hyperparameters
best_rd_s2 = grid_fr_class2.best_score_
best_rd_p2 = grid_fr_class2.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_rd_p2)
print("Best score: ", best_rd_s2)
accuracy_clf = accuracy_score(y_test, y_predict)
precision_clf = precision_score(y_test, y_predict, average= 'macro')
recall_clf = recall_score(y_test, y_predict, average= 'macro')
f1_clf = f1_score(y_test, y_predict, average= 'macro') 
cols = [("GridSearchCV for Random Forest",accuracy_clf,precision_clf,recall_clf,f1_clf)]
df2_4 = pd.DataFrame(data = cols, columns = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1'])
df2_4

The best hyperparameters using the following param_grid.
Best hyperparameters:  {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 50}
Best score:  0.9582608695652175


Unnamed: 0,Algorithm,Accuracy,Precision,Recall,f1
0,GridSearchCV for Random Forest,0.991228,0.988095,0.993151,0.990528


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [10]:
table = PrettyTable(['Algorithm', 'Best score', 'Best hyperparameters'])
table.add_row(['SVM',best_svm_s2, best_svm_p2])
table.add_row(['kNN',best_knn_s2, best_knn_p2])
table.add_row(['LogisticRegression',best_lr_s2, best_lr_p2])
table.add_row(['Random Forest',best_rd_s2, best_rd_p2])
# Print the table
print(table)

+--------------------+--------------------+-----------------------------------------------------------------------------------+
|     Algorithm      |     Best score     |                                Best hyperparameters                               |
+--------------------+--------------------+-----------------------------------------------------------------------------------+
|        SVM         | 0.9583091787439614 |                     {'C': 10, 'gamma': 1, 'kernel': 'linear'}                     |
|        kNN         | 0.9404830917874397 |          {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}          |
| LogisticRegression | 0.9582417582417582 |                 {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}                |
|   Random Forest    | 0.9582608695652175 | {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 50} |
+--------------------+--------------------+-------------------------------------------------------------

In [12]:
table = PrettyTable()
table.field_names = ['Algorithm','Accuracy', 'Precision', 'Recall',' f1']
# add rows to the table
table.add_row(["GridSearchCV for SVM", round(accuracy_svm,4),round(precision_svm,4),round(recall_svm,4),round(f1_svm,4)])
table.add_row(["GridSearchCV for kNN", round(accuracy_knn,4),round(precision_knn,4),round(recall_knn,4),round(f1_knn,4)])
table.add_row(["GridSearchCV for LogisticRegression", round(accuracy_lr,4),round(precision_lr,4),round(recall_lr,4),round(f1_lr,4)])
table.add_row(["GridSearchCV for Random Forest", round(accuracy_clf,4), round(precision_clf,4), round(recall_clf,4) ,round(f1_clf,4)])

# print the table
print(table)

+-------------------------------------+----------+-----------+--------+--------+
|              Algorithm              | Accuracy | Precision | Recall |   f1   |
+-------------------------------------+----------+-----------+--------+--------+
|         GridSearchCV for SVM        |  0.9649  |   0.9581  | 0.9673 | 0.9623 |
|         GridSearchCV for kNN        |  0.9298  |   0.927   | 0.9159 | 0.9211 |
| GridSearchCV for LogisticRegression |  0.9649  |   0.9672  | 0.9566 | 0.9615 |
|    GridSearchCV for Random Forest   |  0.9912  |   0.9881  | 0.9932 | 0.9905 |
+-------------------------------------+----------+-----------+--------+--------+


#Task 3. 
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [13]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   3.2. Movie reviews information

In [14]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   3.3. Create dataset from movie reviews

In [15]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [16]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [17]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   3.4. Train test split

In [18]:
train, test = train_test_split(documents, test_size = 0.33, random_state=42)

In [19]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [20]:
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   3.5. Text Vectorization

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

*   3.6. Apply **SVM** with **GridSearchCV** 

In [22]:
# Set the parameters for the SVM and the GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
svc = svm.SVC()
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5)

grid_search.fit(X_train_bow, y_train)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_model)
print("Best score: ", best_score)


The best hyperparameters using the following param_grid.
Best hyperparameters:  SVC(C=10, gamma=0.1)
Best score:  0.85


In [23]:
best_param = grid_search.best_params_
print(best_param)

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [26]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
clf = RandomForestClassifier(n_estimators=100)

grid_fr_class = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_fr_class.fit(X_train_bow, y_train)
#best hyperparameters
best_rf_s3 = grid_fr_class.best_score_
best_rf_p3 = grid_fr_class.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_rf_p3)
print("Best score: ", best_rf_s3)


The best hyperparameters using the following param_grid.
Best hyperparameters:  {'max_depth': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 100}
Best score:  0.8059701492537314


*   3.8. Apply **kNN** with **GridSearchCV** 

In [28]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
knn = KNeighborsClassifier(n_neighbors=5)
grid_knn_class = GridSearchCV(knn, grid_params, refit = True, cv=5)
grid_knn_class.fit(X_train_bow, y_train)

best_knn_p3 = grid_knn_class.best_params_
best_knn_s3 = grid_knn_class.best_score_
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_knn_p3)
print("Best score: ", best_knn_s3)
            

The best hyperparameters using the following param_grid.
Best hyperparameters:  {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
Best score:  0.6462686567164179


*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [33]:
param_grid = {
    'penalty' : ['l2'],
    'C'       : np.logspace(-3,3,7),
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'],
}

logistic = LogisticRegression(max_iter=5000)
grid_lr_class = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
grid_lr_class.fit(X_train_bow, y_train)
#best hyperparameters
best_lr_s3 = grid_lr_class.best_score_
best_lr_p3 = grid_lr_class.best_params_
# print best parameter after tuning
print("The best hyperparameters using the following param_grid.")
print("Best hyperparameters: ", best_lr_p3)
print("Best score: ", best_lr_s3)



The best hyperparameters using the following param_grid.
Best hyperparameters:  {'C': 100.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score:  0.8544776119402986


*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [34]:
table = PrettyTable(['Algorithm', 'Best score', 'Best hyperparameters'])
table.add_row(['SVM',best_score, best_param])
table.add_row(['kNN',best_knn_s3, best_knn_p3])
table.add_row(['LogisticRegression',best_lr_s3, best_lr_p3])
table.add_row(['Random Forest',best_rf_s3, best_rf_p3])
# Print the table
print(table)

+--------------------+--------------------+------------------------------------------------------------------------------------+
|     Algorithm      |     Best score     |                                Best hyperparameters                                |
+--------------------+--------------------+------------------------------------------------------------------------------------+
|        SVM         |        0.85        |                      {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}                      |
|        kNN         | 0.6462686567164179 |         {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}          |
| LogisticRegression | 0.8544776119402986 |                {'C': 100.0, 'penalty': 'l2', 'solver': 'newton-cg'}                |
|   Random Forest    | 0.8059701492537314 | {'max_depth': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 100} |
+--------------------+--------------------+------------------------------------------------------

#Finally,
Save a copy in your Github. Remember renaming the notebook.