<a href="https://colab.research.google.com/github/namthuan2304/ML_2024/blob/main/Lab_7_21130556_%C4%90%E1%BA%B7ngNamThu%E1%BA%ADn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for a classification task.

*   **Deadline: 23:59, 22/4/2024 (lớp TH thứ 3) || 29/4/2024 (lớp TH thứ 5)**



In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/MyDrive'

Mounted at /content/gdrive
/content/gdrive/MyDrive/MyDrive


# Import libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pylab as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from pandas import read_csv, DataFrame
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.datasets import load_iris, load_breast_cancer
from prettytable import PrettyTable
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from pandas import Series
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, OneHotEncoder, LabelEncoder
from pandas import get_dummies, concat

%pylab inline
#%run Utils.ipynb
# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


# Task 1. With the **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [40]:
def myGridSearchCV(X, y, classifier, params, test_size=0.2, random_state=1, cv=10, scoring='accuracy'):
    """Tune ``classifier`` with GridSearchCV and score the refit model on a hold-out split.

    The data is split once into a training and a test part. The grid search
    (with cross-validation) is run on the training part only; the best
    configuration is refit on the whole training part and then evaluated on
    the test part.

    Parameters
    ----------
    X, y
        Features and target, forwarded to ``train_test_split``.
    classifier
        Unfitted scikit-learn estimator to tune.
    params
        Parameter grid (dict, or list of dicts) forwarded as ``param_grid``.
    test_size
        Fraction of the data held out for the final evaluation (default 0.2).
    random_state
        Seed of the train/test split (default 1).
    cv
        Cross-validation setting forwarded to ``GridSearchCV`` (default 10 folds).
    scoring
        Metric used to rank the candidates (default ``'accuracy'``).

    Returns
    -------
    tuple
        ``(algoName, best_params, best_score, accuracy, precision, recall, f1)``
        where ``algoName`` is the estimator's class name, ``best_params`` and
        ``best_score`` come from the cross-validated search, and the remaining
        four values are measured on the held-out test part (precision, recall
        and F1 are macro-averaged).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train-set scores are not requested (return_train_score defaults to False):
    # the search object never leaves this function, so computing them would only
    # slow the search down without affecting best_params_ / best_score_.
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=params,
        scoring=scoring,
        refit=True,  # refit the winner on the full training part so predict() works
        cv=cv,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)

    # Hold-out metrics of the refit best estimator.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    algoName = type(classifier).__name__
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return algoName, best_params, best_score, accuracy, precision, recall, f1

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear']
}

In [None]:
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, SVC(), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: SVC, Params: {'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [None]:
table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used when calculating the similarity.


In [None]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [None]:
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, KNeighborsClassifier(), grid_params)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: KNeighborsClassifier, Params: {'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'uniform'}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [None]:
# Seed the forest: an unseeded RandomForestClassifier builds different trees on
# every run, so the selected parameters and the reported scores would not be
# reproducible.
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, RandomForestClassifier(random_state=42), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: RandomForestClassifier, Params: {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 6, 'n_estimators': 25}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to display the results)

In [None]:
print("Results of GridSearchCV for three classifiers (SVM, kNN, Random Forest)")
print(table)

Results of GridSearchCV for three classifiers (SVM, kNN, Random Forest)
+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       Classifier       |     Best_Score     |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          SVC           | 0.9833333333333332 |        1.0         |        1.0         |        1.0         |        1.0         |
|  KNeighborsClassifier  |       0.975        | 0.9666666666666667 | 0.9523809523809524 | 0.9743589743589745 | 0.9610256410256409 |
| RandomForestClassifier |        0.95        | 0.9666666666666667 | 0.9523809523809524 | 0.9743589743589745 | 0.9610256410256409 |
+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+


# Task 2.
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [None]:
cancer = load_breast_cancer(as_frame=True)
X = cancer.data
y = cancer.target

*   2.1. Apply **GridSearchCV** to **SVM**


In [None]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

In [None]:
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, SVC(), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: SVC, Params: {'C': 1000, 'gamma': 1, 'kernel': 'linear'}


In [4]:
table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   2.2. Apply **GridSearchCV** to **kNN**

In [None]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [None]:
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, KNeighborsClassifier(), grid_params)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: KNeighborsClassifier, Params: {'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   2.3. Apply **GridSearchCV** to **LogisticRegression**

In [None]:
params_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

In [None]:
# Seed the estimator: both solvers in the grid ('liblinear' and 'saga') draw on
# a random generator, so without a fixed random_state the tuned result can
# differ between runs.
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, LogisticRegression(random_state=42), params_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: LogisticRegression, Params: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}




In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   2.4. Apply **GridSearchCV** to **RandomForest**

In [None]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [None]:
# Seed the forest: an unseeded RandomForestClassifier builds different trees on
# every run, so the selected parameters and the reported scores would not be
# reproducible.
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, RandomForestClassifier(random_state=42), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: RandomForestClassifier, Params: {'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 50}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
print("Results of GridSearchCV for four classifiers (SVM, kNN, Logistic Regression, Random Forest)")
print(table)

Results of GridSearchCV for four classifiers (SVM, kNN, Logistic Regression, Random Forest)
+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       Classifier       |     Best_Score     |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          SVC           | 0.9584541062801932 | 0.956140350877193  | 0.9605128205128205 | 0.9454365079365079 | 0.9521289997480473 |
|  KNeighborsClassifier  | 0.9363768115942029 | 0.9298245614035088 | 0.9412393162393162 | 0.9097222222222223 | 0.9220512820512821 |
|   LogisticRegression   | 0.9539130434782608 | 0.9649122807017544 | 0.9672297297297296 | 0.9573412698412699 | 0.9619111259605747 |
| RandomForestClassifier | 0.9606280193236716 | 0.9473684210526315 | 0.9615384615384616 | 0.9285714285714286 | 0.941

# Task 3. With the **mobile price classification** dataset
* 3.1.  Apply **GridSearchCV** for **SVM, kNN, RandomForest** algorithms to find the best hyperparameters for each classification algorithm.


In [None]:
mobile = read_csv("mobile.csv")
X = mobile.drop(columns=['price_range'])
y = mobile['price_range']

*   3.1.1 Apply **GridSearchCV** to **SVM**

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear']
}

In [None]:
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, SVC(), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: SVC, Params: {'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [None]:
table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   3.1.2 Apply **GridSearchCV** to **kNN**

In [None]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [None]:
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, KNeighborsClassifier(), grid_params)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: KNeighborsClassifier, Params: {'metric': 'minkowski', 'n_neighbors': 15, 'weights': 'uniform'}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

*   3.1.3 Apply **GridSearchCV** to **RandomForest**

In [None]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [None]:
# Seed the forest: an unseeded RandomForestClassifier builds different trees on
# every run, so the selected parameters and the reported scores would not be
# reproducible.
algoName, best_params, best_score, accuracy, precision, recall, f1 = myGridSearchCV(X, y, RandomForestClassifier(random_state=42), param_grid)
print(f"Classifier: {algoName}, Params: {best_params}")

Classifier: RandomForestClassifier, Params: {'max_depth': 6, 'max_features': 'log2', 'max_leaf_nodes': 9, 'n_estimators': 50}


In [None]:
table.add_row([algoName, best_score, accuracy, precision, recall, f1])

* 3.2. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
print("Results of GridSearchCV for three classifiers (SVM, kNN, Random Forest)")
print(table)

Results of GridSearchCV for three classifiers (SVM, kNN, Random Forest)
+------------------------+--------------------+----------+--------------------+--------------------+--------------------+
|       Classifier       |     Best_Score     | Accuracy |     Precision      |       Recall       |      F1_Score      |
+------------------------+--------------------+----------+--------------------+--------------------+--------------------+
|          SVC           |      0.971875      |  0.9625  | 0.9628132188875204 | 0.962922135630298  | 0.9627869600870751 |
|  KNeighborsClassifier  | 0.9356249999999999 |  0.9125  | 0.915778108744907  | 0.9153934235712333 | 0.9144950550583786 |
| RandomForestClassifier | 0.8318749999999999 |  0.7875  | 0.7835033709840211 | 0.790004614438064  | 0.7853581940913821 |
+------------------------+--------------------+----------+--------------------+--------------------+--------------------+


# Task 4.
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion.
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   4.1 Importing additional libraries

In [8]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   4.2. Movie reviews information

In [9]:
# Quick overview of the corpus: number of review files, the class labels,
# the first tokens of the corpus, and the first ten file ids.
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   4.3. Create dataset from movie reviews

In [10]:
# One (tokens, label) pair per review file: the outer loop walks the categories,
# the inner loop walks the file ids of that category.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Shuffle with a fixed seed so the document order is the same on every run.
random.seed(123)
random.shuffle(documents)

In [11]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
#print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [12]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   4.4. Train test split

In [13]:
train, test = train_test_split(documents, test_size = 0.33, random_state=42)

In [14]:
## Sentiment distribution for train and test
print(Counter([label for (words, label) in train])) # one label per review file, so this counts the training reviews per class (674 neg, 666 pos in the recorded run)
print(Counter([label for (words, label) in test]))  # same count for the test reviews (326 neg, 334 pos in the recorded run)

# Re-join each review's tokens into a single space-separated string and
# separate the texts from their labels.
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train] # training labels
y_test = [label for (words, label) in test] # test labels

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


*   4.5. Text Vectorization

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# NOTE: despite the `_bow` suffix, the matrices below hold TF-IDF weights
# (TfidfVectorizer), not raw bag-of-words counts.
# min_df=10 drops terms that occur in fewer than 10 training documents;
# token_pattern keeps only runs of ASCII letters as tokens.
tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # learn vocabulary and IDF weights on the training texts only
X_test_bow = tfidf_vec.transform(X_test) # reuse the fitted vocabulary/IDF for the test texts

*   4.6. Apply **SVM** with **GridSearchCV**

In [16]:
# One sub-grid per kernel. `gamma` is a coefficient of the RBF kernel only; the
# linear kernel ignores it, so crossing gamma with 'linear' (as a single dict
# would) refits every linear candidate five times with identical results.
# GridSearchCV accepts a list of dicts and explores each sub-grid separately.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100, 1000],
    },
]

In [45]:
# 10-fold cross-validated search over the SVM grid, ranked by accuracy; the
# winning configuration is refit on all training vectors (refit=True) and then
# used to predict the held-out test vectors.
grid_search = GridSearchCV(SVC(), param_grid, scoring='accuracy', cv=10,
                           refit=True, return_train_score=True, n_jobs=-1)
y_pred = grid_search.fit(X_train_bow, y_train).predict(X_test_bow)

# Hold-out metrics; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score(y_test, y_pred, average='macro')
    for score in (precision_score, recall_score, f1_score)
)

# Best cross-validation score and the parameters that achieved it.
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_params)

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [17]:
table = PrettyTable(["Classifier", "Best_Score", "Accuracy", "Precision", "Recall", "F1_Score"])

In [18]:
table.add_row(["SVM Algorithm", best_score, accuracy, precision, recall, f1])

NameError: name 'best_score' is not defined

*   4.7. Apply **RandomForest** with **GridSearchCV**

In [19]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [49]:
# 10-fold cross-validated search over the random-forest grid, ranked by accuracy.
# The forest is seeded: an unseeded RandomForestClassifier builds different
# trees on every run, so the selected parameters and the reported scores would
# not be reproducible.
grid_search = GridSearchCV(
  estimator=RandomForestClassifier(random_state=42),
  param_grid=param_grid,
  scoring='accuracy',
  refit=True,  # refit the winner on all training vectors so predict() works
  cv=10,
  return_train_score=True,
  n_jobs=-1
)

grid_search.fit(X_train_bow, y_train)
y_pred = grid_search.predict(X_test_bow)

# Hold-out metrics; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Best parameters and the cross-validation score they achieved.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(best_params)

{'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 150}


In [20]:
table.add_row(["Random Forest", best_score, accuracy, precision, recall, f1])

NameError: name 'best_score' is not defined

*   4.8. Apply **kNN** with **GridSearchCV**

In [6]:
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

In [None]:
# 10-fold cross-validated search over the kNN grid, ranked by accuracy; the
# winning configuration is refit on all training vectors (refit=True) and then
# used to predict the held-out test vectors.
grid_search = GridSearchCV(KNeighborsClassifier(), grid_params, scoring='accuracy',
                           cv=10, refit=True, return_train_score=True, n_jobs=-1)
y_pred = grid_search.fit(X_train_bow, y_train).predict(X_test_bow)

# Hold-out metrics; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score(y_test, y_pred, average='macro')
    for score in (precision_score, recall_score, f1_score)
)

# Best cross-validation score and the parameters that achieved it.
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_params)

In [None]:
table.add_row(["kNN Algorithm", best_score, accuracy, precision, recall, f1])

*   4.9. Apply **LogisticRegression** with **GridSearchCV**

In [None]:
params_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

In [None]:
# 10-fold cross-validated search over the logistic-regression grid, ranked by
# accuracy. The estimator is seeded: both solvers in the grid ('liblinear' and
# 'saga') draw on a random generator, so without a fixed random_state the tuned
# result can differ between runs.
grid_search = GridSearchCV(
  estimator=LogisticRegression(random_state=42),
  param_grid=params_grid,
  scoring='accuracy',
  refit=True,  # refit the winner on all training vectors so predict() works
  cv=10,
  return_train_score=True,
  n_jobs=-1
)

grid_search.fit(X_train_bow, y_train)
y_pred = grid_search.predict(X_test_bow)

# Hold-out metrics; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Best parameters and the cross-validation score they achieved.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(best_params)

In [None]:
table.add_row(["Logistic Regression", best_score, accuracy, precision, recall, f1])

*   4.10. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
print("Results of GridSearchCV for four classifiers (SVM, Random Forest, kNN, Logistic Regression)")
print(table)

# Finally,
Save a copy in your Github. Remember renaming the notebook.