### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #4

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from scipy.stats import uniform

In [2]:
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

Xtrain = train.data[:3000]
ytrain = train.target[:3000]
Xtest = test.data[:500]
ytest = test.target[:500]

print("X:", len(Xtest))
print("y:", len(ytest))

X: 500
y: 500


### Naive Bayes [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [3]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB(alpha=0.1)

### Random Search Cross-Validation [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

### Uniform distribution in `Scipy` [(Documentation)](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

pipeline = Pipeline([('count', CountVectorizer(stop_words='english')),
                     ('nb', MultinomialNB())])

parameter = {'nb__alpha': uniform(loc=0.1, scale=10) #สุ่มค่าโดยใช้ library สุ่มจากค่าต่ำสุด
             }

clf = RandomizedSearchCV(pipeline, parameter, n_iter=10, cv=3, random_state=42) #n_iter=เทรนกี่ครั้ง, cv= k-fold, random_state= magic number
clf.fit(Xtrain, ytrain)

In [5]:
ypred = clf.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.64      0.67      0.65        21
           1       0.75      0.43      0.55        21
           2       1.00      0.04      0.07        26
           3       0.48      0.71      0.57        34
           4       0.87      0.79      0.83        34
           5       0.64      0.81      0.71        26
           6       0.94      0.77      0.85        22
           7       0.74      1.00      0.85        28
           8       0.96      0.76      0.85        33
           9       0.88      0.84      0.86        25
          10       0.87      1.00      0.93        27
          11       0.69      1.00      0.82        20
          12       0.61      0.58      0.60        24
          13       0.87      0.87      0.87        23
          14       0.81      0.93      0.87        28
          15       0.77      0.93      0.84        29
          16       0.54      0.95      0.69        21
          17       0.85    

#### Exercise

1. For the Naive Bayes model, use grid search 5-fold cross-validation across different values of `alpha` to find the best model.

2. For the best value of `alpha`, compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* What is the model's `f1_macro` score?

3. Repeat Exercise 1 and 2 for **random search** 5-fold cross validation across different values of `alpha`. Compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* Did you get a better `f1_macro` score compared to grid search in Exercise 2?

In [6]:
# 1. For the Naive Bayes model, use grid search 5-fold cross-validation across different values of alpha to find the best model.
from sklearn.model_selection import GridSearchCV
parameter = {'nb__alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

clf = GridSearchCV(pipeline, parameter, cv=5)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.79      0.71      0.75        21
           1       0.48      0.48      0.48        21
           2       1.00      0.04      0.07        26
           3       0.52      0.68      0.59        34
           4       0.78      0.82      0.80        34
           5       0.76      0.73      0.75        26
           6       0.69      0.82      0.75        22
           7       0.80      1.00      0.89        28
           8       0.93      0.85      0.89        33
           9       0.91      0.84      0.88        25
          10       0.93      0.96      0.95        27
          11       0.78      0.90      0.84        20
          12       0.56      0.62      0.59        24
          13       0.86      0.78      0.82        23
          14       0.81      0.93      0.87        28
          15       0.84      0.90      0.87        29
          16       0.57      0.95      0.71        21
          17       0.94    

In [7]:
# 2. For the best value of alpha, compute the f1_macro score on the test set.
from sklearn.metrics import f1_score

f1 = f1_score(ytest, ypred, average='macro')

# แสดงผลลัพธ์
print("best alpha:", clf.best_params_['nb__alpha'])
print(f"f1_macro score: {f1:.4f}")

# What value of alpha did you obtain?
# = 0.1

# What is the model's f1_macro score?
# = 0.7188

best alpha: 0.1
f1_macro score: 0.7188


In [8]:
# 3. Repeat Exercise 1 and 2 for random search 5-fold cross validation across different values of alpha. Compute the f1_macro score on the test set.
parameter = {'nb__alpha': uniform(loc=0.1, scale=20)}
clf = RandomizedSearchCV(pipeline, parameter, n_iter=10, cv=5)
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

# แสดงผลลัพธ์
f1 = f1_score(ytest, ypred, average='macro')
print("best alpha:", clf.best_params_['nb__alpha'])
print(f"f1_macro score: {f1:.4f}")

# What value of alpha did you obtain?
# = 0.2612595202448963

# Did you get a better f1_macro score compared to grid search in Exercise 2?
# = No becasue f1 macro in exercise 2 is better.

best alpha: 0.2612595202448963
f1_macro score: 0.7150
