# Week8 - Homework KNN-NB-SVM

- Use GridSearchCV on X_train dataset
    - KNN, NB, SVM, Logistic Regression, Decision Trees
- Test on X_test dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [2]:
# Training split: the review text and the sentiment labels are published as
# two separate CSV files, so they are loaded side by side.
X_train, y_train = (
    pd.read_csv('https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/X_train.csv'),
    pd.read_csv('https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/y_train.csv'),
)

In [3]:
# Held-out evaluation split (the "final" files); it is only used to score the
# tuned models, never to fit them.
X_test, y_test = (
    pd.read_csv('https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/X_final.csv'),
    pd.read_csv('https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/y_final.csv'),
)

In [4]:
# Peek at the first five feature rows to confirm the column layout.
X_train.head(n=5)

Unnamed: 0,review
0,"Shame, is a Swedish film in Swedish with Engli..."
1,I know it's rather unfair to comment on a movi...
2,"""Bread"" very sharply skewers the conventions o..."
3,After reading tons of good reviews about this ...
4,During the Civil war a wounded union soldier h...


In [5]:
# Peek at the first five label rows to confirm the column layout.
y_train.head(n=5)

Unnamed: 0,sentiment
0,1
1,0
2,1
3,1
4,1


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Restrict the search to the first 10,000 labelled reviews.
X_subset = X_train.iloc[:10000]['review']
y_subset = y_train.iloc[:10000]['sentiment']

# Two-stage text classifier: TF-IDF features (English stop words removed)
# feeding a liblinear logistic regression.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Search space; each key is "<pipeline step name>__<parameter name>".
parameters = {
    'tfidf__max_df': (0.5, 0.75),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'clf__C': (0.01, 1, 10),
}

# Exhaustive 5-fold cross-validated search over the grid above.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_subset, y_subset)

# Report the winning CV score, then the chosen value of every searched
# parameter in alphabetical order.
print(f"Best score: {grid_search.best_score_:.3f}")
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f"\t{param_name}: {best_parameters[param_name]!r}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.881
Best parameters set:
	clf__C: 10
	tfidf__max_df: 0.5
	tfidf__ngram_range: (1, 2)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Fixed seed for the decision tree. Without it the tree can pick different
# splits on different runs, so its CV and test scores would not be
# reproducible under Restart & Run All.
RANDOM_STATE = 42

# One TF-IDF -> classifier pipeline per model family. Every pipeline uses the
# same step names ('tfidf', 'clf') so each grid below can address its
# classifier's hyper-parameters as 'clf__<param>'.
pipelines = {
    'KNN': Pipeline([('tfidf', TfidfVectorizer()), ('clf', KNeighborsClassifier())]),
    'NB': Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())]),
    'SVM': Pipeline([('tfidf', TfidfVectorizer()), ('clf', SVC())]),
    'LogisticRegression': Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression(solver='liblinear'))]),
    'DecisionTrees': Pipeline([('tfidf', TfidfVectorizer()), ('clf', DecisionTreeClassifier(random_state=RANDOM_STATE))]),
}

# Hyper-parameter grid for each model, keyed by the same names as `pipelines`.
parameters = {
    'KNN': {'clf__n_neighbors': (3, 5, 7)},
    # Empty grid: GridSearchCV cross-validates the single default NB
    # configuration, so NB still gets a comparable CV score.
    'NB': {},
    'SVM': {'clf__C': (0.01, 1, 10)},
    'LogisticRegression': {'clf__C': (0.01, 1, 10)},
    'DecisionTrees': {'clf__max_depth': (3, 5, 7)},
}

# Tune on the first 1,000 reviews only, to keep the five searches quick.
X_subset = X_train['review'].iloc[:1000]
y_subset = y_train['sentiment'].iloc[:1000]

# Best refitted pipeline per model name, filled in by the loop below.
best_models = {}

for model_name, pipeline in pipelines.items():
    # 5-fold cross-validated search over this model's grid.
    grid_search = GridSearchCV(pipeline, parameters[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_subset, y_subset)

    # Keep the winning pipeline so it can be scored on the test split later.
    best_models[model_name] = grid_search.best_estimator_

    print(f"{model_name} - Best Score: {grid_search.best_score_:.3f}")
    print(f"{model_name} - Best Parameters: {grid_search.best_params_}")

# Score every tuned model on the full held-out test split. Despite the
# "_subset" suffix, these are the complete test columns, not a slice.
X_test_subset = X_test['review']
y_test_subset = y_test['sentiment']

for model_name, model in best_models.items():
    predictions = model.predict(X_test_subset)
    print(f"\n{model_name} - Test Set Performance:")
    print("Accuracy:", accuracy_score(y_test_subset, predictions))
    print("Classification Report:\n", classification_report(y_test_subset, predictions))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
KNN - Best Score: 0.684
KNN - Best Parameters: {'clf__n_neighbors': 7}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
NB - Best Score: 0.809
NB - Best Parameters: {}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
SVM - Best Score: 0.830
SVM - Best Parameters: {'clf__C': 10}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
LogisticRegression - Best Score: 0.828
LogisticRegression - Best Parameters: {'clf__C': 10}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
DecisionTrees - Best Score: 0.653
DecisionTrees - Best Parameters: {'clf__max_depth': 7}

KNN - Test Set Performance:
Accuracy: 0.662
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.53      0.61      5000
           1       0.63      0.79      0.70      5000

    accuracy                           0.66     10000
   macro avg       0.67      0.66      0.66     10000

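**Conclusion:** Logistic Regression has the best scores. In 5-fold cross-validation on the 1,000-review subset it is essentially tied with SVM (0.828 vs. 0.830), and both are ahead of NB (0.809), KNN (0.684), and Decision Trees (0.653).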