# Week 4 - Grid Search Optimization
- Optimize LogReg, SVM, DT, KNN Models with GridSearch

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# read in data, use sentiment simple
# dataframe with just text and sentiment
GoEmo_df = pd.read_csv('GoEmotions_Sentiment_simple.csv')
# remove ambiguous rows
GoEmo_df = GoEmo_df[GoEmo_df['Sentiment'] < 3]

In [3]:
GoEmo_df.head()

Unnamed: 0,text,Sentiment
0,Fast as [NAME] will carry me. Seriously uptown...,0
1,You blew it. They played you like a fiddle.,1
2,TL;DR No more Superbowls for [NAME]. Get ready...,2
4,Emotes have a ridiculous amount of effort put ...,1
5,Just life.. I feel like i'm just a parasite th...,2


In [4]:
# store simplified data in X and y
X = GoEmo_df['text']
y = GoEmo_df['Sentiment']

# splitting our data into train and test sets
# stratify on sentiment
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# 1. Instantiate
bagofwords = CountVectorizer(stop_words="english",
                             min_df= 20)
# 2. Fit
bagofwords.fit(X_train)

# 3. Transform
X_train_transformed = bagofwords.transform(X_train)
X_test_transformed = bagofwords.transform(X_test) 
X_train_transformed

<38762x1713 sparse matrix of type '<class 'numpy.int64'>'
	with 154603 stored elements in Compressed Sparse Row format>

# Logistic Regression Grid Search

In [6]:
# pipeline for logistic regression test
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Create placeholders for all three steps
estimators = [('model', LogisticRegression())]

my_pipe = Pipeline(estimators)

In [14]:
param_grid = [{
        'model': [LogisticRegression(solver='saga')], 
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__penalty': ['l1', 'l2']
    }]

In [None]:
grid = GridSearchCV(my_pipe, param_grid, cv=5)

fittedgrid = grid.fit(X_train_transformed, y_train)

In [16]:
fittedgrid.best_params_

{'model': LogisticRegression(C=1, penalty='l1', solver='saga'),
 'model__C': 1,
 'model__penalty': 'l1'}

In [7]:
# Fitting a model
logreg = LogisticRegression(C=1, penalty='l1', solver='saga')
logreg.fit(X_train_transformed, y_train)

# Training and test score
print(f"Train score: {logreg.score(X_train_transformed, y_train)}")
print(f"Test score: {logreg.score(X_test_transformed, y_test)}")

Train score: 0.7134564779939115
Test score: 0.6837706059902484


# SVM Grid Search

In [8]:
from sklearn.svm import LinearSVC

# Create placeholders for all three steps
estimators = [('model', LinearSVC())]

my_pipe = Pipeline(estimators)

param_grid = [{
        'model': [LinearSVC()],
        'model__C': [0.001, 0.1, 1, 10, 100],
        'model__penalty': ['l1', 'l2']
            }]


In [None]:
grid = GridSearchCV(my_pipe, param_grid, cv=5)

fittedgrid = grid.fit(X_train_transformed, y_train)

In [15]:
fittedgrid.best_params_

{'model': LinearSVC(C=0.1), 'model__C': 0.1, 'model__penalty': 'l2'}

In [9]:
SVM_model = LinearSVC(C=0.1, penalty='l2')
SVM_model.fit(X_train_transformed, y_train)

print(f"The TRAIN classification accuracy is:  {SVM_model.score(X_train_transformed,y_train)}")
print(f"The TEST classification accuracy is:  {SVM_model.score(X_test_transformed,y_test)}")

The TRAIN classification accuracy is:  0.7105154532789846
The TEST classification accuracy is:  0.6840801795526662


# Decision Tree Grid Search

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Create placeholders for all three steps
estimators = [('model', DecisionTreeClassifier())]

my_pipe = Pipeline(estimators)

param_grid = [{
        'model': [DecisionTreeClassifier()], 
        'model__criterion': ['entropy', 'gini'],
        'model__max_depth': [10,20,30,40,50],
        'model__min_samples_leaf': [10, 20, 30,40,50]
    }]

In [17]:
grid = GridSearchCV(my_pipe, param_grid, cv=5)

fittedgrid = grid.fit(X_train_transformed, y_train)

In [18]:
fittedgrid.best_params_

{'model': DecisionTreeClassifier(criterion='entropy', max_depth=50, min_samples_leaf=20),
 'model__criterion': 'entropy',
 'model__max_depth': 50,
 'model__min_samples_leaf': 20}

In [11]:
# Decision tree
#Fit to the training data
DT_model = DecisionTreeClassifier(max_depth=30, min_samples_leaf=30)
DT_model.fit(X_train_transformed, y_train)

print(f"The TRAIN classification accuracy is:  {DT_model.score(X_train_transformed,y_train)}")
print(f"The TEST classification accuracy is:  {DT_model.score(X_test_transformed,y_test)}")

The TRAIN classification accuracy is:  0.6190341055673082
The TEST classification accuracy is:  0.6182957975388902


# KNN Grid Search

In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Create placeholders for all three steps
estimators = [('model', KNeighborsClassifier())]

my_pipe = Pipeline(estimators)

param_grid = [{
        'model': [KNeighborsClassifier()], 
        'model__n_neighbors': [2,4,6,8,10,15,20],
        'model__p': [1,2]
    }]

In [21]:
grid = GridSearchCV(my_pipe, param_grid, cv=5)

fittedgrid = grid.fit(X_train_transformed, y_train)

In [22]:
fittedgrid.best_params_

{'model': KNeighborsClassifier(n_neighbors=10, p=1),
 'model__n_neighbors': 10,
 'model__p': 1}

In [13]:
# Instantiate the model & fit it to our data
KNN_model = KNeighborsClassifier(n_neighbors=10, p=1)
KNN_model.fit(X_train_transformed, y_train)

print(f"The TRAIN classification accuracy is:  {KNN_model.score(X_train_transformed,y_train)}")
print(f"The TEST classification accuracy is:  {KNN_model.score(X_test_transformed,y_test)}")

The TRAIN classification accuracy is:  0.6588153346060575
The TEST classification accuracy is:  0.609859917963006


# Grid Search Summary

Logistic Regression - (C=1, penalty='l1', solver='saga') <br>
SVM - (C=0.1, penalty='l2') <br>
Decision Tree - (Criterion='entropy', max_depth=50, min_samples_leaf=20) <br>
KNN - (n_neighbors=10, p=1) <br>


Update predictions of reddit comments with optimized models
- these codes are mainly copied from week3

In [14]:
# read in scraped data
reddit = pd.read_csv('reddit_comments.csv')

# rename 0 column to 'text'
reddit.rename(columns = {'0':'text'}, inplace = True)

# save text column to 1 variables
reddit_text = reddit['text']

In [16]:
# fit reddit comment to bag-of-words model
reddit_transformed =  bagofwords.transform(reddit_text)
reddit_transformed

<192x1713 sparse matrix of type '<class 'numpy.int64'>'
	with 883 stored elements in Compressed Sparse Row format>

In [17]:
# predict logistic regression model
reddit_log_pred = logreg.predict(reddit_transformed)

# predict SVM model
SVM_pred = SVM_model.predict(reddit_transformed)

# predict DT model
DT_pred = DT_model.predict(reddit_transformed)

# predict KNN model
KNN_pred = KNN_model.predict(reddit_transformed)

In [18]:
reddit_log_pred

array([0, 1, 1, 1, 0, 1, 2, 2, 1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 0, 0, 1, 2, 2, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 0, 1, 2,
       0, 1, 1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1,
       2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 1, 2, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 2, 2, 1, 0, 2, 1, 0,
       1, 1, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2, 0, 2, 1, 2, 1, 1, 0, 2, 1, 2,
       1, 1, 2, 0, 1, 1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 0, 0, 1, 1, 2, 2,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 2, 0, 1, 0, 2, 1, 2], dtype=int64)

In [19]:
# save new predictions to new column
reddit['Logistic_Regression'] = reddit_log_pred
reddit['SVM'] = SVM_pred
reddit['Decision_Tree'] = DT_pred
reddit['KNN'] = KNN_pred

In [20]:
# check
reddit

Unnamed: 0,text,Logistic_Regression,SVM,Decision_Tree,KNN
0,"Chill everyone, the Newspost says that 7.33 wi...",0,0,0,1
1,At least CM (Crystal Maiden) not touched.,1,1,1,1
2,I wanna work with this devs since the work two...,1,1,1,1
3,7.32e hahahahahah,1,1,1,1
4,# 7.32e Summary\n\n* New hero added to the gam...,0,0,0,1
...,...,...,...,...,...
187,4.2 GB for minor Lina nerf. \n\n\nI'm done wi...,1,1,1,1
188,This is bad lol. Barely any changes other than...,0,0,0,0
189,"Hahahaha we overhyped the patch, it's literall...",2,2,2,2
190,No 7.33,1,1,1,1


In [21]:
# truncatedSVD for sparse matrix dimentionality reduction
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=3)

svd.fit(X_train_transformed)

reddit_pca = svd.transform(reddit_transformed)

In [22]:
# save PCAs to dataframe
reddit_pca_df = pd.DataFrame(reddit_pca)

In [23]:
# save PCA values to original dataframe
reddit['PCA1'] = reddit_pca_df[0]
reddit['PCA2'] = reddit_pca_df[1]
reddit['PCA3'] = reddit_pca_df[2]

In [24]:
reddit

Unnamed: 0,text,Logistic_Regression,SVM,Decision_Tree,KNN,PCA1,PCA2,PCA3
0,"Chill everyone, the Newspost says that 7.33 wi...",0,0,0,1,0.086364,-0.005339,0.057743
1,At least CM (Crystal Maiden) not touched.,1,1,1,1,0.000000,0.000000,0.000000
2,I wanna work with this devs since the work two...,1,1,1,1,0.093037,-0.012519,0.075822
3,7.32e hahahahahah,1,1,1,1,0.000000,0.000000,0.000000
4,# 7.32e Summary\n\n* New hero added to the gam...,0,0,0,1,0.070274,0.003729,0.049699
...,...,...,...,...,...,...,...,...
187,4.2 GB for minor Lina nerf. \n\n\nI'm done wi...,1,1,1,1,0.000000,0.000000,0.000000
188,This is bad lol. Barely any changes other than...,0,0,0,0,0.465283,-0.087559,0.630998
189,"Hahahaha we overhyped the patch, it's literall...",2,2,2,2,0.029562,-0.004852,0.014313
190,No 7.33,1,1,1,1,0.000000,0.000000,0.000000


In [25]:
reddit.to_csv('dota_comments_optimized.csv', index = False)