<a href="https://colab.research.google.com/github/WilliamYkZhang/COMP551_A2/blob/master/model_selection_linearSVC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import pandas as pd

# Transformers 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 

# Models 
from sklearn.svm import LinearSVC

# Utilities
import time

In [19]:
# Download stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read DataFrame
stemmed_df = pd.read_csv("https://raw.githubusercontent.com/WilliamYkZhang/COMP551_A2/master/preprocessed_reddit_train_SnowballStemmer.csv?token=AKKZG4GENVP2WXEXXHZVHSS5WRRRY")

# Separate X and Y 
X_train = stemmed_df["cleaned"]
y_train = stemmed_df["label"]
X_train.head(5)

In [0]:
def cross_validation(model, X, y, folds):
    pipeline_tfidf = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, stop_words=stopwords.words('english').append(["nt", "get", "like", "would","peopl", "one", "think", "time", "becaus"]), smooth_idf=True, norm="l2",lowercase=True, max_features=30000, use_idf=True, encoding = "utf-8",  decode_error = 'ignore', strip_accents='unicode',  analyzer = "word")),
        ('clf', model)],
         verbose=True)
    # Track CV time
    start = time.time()

    # Scores
    scores = cross_val_score(pipeline_tfidf, X, y, cv=folds, scoring='accuracy')

    return "Cross validation scores: {0}\nCross validation mean score: {1}\nValidation time: {2}s".format(scores, scores.mean(),time.time()-start) 

def grid_search_cv(model, X, y, params, folds):
    # Pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, stop_words=stopwords.words('english').append(["nt", "get", "like", "would","peopl", "one", "think", "time", "becaus"]), smooth_idf=True, norm="l2",lowercase=True, max_features=30000, use_idf=True, encoding = "utf-8",  decode_error = 'ignore', strip_accents='unicode',  analyzer = "word")),
        ('clf', model)],
         verbose=True)

    # Use GridSearch cross validation to find the best feature extraction and hyperparameters
    gs_CV = GridSearchCV(pipeline, param_grid=params, cv=folds)
    gs_CV.fit(X, y)
    print("Performing grid search...")
    print("Pipeline: ", [name for name, _ in pipeline.steps])
    print("Best parameter (CV score={0:.3f}):".format(gs_CV.best_score_))
    print("Best parameters set: {} \nBest estimator parameters {}.".format(gs_CV.best_params_, gs_CV.best_estimator_.get_params()))

    return (gs_CV.best_score_,gs_CV.best_params_, gs_CV.best_estimator_.get_params())

In [0]:
# Instantiate model
clf = LinearSVC(penalty="l2",loss="hinge",  multi_class="ovr", dual=True, fit_intercept=False, intercept_scaling=1.65, max_iter=2000, C=1.1, tol=0.00005)

"""
Results
0.5365000000000000 (loss="hinge")
0.5364857142857142 (loss="hinge", dual=True)
0.5364857142857142 (loss="hinge", dual=True, multi_class="ovr")
0.5365428571428572 (loss="hinge", dual=True, multi_class="ovr", fit_intercept=True)
0.5425000000000000 (loss="hinge", dual=True, multi_class="ovr", fit_intercept=False) Data is already centered by tfidf
0.5425285714285715 (loss="hinge",  multi_class="ovr", dual=True, fit_intercept=False, intercept_scaling=1.5)
0.5432285714285714 (loss="hinge",  multi_class="ovr", dual=True, fit_intercept=False, intercept_scaling=1.65, max_iter=2000, C=1.1)
0.5432571428571429 (loss="hinge",  multi_class="ovr", dual=True, fit_intercept=False, intercept_scaling=1.65, max_iter=2000, C=1.1, tol=0.00005)

0.5320857142857143 (loss="hinge", dual=True, multi_class="crammer_singer") Not used in practice, rarely leads to better accuracy and is more expensive to compute.
0.5286571428571428 (loss="square_hinge")

5 folds:
0.5622 BEST
"""

# Parameters
params = {
    'clf__tol':(0.0001, 0.0005, 0.00001),
} 

# Number of cross validation folds
folds=5

In [23]:
# Perform Grid Search CV to find the best parameters
best_scores, best_params, best_estimator_params = grid_search_cv(model=clf, X=X_train, y=y_train,params=params,folds=folds)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.2s


KeyboardInterrupt: ignored

In [101]:
# Validate model using cross validation
print(cross_validation(model=clf, X=X_train, y=y_train, folds=folds))

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.9s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   9.4s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.9s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   9.3s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.9s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   9.3s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.9s
[Pipeline] ............... (step 2 of 2) Processing clf, total=  10.1s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   2.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   9.1s
Cross validation scores: [0.56092857 0.56692857 0.56242857 0.55971429 0.561     ]
Cross validation mean score: 0.5622
Validation time: 59.681588888168335s
