# SVM

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import glob
import os

import matplotlib.pyplot as plt # graphs
import matplotlib.colors as colors

import joblib
import datetime

from collections import defaultdict

import sklearn
#from sklearn.utils import resample # downsample dataset
from sklearn.model_selection import train_test_split # split to training and testing datasets
from sklearn.model_selection import GridSearchCV # cross validation
#from sklearn.preprocessing import scale # scale and center data
from sklearn.svm import SVC # support vector classifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

### Import data

We work with 1956 comments from 5 different YouTube videos. The [YouTube Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection#) is freely available.

In [None]:
# Load every CSV file of the YouTube Spam Collection into one DataFrame.
path = r"data/YouTube-Spam-Collection/"
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order (and therefore any seeded split of the rows) reproducible.
files = sorted(glob.glob(os.path.join(path, "*.csv")))

# ignore_index=True replaces the per-file indices with one continuous 0..n-1 index.
corpus = pd.concat((pd.read_csv(file) for file in files), ignore_index=True)
corpus.head()

In [None]:
# Sanity check: the complete collection is expected to hold 1956 comments.
assert corpus.shape[0] == 1956

In [None]:
# The nltk.download calls below are disabled by wrapping them in a string
# literal, so nothing is downloaded when this cell runs.
# Presumably they have to be executed once per machine to fetch the NLTK
# resources (punkt, stopwords, omw-1.4, wordnet) -- TODO confirm.
"""nltk.download("punkt")
nltk.download("stopwords")
nltk.download('omw-1.4')
nltk.download("wordnet")"""

### Statistics about the data

In [None]:
# Textual summary: column dtypes, the distinct CLASS values, and how many
# comments are spam (CLASS == 1) versus legitimate (CLASS == 0).
print(f"Data types:\n{corpus.dtypes}\n")
print(f"There are {len(corpus['CLASS'].unique())} comment types: {corpus['CLASS'].unique()}")
print(f"The dataset consists of {len(corpus)} examples: {len(corpus.loc[corpus['CLASS'] == 1])} spam and {len(corpus.loc[corpus['CLASS'] == 0])} legitimate comments")

# Bar chart of the number of comments per class.
corpus.groupby("CLASS").CONTENT.count().plot.bar(ylim=0)
# Replace the numeric class ticks with readable names.
plt.xticks([0,1],['legitimate', 'spam'], rotation=0)
plt.xlabel("comment type")
plt.ylabel("number of comments")
plt.title("Dataset distribution", pad=20)
plt.show()

### Preprocessing

In [None]:
#nltk.download('stopwords')
#nltk.download('omw-1.4')
    
def preprocess_data(corpus,
                    irrelevant_features=("COMMENT_ID", "AUTHOR", "DATE"),
                   ):
    """Clean the comment corpus in place and add a normalised ``REPR`` column.

    The DataFrame is modified in place:

    1. the columns named in ``irrelevant_features`` are dropped,
    2. rows that still contain missing values are removed and the index is
       renumbered,
    3. a ``REPR`` column is added holding the lower-cased ``CONTENT`` text
       after tokenising, lemmatising and stop-word removal, with the
       remaining tokens joined by single spaces.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must contain a ``CONTENT`` column and every column listed in
        ``irrelevant_features``.
    irrelevant_features : str or iterable of str
        Column name(s) to drop before any further processing.

    Returns
    -------
    None
        The result is written back into ``corpus``.

    Raises
    ------
    KeyError
        If a column in ``irrelevant_features`` is not present (raised by
        ``DataFrame.drop``).
    """
    # DataFrame.drop treats a tuple as ONE (MultiIndex) label, so always hand
    # it a list; a bare string is kept as a single column name.
    if isinstance(irrelevant_features, str):
        columns_to_drop = [irrelevant_features]
    else:
        columns_to_drop = list(irrelevant_features)

    # drop irrelevant features
    corpus.drop(columns_to_drop, inplace=True, axis=1)

    # remove blank rows if any; without inplace=True the result would be
    # thrown away and the missing values would reach the tokenizer
    corpus.dropna(inplace=True)
    # keep a contiguous 0..n-1 index so label-based and positional lookups agree
    corpus.reset_index(drop=True, inplace=True)

    lemmatizer = WordNetLemmatizer()
    # a set gives constant-time membership tests for the per-token filter
    stop_words = set(stopwords.words("english"))

    def _normalise(comment):
        """Tokenise, lemmatise and strip stop words from one lower-cased comment."""
        tokens = nltk.word_tokenize(comment)
        lemmas = (lemmatizer.lemmatize(token) for token in tokens)
        return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

    # Assign the processed text back to the column: rebinding a loop variable
    # while iterating over the Series would leave the column untouched.
    corpus["REPR"] = corpus["CONTENT"].str.lower().apply(_normalise)

In [None]:
# Clean the corpus in place; this adds the "REPR" column that every
# vectorizer below is fitted on.
preprocess_data(corpus)

# binary feature representation (term present / absent)
vectorizer = CountVectorizer(binary=True, max_df=0.95)
BOW = vectorizer.fit_transform(corpus["REPR"])

# count based feature representation (term frequencies)
vectorizer_2 = CountVectorizer(binary=False, max_df=0.95)
BOW_2 = vectorizer_2.fit_transform(corpus["REPR"])

# bag of word 2-grams
# The documents are plain strings, so an identity tokenizer would split them
# into single characters and produce character bigrams; the default word
# tokenizer is used instead. ngram_range is given as a tuple, as scikit-learn's
# parameter validation expects.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
BOW_3 = bigram_vectorizer.fit_transform(corpus["REPR"])

# TF-IDF (fit_transform learns vocabulary/idf and transforms in one pass)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, use_idf=True, stop_words='english')
corpus_tfidf = tfidf_vectorizer.fit_transform(corpus["REPR"])

# Show the first comment next to its TF-IDF vector. .iloc[0] is positional,
# matching the positional row 0 taken from the sparse matrix.
print("Formula String: ",corpus["REPR"].iloc[0])
embedding = corpus_tfidf[0].toarray()
print(len(embedding[0]))
print("Vector Representation: ",embedding)

### Support Vector Machine Classifier

### Train an SVM, save the model and report the classification performance

In [None]:
def split_data(features, labels):
    """Split features and labels into shuffled train and test partitions.

    30% of the samples are held out for testing and the random seed is fixed
    to 42. Returns whatever ``train_test_split`` returns for the two inputs,
    i.e. the train/test parts of ``features`` followed by those of ``labels``.
    """
    split_options = {"test_size": 0.3, "random_state": 42, "shuffle": True}
    return train_test_split(features, labels, **split_options)


def save_model(model):
    """Persist ``model`` with joblib under ``saved_models/`` and return the path.

    The file is named ``<lower-cased class name>_<YYYYmmdd-HHMMSS>.joblib``.
    A full timestamp is used because a minute-second stamp alone repeats every
    hour, so an unrelated earlier model could be overwritten without notice.

    Parameters
    ----------
    model : object
        Any object joblib can serialise (here: a fitted estimator).

    Returns
    -------
    str
        Path of the file that was written.
    """
    output_dir = "saved_models"
    # create the target directory on first use instead of failing
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = model.__class__.__name__.lower()
    model_output_path = os.path.join(output_dir, f"{model_name}_{timestamp}.joblib")

    # Passing the path lets joblib open AND close the file itself; handing it
    # an open() handle that is never closed leaks the file descriptor.
    joblib.dump(model, model_output_path)
    return model_output_path


def report(model, best_model, X_test, y_test, labels):
    """Print and plot the test-set performance of a tuned classifier.

    ``model`` is the fitted search object (its ``best_params_`` and
    ``best_score_`` are printed); ``best_model`` is the estimator used to
    predict ``X_test``. ``labels`` may contain duplicates: its sorted distinct
    values define the row/column order of the confusion matrix.
    """
    print(f"Best parameters set {model.best_params_} with accuracy {model.best_score_}")
    y_predict = best_model.predict(X_test)

    # distinct class labels in ascending order
    class_labels = sorted(set(labels))

    print("\nConfusion matrix:")
    matrix = confusion_matrix(y_test, y_predict, labels=class_labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels).plot()
    plt.show()

    print("\nClassification report:")
    print(classification_report(y_test, y_predict))

    # headline metrics, printed as percentages
    headline_metrics = (
        ("\nAccuracy", accuracy_score),
        ("Precision", sklearn.metrics.precision_score),
        ("Recall", sklearn.metrics.recall_score),
    )
    for title, metric in headline_metrics:
        print(f"{title}: {metric(y_test, y_predict)*100}%")

    
def train_svm_classifier(features, labels):
    """Tune an SVC with a 5-fold grid search, save the winner and return it.

    The data is split with ``split_data``; only the training part is used for
    the search over ``C``, ``gamma`` and ``kernel``. The best estimator is
    written to disk through ``save_model`` before it is returned.
    """
    # hold out a test partition; the search below only sees the training part
    X_train, X_test, y_train, y_test = split_data(features, labels)

    search_space = dict(
        C=[0.1, 1, 10, 100, 1000],
        gamma=[1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
        kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    )
    grid_search = GridSearchCV(SVC(), search_space, cv=5, n_jobs=2, verbose=0)
    grid_search.fit(X_train, y_train)

    best_svm = grid_search.best_estimator_

    # save model to disk
    save_model(best_svm)

    # evaluation on the held-out partition is currently switched off:
    #report(grid_search, best_svm, X_test, y_test, labels)

    return best_svm

In [None]:
#model = train_svm_classifier(BOW, np.asarray(corpus["CLASS"]))

In [None]:
#model = train_svm_classifier(BOW_2, np.asarray(corpus["CLASS"]))

In [None]:
#model = train_svm_classifier(BOW_3, np.asarray(corpus["CLASS"]))

In [None]:
# Train on the TF-IDF features; the CLASS column supplies the labels.
model = train_svm_classifier(
    features=corpus_tfidf,
    labels=np.asarray(corpus["CLASS"]),
)

In [None]:

def preprocess_single_comment(comment):
    """Normalise one raw comment string and return it as a single string.

    The text is lower-cased, tokenised and lemmatised; lemmas that appear in
    the English stop-word list are dropped. The surviving tokens, their count
    and the joined string are printed before the joined string is returned.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stop_words = stopwords.words("english")

    tokens = nltk.word_tokenize(comment.lower())  # tokenizing

    kept_tokens = []
    for token in tokens:
        lemma = wordnet_lemmatizer.lemmatize(token)  # lemmatizing
        if lemma not in english_stop_words:  # removing stopwords
            kept_tokens.append(lemma)

    cleaned_comment = " ".join(kept_tokens)
    print(kept_tokens)
    print(len(kept_tokens))
    print(cleaned_comment)
    return cleaned_comment

def get_comment_embedding(comment):
    """Return the TF-IDF representation of a single raw comment.

    The comment is first cleaned with ``preprocess_single_comment`` and then
    transformed by the module-level ``tfidf_vectorizer``. The type of the
    result is printed before it is returned.
    """
    cleaned_comment = preprocess_single_comment(comment)
    # transform expects an iterable of documents, hence the one-element list
    comment_vector = tfidf_vectorizer.transform([cleaned_comment])
    print("to be returned ", type(comment_vector))
    return comment_vector

# Two hand-written comments used to try out the trained classifier.
comment_str = "4:20 one of a very diverse set of clips, but don't know what it is. Is it a walking plant?  Thanks Stern. And another one at 20:06!"
comment_str2 = "Go to my page"

# Inspect the fitted TF-IDF vocabulary and the parameters of the chosen model.
voc_tfidf = tfidf_vectorizer.get_feature_names_out()
print(voc_tfidf)
print(len(voc_tfidf))
print(model.get_params())
print(type(model))

# Embed and classify each comment in turn, then show the predicted labels.
prediction_label = model.predict(
    get_comment_embedding(comment_str)
)
prediction_label2 = model.predict(
    get_comment_embedding(comment_str2)
)
print(prediction_label, len(prediction_label))
print(prediction_label2, len(prediction_label2))
