# SVM Model
An SVM model for emoji prediction.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import gensim

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB

import scipy
from sklearn.metrics import accuracy_score
import math

from pandas import DataFrame
!pip install openpyxl

In [2]:
# Load the train / test / validation splits from CSV.
# Each dataframe is expected to carry the columns 'text', 'gold_label' and 'date'.

dirpath = ''

# Files are read in the order train, test, validation.
df_train, df_test, df_val = (
    pd.read_csv(dirpath + file_name)
    for file_name in ('emoji-train.csv', 'emoji-test.csv', 'emoji-validation.csv')
)

# Split every dataframe into its text, label and date columns.
train_X, train_y, train_date = df_train['text'], df_train['gold_label'], df_train['date']

val_X, val_gold_y, val_date = df_val['text'], df_val['gold_label'], df_val['date']

test_X, test_gold_y, test_date = df_test['text'], df_test['gold_label'], df_test['date']

In [3]:
# Use the pre-trained google news w2vec dataset
# The model is looked up by name through gensim's downloader module and kept in
# `defaultLoad`, which Word2Vectorizer uses when no other embedding is supplied.
# NOTE(review): presumably the first call downloads the model — confirm against the gensim docs.
import gensim.downloader as api

defaultLoad = api.load("word2vec-google-news-300")

## Vectorizer classes ##

In [4]:
# Word2Vectorizer class
# The class mimics TfidfVectorizer's structure, so that it can be used
# as a generic vectorizer in the pipeline
class Word2Vectorizer():
    """Embed each document as the mean of its words' word2vec vectors.

    Only pre-trained embeddings are used, so ``fit`` is a no-op.
    """

    # Constructor
    # Parameters: w2vec model (optional, default is w2vec trained on google news dataset), number of features, w2vec_url (optional)
    def __init__(self, w2vec = None, w2vec_NUM_FEATURES = 300, w2vec_url = None):
        """Select the embedding to use.

        Parameters:
            w2vec: keyed-vectors object exposing ``key_to_index`` and ``[]``
                lookup. When omitted, the module-level ``defaultLoad`` is used.
            w2vec_NUM_FEATURES: dimensionality of the embedding vectors.
            w2vec_url: optional model name; when given it is loaded through
                ``api.load`` and takes precedence over ``w2vec``.
        """
        if w2vec_url is not None:
            self.w2vec = api.load(w2vec_url)
        elif w2vec is not None:
            self.w2vec = w2vec
        else:
            # Resolved here rather than in the signature so the default is
            # looked up when an instance is created, not when the class is defined.
            self.w2vec = defaultLoad
        self.w2vec_NUM_FEATURES = w2vec_NUM_FEATURES

    # No need to fit since only pretrained w2vec models are used
    def fit(self, X, y = None):
        """Do nothing and return ``self`` (the embedding is pre-trained)."""
        return self

    # transform function
    # Parameters: corpus (iterable of documents)
    # Given a document, obtain a vector by getting the mean of all of its words' w2vec vectors
    def transform(self, X, y = None):
        """Return an array with one averaged embedding row per document.

        String documents are split on whitespace; any other iterable is
        treated as an already tokenised document. Words are lower-cased before
        lookup, and words missing from the embedding are skipped. A document
        with no known word maps to the zero vector.
        """
        ans = []
        for document in X:
            # Iterating a str directly would yield single characters, not words.
            tokens = document.split() if isinstance(document, str) else document
            total = np.zeros(self.w2vec_NUM_FEATURES)
            num_known = 0
            for word in tokens:
                word_cpy = word.lower()
                if word_cpy in self.w2vec.key_to_index:
                    total = total + self.w2vec[word_cpy]
                    num_known += 1
            # Guard the division: 0 / 0 would put NaNs into the feature matrix.
            ans.append(np.asarray(total / num_known if num_known else total))
        return np.asarray(ans)

In [5]:
# Class CharacterBagofngramsVectorizer
# This implements a bag-of-character-n-grams vectorizer for a document
# This is done by introducing spaces between each character in the document,
# So that each character is separated and can be counted as its own token
# The structure is similar to Word2Vectorizer
class CharacterBagofngramsVectorizer():
    """Count character n-grams by feeding space-separated characters to CountVectorizer."""

    # Constructor
    # Parameters: ngram_range (tuple of two numbers)
    def __init__(self, ngram_range):
        """Create the underlying count vectorizer.

        Parameters:
            ngram_range: (min_n, max_n) range of character n-gram sizes.
        """
        # token_pattern must accept one-character tokens: CountVectorizer's
        # default pattern only keeps tokens of two or more characters, which
        # would discard every spaced-out character and leave all counts at zero.
        # No fixed vocabulary is passed, so n-grams longer than one character
        # are learned during fit and max_features takes effect.
        self.vectoriser = CountVectorizer(lowercase = False, ngram_range = ngram_range, stop_words = None, min_df = 1, token_pattern = r"(?u)\S", max_features = 2500)

    # Helper function for introducing spaces between characters
    # Parameters: corpus X
    def space_doc(self, X):
        """Return a list with every document's characters joined by single spaces."""
        # Iterating X directly (instead of X[index]) also works for a pandas
        # Series whose index is not 0..n-1.
        return [' '.join(document) for document in X]

    # Function fit that trains the count vectorizer
    def fit(self, X, y = None):
        """Learn the character n-gram vocabulary from corpus X and return ``self``."""
        self.vectoriser.fit(self.space_doc(X))
        return self

    # Function transform that transforms the corpus into a set of vectors
    def transform(self, X, y = None):
        """Return the character n-gram count matrix for corpus X."""
        return self.vectoriser.transform(self.space_doc(X))


In [6]:
# Helper functions to construct vectorizers / classifiers
def getTfidfVectorizer(ngram_range_param):
    """Build the word-level TF-IDF vectorizer for the given n-gram range."""
    # max_features was used because of a notable jump in accuracy when using it
    tfidf_settings = dict(
        lowercase = True,
        stop_words = stopwords.words('english'),
        ngram_range = ngram_range_param,
        min_df = 1,
        max_features = 2500,
    )
    return TfidfVectorizer(**tfidf_settings)


def getW2vecVectorizer():
    """Build a Word2Vectorizer with its default settings."""
    w2vec_vectorizer = Word2Vectorizer()
    return w2vec_vectorizer


def getBagOfngramsVectorizer(ngram_range_param):
    """Build the word-level bag-of-n-grams (count) vectorizer for the given n-gram range."""
    count_settings = dict(
        lowercase = True,
        ngram_range = ngram_range_param,
        min_df = 1,
        max_features = 2500,
    )
    return CountVectorizer(**count_settings)


def getCharacterBagOfngramsVectorizer(char_ngram_range_param):
    """Build the character-level bag-of-n-grams vectorizer for the given n-gram range."""
    char_vectorizer = CharacterBagofngramsVectorizer(ngram_range = char_ngram_range_param)
    return char_vectorizer


def getSGDC(C = 0.1, max_iter = None):
    """Build an SGD-trained linear classifier with squared hinge loss.

    Parameters:
        C: SVM-style inverse regularisation strength.
        max_iter: optional iteration cap; the SGDClassifier default is used when None.

    The SGD objective averages the loss over the samples and weights the
    penalty by ``alpha``, whereas the SVM objective sums the loss weighted by
    ``C``. Matching the two gives ``alpha = 1 / (C * n_samples)``; the number
    of samples is taken from the module-level ``train_y``.
    """
    sgd_options = dict(
        alpha = 1.0 / (C * len(train_y)),
        loss = 'squared_hinge',
        n_iter_no_change = 100,
        n_jobs = -1,
    )
    if max_iter is not None:
        sgd_options['max_iter'] = max_iter
    return SGDClassifier(**sgd_options)

def getLinearSVMC(C = 0.1, max_iter = None):
    """Build a one-vs-rest linear SVM; max_iter is forwarded only when given."""
    svc_options = {'C': C}
    if max_iter is not None:
        svc_options['max_iter'] = max_iter
    return OneVsRestClassifier(LinearSVC(**svc_options), n_jobs = -1)

def getPolySVMC(C = 0.1, max_iter = None, degree = 3, gamma = 'scale', coef0 = 0):
    """Build a one-vs-rest bagged ensemble of polynomial-kernel SVMs.

    Each of the 10 bagged estimators is fitted with max_samples = 1 / 10;
    max_iter is forwarded to SVC only when given.
    """
    svc_options = dict(kernel = 'poly', C = C, degree = degree, gamma = gamma, coef0 = coef0, class_weight = 'balanced')
    if max_iter is not None:
        svc_options['max_iter'] = max_iter
    bagged_svc = BaggingClassifier(SVC(**svc_options), n_estimators = 10, max_samples = 1 / 10)
    return OneVsRestClassifier(bagged_svc, n_jobs = -1)

def getRbfSVMC(C = 0.1, max_iter = None, gamma = 'scale'):
    """Build a one-vs-rest bagged ensemble of RBF-kernel SVMs.

    Each of the 10 bagged estimators is fitted with max_samples = 1 / 10;
    max_iter is forwarded to SVC only when given.
    """
    svc_options = dict(kernel = 'rbf', C = C, gamma = gamma, class_weight = 'balanced')
    if max_iter is not None:
        svc_options['max_iter'] = max_iter
    bagged_svc = BaggingClassifier(SVC(**svc_options), n_estimators = 10, max_samples = 1 / 10)
    return OneVsRestClassifier(bagged_svc, n_jobs = -1)


# Hour to Time of day map #

In [7]:
# Maps every hour of the day (0-23) to a time-of-day category id (0-5).
# Each (start, stop) pair is a half-open hour range; its position is the category id.
category_time = {
    hour: category
    for category, (start, stop) in enumerate((
        (0, 5),    # Early morning
        (5, 11),   # Morning
        (11, 15),  # Noon
        (15, 18),  # Afternoon
        (18, 21),  # Evening
        (21, 24),  # Night
    ))
    for hour in range(start, stop)
}

## Pipeline class ##

Due to limitations of the sklearn pipeline with custom vectorizers as well as having a separate date set, a custom pipeline class was created. It takes vectorisers, whether or not to use feature selection, and a classifier as its arguments.
To fit the pipeline, use the fit function with the corpus, date set and label set.
To get the prediction of the pipeline, either use the predict or the predict_proba functions.

In [8]:
# Whether to have the pipelines print internal messages or not
VERBOSITY = False

# Pipeline class
# Mimics the sklearn pipeline, but allows for less constrained estimators to be used
class myPipeline:
    """Vectorise text, add date features, optionally select features, scale and classify.

    The feature matrix is the horizontal stack of the date features followed by
    the output of every vectoriser, in the order the vectorisers were given.
    """

    # Constructor: takes in a list of vectorisers, whether feature selection is used and how many features should be used,
    # and the classifier to be used
    # Parameters: list vectorisers, string feature_selection, integer k_selectKBest, classifier
    def __init__(self, *vectorisers, feature_selection = None, k_selectKBest = 2000, classifier):
        """Store the components of the pipeline.

        Parameters:
            *vectorisers: objects exposing ``fit(corpus)`` and ``transform(corpus)``.
            feature_selection: None or "SelectKBest".
            k_selectKBest: number of features kept when SelectKBest is used.
            classifier: estimator exposing ``fit``, ``predict`` and ``decision_function``.

        Raises:
            Exception: if ``feature_selection`` is neither None nor "SelectKBest".
        """
        # Create the internal vectoriser list
        self.vectorisers = list(vectorisers)
        # Create the internal classifier
        self.classifier = classifier
        # Flag to keep track of whether feature selection has been trained
        self.has_trained_feature_selection = False
        # Requested k, remembered so that every fit can re-derive the effective k
        self.k_selectKBest = k_selectKBest
        # Set the correct feature selection
        if feature_selection is None:
            self.feature_selection = None
            self.feature_selection_type = None
        elif feature_selection == "SelectKBest":
            self.feature_selection = SelectKBest(k = k_selectKBest)
            self.feature_selection_type = "SelectKBest"
        else:
            raise Exception("Invalid feature selection")
        self.sc = MaxAbsScaler()

    # Date preprocessing
    def extract_date_info(self, dates):
        """Turn date strings into rows of [time-of-day category, year, month, day].

        The fields are read from fixed character positions: year at 0-3,
        month at 5-6, day at 8-9 and hour at 11-12. The hour is mapped
        through the module-level ``category_time`` table.
        """
        ans = []
        for date in dates:
            hour = int(date[11] + date[12])
            ans.append(np.asarray([category_time[hour], int(date[0 : 4]), int(date[5 : 7]), int(date[8 : 10])]))
        return np.asarray(ans)

    # Internal helper: build the full (unselected) feature matrix
    def _stack_features(self, corpus, dates):
        """Return the sparse stack of date features and every vectoriser's output."""
        from scipy import sparse

        blocks = [self.extract_date_info(dates)]
        blocks.extend(vectoriser.transform(corpus) for vectoriser in self.vectorisers)
        # Every block is converted to sparse first, so dense outputs (date
        # features, w2vec) stack reliably and no block is ever dropped.
        return sparse.hstack([sparse.csr_matrix(block) for block in blocks], format = 'csr')

    # Internal function to preprocess the corpus
    def preprocess(self, corpus, dates):
        """Return the feature matrix, reduced by feature selection once it has been trained."""
        features = self._stack_features(corpus, dates)
        if self.has_trained_feature_selection:
            features = self.feature_selection.transform(features)
        return features

    def my_softmax(self, val_lst):
        """Return the softmax of a sequence of scores as an array."""
        scores = np.asarray(val_lst, dtype = float)
        if scores.size == 0:
            return scores
        # Subtracting the maximum leaves the result unchanged mathematically
        # but keeps the exponentials from overflowing on large scores.
        exps = np.exp(scores - np.max(scores))
        return exps / exps.sum()

    def my_predict_proba(self, test_X):
        """Return the row-wise softmax of the classifier's decision_function scores.

        These are normalised scores rather than calibrated probabilities.
        """
        decision_scores = self.classifier.decision_function(test_X)
        return np.asarray([self.my_softmax(instance) for instance in decision_scores])

    # fit function, requires a corpus, its dates and a list of labels
    def fit(self, train_X, train_date, train_y):
        """Fit the vectorisers, the feature selector, the scaler and the classifier."""
        if VERBOSITY:
            print("Fitting started")

        # Fit every vectorizer
        for vectoriser in self.vectorisers:
            vectoriser.fit(train_X)

        # Vectorise the corpus without feature selection, so that the selector
        # is always trained on the full feature set (also when refitting)
        preprocessed_X = self._stack_features(train_X, train_date)
        if VERBOSITY:
            print("Pre-processed training set shape (before feature selection):")
            print(preprocessed_X.shape)
        # Train the feature selector
        if self.feature_selection_type == "SelectKBest":
            k = self.k_selectKBest
            # Asking for more features than exist is treated as "keep all"
            if k != 'all' and k > preprocessed_X.shape[1]:
                k = 'all'
            self.feature_selection.set_params(k = k)
            self.feature_selection.fit(preprocessed_X, train_y)
            self.has_trained_feature_selection = True
            preprocessed_X = self.feature_selection.transform(preprocessed_X)
        preprocessed_X = self.sc.fit_transform(preprocessed_X)
        if VERBOSITY:
            print("Pre-processed training set shape:")
            print(preprocessed_X.shape)
        # Train the classifier
        self.classifier.fit(preprocessed_X, train_y)

    # given a test set, predict its labels
    def predict(self, test_X, test_date):
        """Return the classifier's predicted labels for the given corpus and dates."""
        return self.classifier.predict(self.sc.transform(self.preprocess(test_X, test_date)))

    # given a test set, predict the label probabilities
    def predict_proba(self, test_X, test_date):
        """Return softmax-normalised decision scores for the given corpus and dates."""
        return self.my_predict_proba(self.sc.transform(self.preprocess(test_X, test_date)))

## Simulated Annealing ##

In [9]:
# Defining a state:
# (C, n-gram size, character n-gram size, feature_engineering_combination, selectkbest_k, max_iter)

class State:
    """One hyperparameter configuration explored by simulated annealing.

    Every hyperparameter is stored as an index into its list of candidate
    values. The feature engineering choice is a bitmask over
    ``feature_engineering_possibilities`` (bit i set = method i is used).
    """

    # Constructor
    def __init__(self, model = None):
        """Create a state with randomly drawn hyperparameters.

        Parameters:
            model: classifier factory called as ``model(C = ..., max_iter = ...)``.
                When omitted, getLinearSVMC is used; getPolySVMC or getRbfSVMC
                can be passed for the other kernels.
        """
        # Resolved here rather than in the signature so the default factory is
        # looked up when a state is created, not when the class is defined.
        self.model = getLinearSVMC if model is None else model
        self.C_range = [0.02 + x * 0.02 for x in range((int)(1.0 / 0.02))]
        self.n_gram_range = [(1, 1), (1, 2), (1, 3), (1, 4)]
        self.char_n_gram_range = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6)]
        self.feature_engineering_possibilities = ["w2vec", "bag_of_char_ngrams", "bag_of_ngrams", "tfidf"]
        #                                           0           1                   2               3
        #    12 = 8 + 4 = 2 ** 3 + 2 ** 2
        #
        self.selectkbest_k_possibilities = [250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]
        self.max_iter_possibilities = [(x + 1) * 1000 for x in range(50)]
        self.score = None
        self.has_trained_model = False
        self.pipe = None
        (self.C_idx, self.n_gram_idx, self.char_n_gram_idx, self.feature_engineering_combination_idxs, self.selectkbest_k_idx, self.max_iter_idx) = self.get_random()

        # Factory function for every feature engineering method
        self.vec_dict = {
            "w2vec" : getW2vecVectorizer,
            "bag_of_char_ngrams" : getCharacterBagOfngramsVectorizer,
            "bag_of_ngrams" : getBagOfngramsVectorizer,
            "tfidf" : getTfidfVectorizer
        }

        # Keyword arguments of every factory, described as
        # (attribute holding the candidate list, attribute holding the index).
        # They are resolved against the current state by process_params.
        self.vec_params = {
            "w2vec" : {},
            "bag_of_char_ngrams" : {
                "char_ngram_range_param" : ("char_n_gram_range", "char_n_gram_idx")
            },
            "bag_of_ngrams" : {
                "ngram_range_param" : ("n_gram_range", "n_gram_idx")
            },
            "tfidf" : {
                "ngram_range_param" : ("n_gram_range", "n_gram_idx")
            }
        }

    # Returns a random combination of parameters
    def get_random(self):
        """Return a tuple of uniformly drawn hyperparameter indices."""
        C_idx = random.randrange(len(self.C_range))
        n_gram_idx = random.randrange(len(self.n_gram_range))
        char_n_gram_idx = random.randrange(len(self.char_n_gram_range))
        # Non-empty bitmask: with 4 methods this is a value in [1..15]
        feature_engineering_combination_idxs = 1 + random.randrange(2 ** (len(self.feature_engineering_possibilities)) - 1)
        selectkbest_k_idx = random.randrange(len(self.selectkbest_k_possibilities))
        max_iter_idx = random.randrange(len(self.max_iter_possibilities))
        return (C_idx, n_gram_idx, char_n_gram_idx, feature_engineering_combination_idxs, selectkbest_k_idx, max_iter_idx)

    # Returns a random feature engineering neighbour
    def get_feature_eng_comb_neighbour(self):
        """Return the current bitmask or one with a single bit flipped, never 0."""
        possibilities = [self.feature_engineering_combination_idxs]
        for idx in range(len(self.feature_engineering_possibilities)):
            feature_eng_comb_neigh = self.feature_engineering_combination_idxs ^ (2 ** idx)
            # 0 would mean "no feature engineering at all", which is not allowed
            if feature_eng_comb_neigh != 0:
                possibilities.append(feature_eng_comb_neigh)
        return random.choice(possibilities)

    # Returns a state neighbour
    # Let's say our current state is C = 0.04, n_gram = (1, 3), char_n_gram = (1, 2), feature_eng = [w2vec], selectkbest_k = 1250, max_iter = 10000
    # You could have a neighbour with C = 0.04, n_gram = (1, 3), char_n_gram = (1, 2), feature_eng = [w2vec], selectkbest_k = 1250, max_iter = 9000
    def get_neighbour(self):
        """Return a new untrained State that differs from this one.

        Every index moves by -1, 0 or +1 (wrapping around its list) and the
        feature bitmask may flip one bit. The neighbour uses the same
        classifier factory as this state.
        """
        dx_vec = [-1, 0, 1]
        current = (self.C_idx, self.n_gram_idx, self.char_n_gram_idx, self.feature_engineering_combination_idxs, self.selectkbest_k_idx, self.max_iter_idx)
        # Redraw until at least one hyperparameter actually changes
        while True:
            candidate = (
                random.choice(dx_vec) + self.C_idx,
                random.choice(dx_vec) + self.n_gram_idx,
                random.choice(dx_vec) + self.char_n_gram_idx,
                self.get_feature_eng_comb_neighbour(),
                random.choice(dx_vec) + self.selectkbest_k_idx,
                random.choice(dx_vec) + self.max_iter_idx,
            )
            if candidate != current:
                break
        (C_idx, n_gram_idx, char_n_gram_idx, feature_engineering_combination_idxs, selectkbest_k_idx, max_iter_idx) = candidate
        # Keep the classifier factory: a bare State() would fall back to the default one
        neighbour = State(model = self.model)
        neighbour.C_idx = C_idx % len(self.C_range)
        neighbour.n_gram_idx = n_gram_idx % len(self.n_gram_range)
        neighbour.char_n_gram_idx = char_n_gram_idx % len(self.char_n_gram_range)
        neighbour.feature_engineering_combination_idxs = feature_engineering_combination_idxs
        neighbour.selectkbest_k_idx = selectkbest_k_idx % len(self.selectkbest_k_possibilities)
        neighbour.max_iter_idx = max_iter_idx % len(self.max_iter_possibilities)
        return neighbour

    # Helper function to compile a list of vectorizers from the feature_engineering_possibilities number
    def get_vectorizers(self):
        """Return (factory, parameter spec) pairs for every method enabled in the bitmask."""
        vectorizers = []

        for idx in range(len(self.feature_engineering_possibilities)):
            if ((2 ** idx) & self.feature_engineering_combination_idxs) != 0:
                vec_name = self.feature_engineering_possibilities[idx]
                vectorizers.append((self.vec_dict[vec_name], self.vec_params[vec_name]))

        return vectorizers

    # Turns a binary representation into corresponding feature engineering methods
    def get_str_feature_eng_comb_idxs(self):
        """Return the enabled feature engineering method names, each followed by "; "."""
        ans = ""
        for index in range(len(self.feature_engineering_possibilities)):
            if (2 ** index) & self.feature_engineering_combination_idxs:
                ans += self.feature_engineering_possibilities[index] + "; "
        return ans

    # Returns a string detailing the state's hyperparameters
    def str_state(self):
        """Return a multi-line description of the state's hyperparameters."""
        ans = ""
        ans += "C parameter: " + str(self.C_range[self.C_idx]) + "\n"
        ans += "n-gram parameter: " + str(self.n_gram_range[self.n_gram_idx]) + "\n"
        ans += "char n-gram parameter: " + str(self.char_n_gram_range[self.char_n_gram_idx]) + "\n"
        ans += "feature engineering used: " + self.get_str_feature_eng_comb_idxs() + "date pre-processing\n"
        ans += "selectkbest k parameter: " + str(self.selectkbest_k_possibilities[self.selectkbest_k_idx]) + "\n"
        ans += "max_iter parameter: " + str(self.max_iter_possibilities[self.max_iter_idx]) + "\n"
        return ans

    # Helper function to print the state's hyperparameters
    def print_state(self):
        """Print the state's hyperparameters."""
        print(self.str_state())
        print("\n")

    # Helper function to print the state's hyperparameters to a file
    def print_state_file(self, file):
        """Write the state's hyperparameters to an open file."""
        file.write(self.str_state() + "\n")

    # Helper function that returns a dictionary of parameter names with their values
    def process_params(self, params):
        """Resolve a parameter spec into concrete keyword arguments.

        Each value is either an (options attribute, index attribute) pair,
        resolved as ``getattr(self, options)[getattr(self, index)]``, or, for
        backward compatibility, an expression string evaluated with ``self``
        in scope.
        """
        processed_params = {}
        for param_name, spec in params.items():
            if isinstance(spec, str):
                # Legacy form. Only ever pass trusted, hard-coded expressions here.
                processed_params[param_name] = eval(spec)
            else:
                options_attr, index_attr = spec
                processed_params[param_name] = getattr(self, options_attr)[getattr(self, index_attr)]
        return processed_params

    # Internal helper: build and train the pipeline once
    def _ensure_trained(self):
        """Build and fit the pipeline on the training set unless already done."""
        if self.has_trained_model:
            return
        applied_vec = [func(**self.process_params(params)) for (func, params) in self.get_vectorizers()]
        self.pipe = myPipeline(*applied_vec, classifier = self.model(C = self.C_range[self.C_idx], max_iter = self.max_iter_possibilities[self.max_iter_idx]), feature_selection = "SelectKBest", k_selectKBest = self.selectkbest_k_possibilities[self.selectkbest_k_idx])
        self.pipe.fit(train_X, train_date, train_y)
        # Only flag the model as trained once fitting has actually succeeded
        self.has_trained_model = True

    # Internal helper: top-k accuracy from a score matrix
    @staticmethod
    def _top_k_accuracy(probabilities, gold_labels, k):
        """Return the fraction of rows whose gold label is among the k highest-scored columns.

        NOTE(review): column indices are compared with the gold labels directly,
        which assumes the labels are the integers 0..n_classes-1 in the
        classifier's class order — confirm against the dataset.
        """
        probabilities = np.asarray(probabilities)
        # argpartition puts the k largest entries last (in unspecified order),
        # which is all a membership test needs
        top_k = np.argpartition(probabilities, kth = -k, axis = 1)[:, -k:]
        gold = np.asarray(gold_labels).reshape(-1, 1)
        return float(np.mean(np.any(top_k == gold, axis = 1)))

    # Evaluates the state's performance, taking the energy to be negative accuracy score
    def get_energy(self):
        """Return the negative top-5 validation accuracy, computed once and cached in ``score``."""
        if self.score is None:
            self.score = -self.model_top_k_validation(k = 5)
        return self.score

    # Evaluates a topk accuracy measure on the validation set
    def model_top_k_validation(self, k = 5):
        """Return the top-k accuracy on the validation set, training the model first if needed."""
        self._ensure_trained()
        val_pred_y_probs = self.pipe.predict_proba(val_X, val_date)
        return self._top_k_accuracy(val_pred_y_probs, val_gold_y, k)

    # Evaluates a topk accuracy measure on the test set
    def eval_model_top_k(self, k = 5):
        """Return the top-k accuracy on the test set, training the model first if needed."""
        self._ensure_trained()
        test_pred_y_probs = self.pipe.predict_proba(test_X, test_date)
        return self._top_k_accuracy(test_pred_y_probs, test_gold_y, k)

# Simulated annealing's acceptance function
def acceptance_function(energy_state, energy_neighbour, temperature):
    """Return the probability of moving from the current state to the neighbour.

    A strictly lower neighbour energy is always accepted (1). Otherwise the
    move is rejected (0) once the temperature drops below 0.0001, and accepted
    with an exponentially decaying probability above that.
    """
    neighbour_is_better = energy_neighbour < energy_state
    if neighbour_is_better:
        return 1
    system_is_frozen = temperature < 0.0001
    if system_is_frozen:
        return 0
    # Fixed scaling applied to the energy difference before dividing by the temperature
    scale = 100 / 30 * 1.40670535838 * 1.15
    return math.e ** (scale * -(energy_neighbour - energy_state) / temperature)

# Prints the top5 and top1 accuracy of a state, on the validation set
def print_validation_stats(state):
    """Print the state's top-5 and top-1 validation accuracy on one line."""
    top5_accuracy = state.model_top_k_validation(k = 5)
    top1_accuracy = state.model_top_k_validation(k = 1)
    line = "[Validation] Accuracy top 5: " + str(top5_accuracy) + " ; Accuracy top 1: " + str(top1_accuracy)
    print(line)

# Prints the top5 and top1 accuracy of a state, on the validation set, to file
def print_validation_stats_file(state, file):
    """Write the state's top-5 and top-1 validation accuracy to an open file as one line."""
    top5_accuracy = state.model_top_k_validation(k = 5)
    top1_accuracy = state.model_top_k_validation(k = 1)
    line = "[Validation] Accuracy top 5: " + str(top5_accuracy) + " ; Accuracy top 1: " + str(top1_accuracy) + '\n'
    file.write(line)

# Prints the top5 and top1 accuracy of a state, on the test set
def print_evaluation_stats(state):
    """Print the state's top-5 and top-1 test accuracy on one line."""
    top5_accuracy = state.eval_model_top_k(k = 5)
    top1_accuracy = state.eval_model_top_k(k = 1)
    line = "[Evaluation] Accuracy top 5: " + str(top5_accuracy) + " ; Accuracy top 1: " + str(top1_accuracy)
    print(line)

# Prints the top5 and top1 accuracy of a state, on the test set, to file
def print_evaluation_stats_file(state, file):
    """Write the state's top-5 and top-1 test accuracy to an open file as one line."""
    top5_accuracy = state.eval_model_top_k(k = 5)
    top1_accuracy = state.eval_model_top_k(k = 1)
    line = "[Evaluation] Accuracy top 5: " + str(top5_accuracy) + " ; Accuracy top 1: " + str(top1_accuracy) + '\n'
    file.write(line)

# Helper function to copy state from src to dest
def cpy_state(dest, src):
    """Copy the model factory, hyperparameter indices, cached score and trained pipeline.

    The pipeline object is shared between the two states, not duplicated.
    """
    # The classifier factory is copied with the pipeline it produced, so
    # dest.model always matches dest.pipe
    dest.model = src.model
    dest.score = src.score
    dest.has_trained_model = src.has_trained_model
    dest.pipe = src.pipe
    for field in ("C_idx", "n_gram_idx", "char_n_gram_idx", "feature_engineering_combination_idxs", "selectkbest_k_idx", "max_iter_idx"):
        setattr(dest, field, getattr(src, field))

# The simulated annealing algorithm
def simulated_annealing(no_steps = 100, verbosity = True, file = None):
    """Search the hyperparameter space with simulated annealing.

    Parameters:
        no_steps: number of neighbour proposals to evaluate.
        verbosity: when True, progress is printed to stdout.
        file: optional open, writable file that receives the same log; it is
            not closed by this function.

    Returns:
        The State with the lowest energy (highest top-5 validation accuracy)
        seen during the run.
    """
    # Start with random state
    state = State()
    best_overall = State()

    # Train and score the start state before copying it, so that best_overall
    # shares the trained pipeline instead of training the same model again
    state.get_energy()

    # Initialize the best overall state to the random state
    cpy_state(best_overall, state)

    # Print starting state
    if verbosity:
        print("Starting state:")
        state.print_state()
        print_validation_stats(state)
    if file is not None:
        file.write("Starting state:\n")
        state.print_state_file(file)
        print_validation_stats_file(state, file)

    # Run the algorithm for no_steps steps
    for step in range(no_steps):
        step_header = "Step number # " + str(step + 1) + " out of " + str(no_steps)
        if verbosity:
            print(step_header)
        # Temperature decreases linearly and reaches 0 on the last step
        temperature = 1 - (step + 1) / no_steps
        # Get random neighbour
        state_neigh = state.get_neighbour()
        if verbosity:
            state_neigh.print_state()
            print_validation_stats(state_neigh)
        if file is not None:
            file.write(step_header + '\n')
            state_neigh.print_state_file(file)
            print_validation_stats_file(state_neigh, file)

        # Keep the best_overall updated to have the highest validation set accuracy
        if best_overall.get_energy() > state_neigh.get_energy():
            cpy_state(best_overall, state_neigh)

        # Use the acceptance function to determine if a move to the neighbour is made
        # If the energy of the neighbour is lower, then always move to the neighbour
        if acceptance_function(state.get_energy(), state_neigh.get_energy(), temperature) >= random.uniform(0, 1):
            cpy_state(state, state_neigh)
            if verbosity:
                print("Admitted state")
            if file is not None:
                file.write("Admitted state\n")

    # Report the best state together with its test set accuracy
    if verbosity:
        print("Best model:")
        best_overall.print_state()
        print_evaluation_stats(best_overall)
    if file is not None:
        file.write("Best model:\n")
        best_overall.print_state_file(file)
        print_evaluation_stats_file(best_overall, file)
    return best_overall

## Testing the different kernels ##

In [10]:
# This section was commented because of performance issues with Rbf and Poly kernels
# The experiment is kept inside a string literal so that it never executes.
# If re-enabled, it builds one State per kernel (linear, poly, rbf) with the same
# fixed hyperparameter indices and prints each one's top-1 and top-5 test accuracy.
'''
C_idx_comparison = 30
n_gram_idx_comparison = 2
char_n_gram_idx_comparison = 3
feature_engineering_combination_idxs_comparison = 10
selectkbest_k_idx_comparison = 3
max_iter_idx_comparison = 10
for (model, model_name) in [(getLinearSVMC, 'linear kernel'), (getPolySVMC, 'poly kernel'), (getRbfSVMC, 'rbf kernel')]:
    print("Model stats for " + model_name + ":")
    state = State(model = model)
    state.C_idx = C_idx_comparison
    state.n_gram_idx = n_gram_idx_comparison
    state.char_n_gram_idx = char_n_gram_idx_comparison
    state.feature_engineering_combination_idxs = feature_engineering_combination_idxs_comparison
    state.selectkbest_k_idx = selectkbest_k_idx_comparison
    state.max_iter_idx = max_iter_idx_comparison
    state.print_state()
    print("Accuracy top-1: " + str(state.eval_model_top_k(k = 1)))
    print("Accuracy top-5: " + str(state.eval_model_top_k(k = 5)))
'''
print("")




In [None]:
# Run simulated annealing with 200 steps
# The log file is opened in a with-block so that it is flushed and closed
# when the run finishes, including when the run raises an exception
with open("results_200_1.txt", "w") as results_file:
    best_state = simulated_annealing(no_steps = 200, file = results_file)

In [None]:
# See the structure of the best state
# (prints the hyperparameters of the state returned by simulated_annealing)
best_state.print_state()

In [13]:
# Function to generate the evaluation of a model and put it into a csv file
def generate_csv(state, k = 5, output_path = 'results_linearsvm.csv'):
    """Write the state's test set predictions to a csv file.

    Parameters:
        state: State to evaluate; its pipeline is trained first if needed.
        k: number of top-scored candidates stored per test instance.
        output_path: destination of the csv file.

    The file has the columns Index, Text, Date, top1, top5 and label. The
    top5 column holds the k best candidates in ascending score order, so its
    last entry equals top1.

    NOTE(review): the candidates are column indices of the score matrix, which
    assumes the labels are the integers 0..n_classes-1 in the classifier's
    class order — confirm against the dataset.
    """
    # Running the pipeline
    if state.has_trained_model == False:
        # Gathering vectoriser arguments
        applied_vec = [func(**state.process_params(params)) for (func, params) in state.get_vectorizers()]
        state.pipe = myPipeline(*applied_vec, classifier = state.model(C = state.C_range[state.C_idx], max_iter = state.max_iter_possibilities[state.max_iter_idx]), feature_selection = "SelectKBest", k_selectKBest = state.selectkbest_k_possibilities[state.selectkbest_k_idx])
        state.pipe.fit(train_X, train_date, train_y)
        state.has_trained_model = True

    # Evaluating the model
    test_pred_y_probs = state.pipe.predict_proba(test_X, test_date)
    # A full sort is needed: argpartition leaves the top-k slice in unspecified
    # order, so its last element is not guaranteed to be the best candidate
    candidate_preds = np.argsort(test_pred_y_probs, axis = 1)[:, -k:]

    # Generate the csv file
    # list(...) reads the columns by position, independent of the dataframe index
    df = DataFrame({
        'Index' : list(range(len(candidate_preds))),
        "Text" : list(test_X),
        "Date" : list(test_date),
        "top1" : list(candidate_preds[:, -1]),
        "top5" : list(candidate_preds),
        "label" : list(test_gold_y),
    })

    df.to_csv(output_path, index = False)
    


In [None]:
# Generate the best state's file
# (writes the test set predictions of the best state found above to a csv file)

generate_csv(best_state)