# Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import product
import csv
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Load dataset

In [None]:
# Read the three pre-processed splits (train / validation / test) in one go.
# The frames are kept exactly as parsed: no fillna(0) is applied here.
train, val, test = map(
    pd.read_csv,
    (
        "../dataset/prep_train.csv",
        "../dataset/prep_valid.csv",
        "../dataset/prep_test.csv",
    ),
)

In [None]:
# Target vectors: the LABEL column of each split, in train / val / test order.
Y_train, Y_val, Y_test = (split["LABEL"] for split in (train, val, test))

# SVM

In [None]:
# Search space for the SVM experiments.
# `ranges` holds (min_n, max_n) n-gram windows handed to the TF-IDF vectorizer:
# the three single-size windows first, followed by the mixed-size ones.
ranges = [(n, n) for n in (1, 2, 3)] + [(1, 2), (2, 3), (1, 3)]
# SVC kernels to compare.
kernel = [
    "linear",
    "poly",
    "rbf",
    "sigmoid",
]

In [None]:
# Grid-search the SVM over every (n-gram range, kernel) combination.
# One row per combination is written to svm.csv as: ngram_range, kernel, accuracy.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate "\n" emit a blank line after every row.
with open('svm.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for r in ranges:
        # The TF-IDF vocabulary is learned from the training statements only and
        # then applied unchanged to the other splits.
        vectorizer = TfidfVectorizer(use_idf=True, ngram_range=r, min_df=3)
        vectorizer.fit(train.STATEMENT)
        # Densified because StandardScaler (with its default mean-centering)
        # does not accept sparse input.
        X_train = np.asarray(vectorizer.transform(train.STATEMENT).todense())
        X_val = np.asarray(vectorizer.transform(val.STATEMENT).todense())
        # Not used for model selection below; kept so the name stays defined.
        # NOTE(review): drop this line if no later cell reads X_test.
        X_test = np.asarray(vectorizer.transform(test.STATEMENT).todense())
        for k in kernel:
            clf = make_pipeline(StandardScaler(), SVC(kernel=k))
            clf.fit(X_train, Y_train)
            # Hyper-parameters are compared on the validation split:
            # score(features, labels) -> mean accuracy. The test split is left
            # untouched so it can still give an unbiased final estimate.
            acc = clf.score(X_val, Y_val)
            print(f"{r}, {k}, {acc}")
            data = [r, k, acc]
            writer.writerow(data)

# Random Forest

In [None]:
# Search space for the Random Forest experiments.
# (min_n, max_n) n-gram windows handed to the TF-IDF vectorizer.
ranges = [(1,1),(2,2),(3,3),(1,2),(2,3),(1,3)]
# Maximum tree depth; None means no depth cap.
max_depth = [None, 5, 10]
# Number of trees in the forest.
n_estimators = [5, 10, 50]
# Number of features considered per split; None means all features.
# "auto" is deliberately not listed: for RandomForestClassifier it was only an
# alias of "sqrt" (so it duplicated that grid point), it was deprecated in
# scikit-learn 1.1 and it is rejected with an error from 1.3 onwards.
max_features = ["sqrt", "log2", None]

In [None]:
# Grid-search the Random Forest over every combination of n-gram range,
# max_depth, n_estimators and max_features.
# One row per combination is written to rf.csv as:
# ngram_range, max_depth, n_estimators, max_features, validation accuracy.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate "\n" emit a blank line after every row.
with open('rf.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for r in ranges:
        # The TF-IDF vocabulary is learned from the training statements only and
        # then applied unchanged to the other splits.
        vectorizer = TfidfVectorizer(use_idf=True, ngram_range=r, min_df=3)
        vectorizer.fit(train.STATEMENT)
        X_train = np.asarray(vectorizer.transform(train.STATEMENT).todense())
        X_val = np.asarray(vectorizer.transform(val.STATEMENT).todense())
        # Not used for model selection below; kept so the name stays defined.
        # NOTE(review): drop this line if no later cell reads X_test.
        X_test = np.asarray(vectorizer.transform(test.STATEMENT).todense())
        for d, n, f in product(max_depth, n_estimators, max_features):
            clf = RandomForestClassifier(max_depth=d, n_estimators=n, max_features=f)
            clf.fit(X_train, Y_train)
            # Model selection uses the validation split (mean accuracy).
            acc = clf.score(X_val, Y_val)
            print(f"{r}, {d}, {n}, {f}, {acc}")
            data = [r, d, n, f, acc]
            writer.writerow(data)