# **SVM Model**

# **Data Format:**
# **Columns: TEXT, LABEL**

|  TEXT  |    LABEL    | 
|--------|-------------|
| TEXT_1 |  LABEL_1  |
| TEXT_2 |  LABEL_1  |
| TEXT_3 |  LABEL_2  |

# **GPU Configuration**

In [None]:
import tensorflow as tf
import os

# Ask CUDA to enumerate devices by PCI bus ID and expose only device "0"
# to this process.
# NOTE(review): tensorflow is imported before these variables are set;
# presumably this still takes effect because no GPU has been initialised
# yet at this point -- confirm on the target runtime.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";
# Sanity checks: list the GPUs TensorFlow can see and report whether this
# TensorFlow build has CUDA support.
tf.config.list_physical_devices('GPU')
tf.test.is_built_with_cuda()

# **Load Data**

In [None]:
from google.colab import files
# Prompt for a file upload through the Colab helper.
# NOTE(review): presumably this is how the CSV read in the next cell gets
# into the working directory -- confirm.
files.upload()

In [None]:
import pandas as pd

# CSV file to load; the path is relative to the current working directory.
csv_path = "pizza-ordering-tm-1-2019-concat.csv"
data = pd.read_csv(csv_path)
# Preview the first five rows.
data.head(5)

# **Label Pre-Processing**

In [None]:
# Drop the 'segment' and 'speaker' columns, then rename the remaining
# columns; assigning two names requires exactly two columns to be left.
data = data.drop(columns=['segment', 'speaker'])
data.columns = ['text', 'label']
# Keep only the third dot-separated component of each label.
# NOTE(review): a label with fewer than two dots raises IndexError and a
# non-string label (e.g. NaN) raises AttributeError -- confirm the dataset
# never contains such values.
data.label = data.label.apply(lambda x: x.split(".")[2])
data.head(5)

# **Reduce Data**

In [None]:
import random

# Work on a copy so `data` keeps the full, unreduced dataset.
df = data.copy()

def reduceData(df, n=10000000):
    """Cap the number of samples kept for each label at ``n``.

    Rows are grouped by the ``label`` column. A group with more than ``n``
    rows is randomly down-sampled (without replacement, via
    ``random.sample``) to exactly ``n`` rows; smaller groups are kept whole
    and in their original row order.

    Rows are selected through the groupby iteration rather than by index
    label, so the result is correct even when ``df`` has a non-unique or
    non-default index. The input frame is not modified.

    Args:
        df: DataFrame with at least a ``text`` and a ``label`` column.
        n: Maximum number of rows to keep per label.

    Returns:
        A new DataFrame with columns ``text`` and ``label`` and a fresh
        default index, with rows ordered group by group in the order the
        groupby yields the labels.

    Raises:
        ValueError: If ``n`` is negative and at least one group exists
            (raised by ``random.sample``).
    """
    selected_texts = []
    selected_labels = []
    for label, group in df.groupby(by="label"):
        texts = group["text"].tolist()
        if len(texts) > n:
            texts = random.sample(texts, n)
        selected_texts.extend(texts)
        # One label entry per kept text keeps both columns aligned.
        selected_labels.extend([label] * len(texts))
    return pd.DataFrame(data={"text": selected_texts, "label": selected_labels})

# Cap every label at 500 samples, then show the resulting per-label counts.
df = reduceData(df, 500)
df.label.value_counts()

In [None]:
import numpy as np

# Raw label values of the reduced dataset.
# NOTE: `labels` is reassigned with encoded values in the
# "Labels Preprocessing" cell further down.
labels = df.label.values
# Number of distinct labels.
output_length = len(df.label.unique())
output_length

# **Text Preprocessing**

In [6]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import functools
from tensorflow.keras.preprocessing.sequence import pad_sequences

def vectorizer(texts):
    """Encode each text as a list of integer token ids.

    Every text is split on single spaces and each distinct token is given
    the next free id, starting at 1, in order of first appearance across
    all texts. Because the split is on a literal ``" "``, consecutive
    spaces (or an empty text) produce the empty string as a token.

    Args:
        texts: Iterable of strings.

    Returns:
        A tuple ``(encoded, vocabulary)`` where ``encoded`` holds one list
        of ids per input text and ``vocabulary`` maps token -> id.
    """
    vocabulary = {}
    encoded = []
    for sentence in texts:
        # setdefault returns the existing id or registers the next one;
        # len(vocabulary) + 1 is evaluated before any insertion happens.
        token_ids = [
            vocabulary.setdefault(token, len(vocabulary) + 1)
            for token in sentence.split(" ")
        ]
        encoded.append(token_ids)
    return encoded, vocabulary

def textPreprocessing(texts):
    """Turn raw texts into a padded matrix of integer token ids.

    The texts are encoded with ``vectorizer`` and then padded at the front
    (``padding="pre"``) with ``pad_sequences`` to the length of the longest
    encoded text.

    Args:
        texts: Iterable of strings; must contain at least one text.

    Returns:
        A tuple ``(padded, vocab_size, max_len)``: the padded id sequences,
        the number of distinct tokens found, and the length every sequence
        was padded to.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    sequences, vocabulary = vectorizer(texts)
    if not sequences:
        raise ValueError("textPreprocessing requires at least one text")
    vocab_size = len(vocabulary)
    # Length of the longest encoded text (named max_len so the builtin
    # `max` is not shadowed).
    max_len = max(len(sequence) for sequence in sequences)
    padded = pad_sequences(sequences, maxlen=max_len, padding="pre")
    return padded, vocab_size, max_len

# Encode and pad every text of the reduced dataset; `max_` is the padded
# sequence length.
texts, vocab_size, max_ = textPreprocessing(list(df.text.values))

# **Labels Preprocessing**

In [7]:
from sklearn.preprocessing import LabelEncoder

def labelsPreprocessing(labels):
    """Encode labels with a freshly fitted ``LabelEncoder``.

    Args:
        labels: Sequence of label values.

    Returns:
        A tuple ``(encoded_labels, encoder)``; the fitted encoder is
        returned so callers can map encoded values back with
        ``inverse_transform``.
    """
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels), label_encoder

# Encode the labels of the reduced dataset and keep the fitted encoder for
# decoding predictions later.
labels, encoder = labelsPreprocessing(list(df.label.values))

# **Training Model**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import seaborn as sn
import matplotlib.pyplot as plt

def plotConfusionMatrix(y_true, y_pred):
    """Show a heatmap of real vs. predicted label counts.

    Args:
        y_true: Sequence of real labels.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
    """
    # Create the 15x15 figure up front; the heatmap call below is given no
    # explicit axes, so it relies on this figure being the current one.
    plt.subplots(figsize=(15, 15))
    pairs = pd.DataFrame(
        {'real_value': y_true, 'predicted': y_pred},
        columns=['real_value', 'predicted'],
    )
    # Cross-tabulate real labels (rows) against predicted labels (columns).
    counts = pd.crosstab(
        pairs['real_value'],
        pairs['predicted'],
        rownames=['Real'],
        colnames=['Predicted'],
    )
    sn.heatmap(counts, annot=True, cbar=False, fmt='g')
    plt.show()

def runModel(texts, labels, encoder, *, test_size=0.2, random_state=None):
    """Train an RBF-kernel SVM and report its performance on a held-out split.

    The data is split with ``train_test_split``, an ``SVC`` is fitted on the
    training part, and the test part is used to print a classification
    report and to plot a confusion matrix. Labels are decoded with
    ``encoder`` before reporting.

    Args:
        texts: Feature matrix, one row per sample.
        labels: Encoded labels aligned with ``texts``.
        encoder: Fitted encoder providing ``inverse_transform`` to map
            encoded labels back to their original values.
        test_size: Forwarded to ``train_test_split``; the default ``0.2``
            matches the previously hard-coded value.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the previous unseeded
            behaviour.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state
    )
    model = SVC(C=1, gamma="auto", kernel='rbf')
    model.fit(X_train, y_train)

    # Decode both sides once and reuse them for the report and the plot.
    y_true = encoder.inverse_transform(y_test)
    y_pred = encoder.inverse_transform(model.predict(X_test))
    print(classification_report(y_true, y_pred))
    plotConfusionMatrix(y_true, y_pred)

In [None]:
# Train and evaluate the SVM on the padded token ids and encoded labels.
runModel(texts, labels, encoder)