# Shallow ML Text Classification

## Importing Libraries

In [1]:
from typing import List

In [2]:
import re
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

In [3]:
from nltk import pos_tag
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

## Load Training Data

In [4]:
df_train = pd.read_csv('../data/train.csv')[['text_a','label']] # drop row numbers column
df_test = pd.read_csv('../data/test.csv')
df_validate = pd.read_csv('../data/dev.csv')

## Stopwords definition

In [5]:
# add stopwords
add_stopwords = set(StopWordRemoverFactory().get_stop_words())
print('sastrawi stopwords:', len(add_stopwords))

stopwords_set = set(stopwords.words())
print('nltk stopwords:', len(stopwords_set))
stopwords_set = stopwords_set.union(add_stopwords)
print('nltk added stopwords:', len(stopwords_set))

sastrawi stopwords: 123
nltk stopwords: 9380
nltk added stopwords: 9395


## Function to preprocess/normalize text

In [6]:
stemmer = StemmerFactory().create_stemmer()

In [7]:
def normalize_text(string: str, stem: bool=True, sw_elim: bool=True) -> List[str]:
  # filtering, only characters allowed
  filtered = re.sub('[^a-zA-Z]', ' ', string)
  # lower-cased and stemmed using Sastrawi
  stemmed = stemmer.stem(filtered) if stem else filtered.lower()
  # tokenize stemmed string
  tokenized = word_tokenize(stemmed)
  # eliminate stopwords
  res = [word for word in tokenized if word not in stopwords_set] if sw_elim else tokenized
  return res

## DataFrame before Preprocessing

In [8]:
df_train.head()

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,no
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,e100ss gini buka informasi sejelas nya identit...,yes
3,neng solo wes ono terduga corona cobo neng ati...,no
4,midiahn nii akun gak takut takut nya isu coron...,no


In [9]:
df_test.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes


In [10]:
df_validate.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes


## Proprocess DataFrame

In [11]:
df_train.text_a = df_train.text_a.apply(normalize_text, args=(False, True))
df_test.text_a = df_test.text_a.apply(normalize_text, args=(False, True))
df_validate.text_a = df_validate.text_a.apply(normalize_text, args=(False, True))

## DataFrame after Preprocessing

In [12]:
df_train.head()

Unnamed: 0,text_a,label
0,"[betewe, buka, twitter, cuman, ngetweet, liat,...",no
1,"[piyuuu, mugo, corona, tuh, mulut, tersumpal, ...",no
2,"[ss, gini, buka, informasi, sejelas, nya, iden...",yes
3,"[neng, solo, wes, terduga, corona, cobo, neng,...",no
4,"[midiahn, nii, akun, gak, takut, takut, nya, i...",no


In [13]:
df_test.head()

Unnamed: 0,text_a,label
0,"[jek, dajal, depok, bang]",no
1,"[detikcom, untung, depok, masuk, wilayah, nya,...",no
2,"[df, dom, jakarta, depok, yg, gunain, vc, caba...",no
3,"[rl, depok, jkt]",no
4,"[doakan, indonesia, selamat, virus, corona, pk...",yes


In [14]:
df_validate.head()

Unnamed: 0,text_a,label
0,"[jek, dajal, depok, bang]",no
1,"[detikcom, untung, depok, masuk, wilayah, nya,...",no
2,"[df, dom, jakarta, depok, yg, gunain, vc, caba...",no
3,"[rl, depok, jkt]",no
4,"[doakan, indonesia, selamat, virus, corona, pk...",yes


## Function to score model

In [86]:
def score_model(true: np.array, pred: np.array, is_svm: bool=True):
  model = 'SVM' if is_svm else 'NaiveBayes'
  print(f'{model} Model Accuracy Score: {accuracy_score(true, pred):.6f}')
  precision, recall, fscore, _ = precision_recall_fscore_support(true, pred, average='macro', zero_division=1)
  print(f'{model} Model Precision Score: {precision:.6f}')
  print(f'{model} Model Recall Score: {recall:.6f}')
  print(f'{model} Model FScore: {fscore:.6f}')
  print(f'{model} Model Confusion Matrix')
  print(confusion_matrix(true, pred))

## [Vector Space Model] Vectorizer using CountVectorizer

In [15]:
vectorizer = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
X_train, y_train = vectorizer.fit_transform(df_train.text_a), df_train.label
X_test, y_test = vectorizer.transform(df_test.text_a), df_test.label
X_validate, y_validate = vectorizer.transform(df_validate.text_a), df_validate.label
print(len(vectorizer.get_feature_names_out()))

43643


## [Vector Space Model] Vectorizer using TfIdfVectorizer

In [98]:
vectorizer_tf = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
X_train_tf, y_train_tf = vectorizer_tf.fit_transform(df_train.text_a), df_train.label
X_test_tf, y_test_tf = vectorizer_tf.transform(df_test.text_a), df_test.label
X_validate_tf, y_validate_tf = vectorizer_tf.transform(df_validate.text_a), df_validate.label
print(len(vectorizer_tf.get_feature_names_out()))

43643


## Linear SVM Model with CountVectorizer Data

### Training Linear SVM Model with CountVectorizer Data

In [16]:
svm_model = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model.fit(X_train, y_train)

### Scoring Linear SVM Model with CountVectorizer Data

In [87]:
y_pred_train = svm_model.predict(X_train)
score_model(y_train, y_pred_train)

SVM Model Accuracy Score: 0.986714
SVM Model Precision Score: 0.983475
SVM Model Recall Score: 0.983716
SVM Model FScore: 0.983595
SVM Model Confusion Matrix
[[15366   146]
 [  141  5948]]


In [19]:
y_pred_test = svm_model.predict(X_test)
score_model(y_test, y_pred_test)

SVM Model Accuracy Score: 0.833929
SVM Model Precision Score: 0.779125
SVM Model Recall Score: 0.801808
SVM Model FScore: 0.788916
SVM Model Confusion Matrix
[[1814  279]
 [ 186  521]]


In [20]:
y_pred_validate = svm_model.predict(X_validate)
score_model(y_validate, y_pred_validate)

SVM Model Accuracy Score: 0.833929
SVM Model Precision Score: 0.779125
SVM Model Recall Score: 0.801808
SVM Model FScore: 0.788916
SVM Model Confusion Matrix
[[1814  279]
 [ 186  521]]


## Linear SVM Model with TFIdfVectorizer Data

### Training Linear SVM Model with TFIdfVectorizer Data

In [99]:
svm_model_tf = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model_tf.fit(X_train_tf, y_train_tf)

### Scoring Linear SVM Model with TFIdfVectorizer Data

In [108]:
y_pred_train_tf = svm_model_tf.predict(X_train_tf)
score_model(y_train_tf, y_pred_train_tf)

SVM Model Accuracy Score: 0.952548
SVM Model Precision Score: 0.947118
SVM Model Recall Score: 0.934588
SVM Model FScore: 0.940575
SVM Model Confusion Matrix
[[15136   376]
 [  649  5440]]


In [109]:
y_pred_test_tf = svm_model_tf.predict(X_test_tf)
score_model(y_test_tf, y_pred_test_tf)

SVM Model Accuracy Score: 0.853214
SVM Model Precision Score: 0.804429
SVM Model Recall Score: 0.811429
SVM Model FScore: 0.807805
SVM Model Confusion Matrix
[[1875  218]
 [ 193  514]]


In [110]:
y_pred_validate_tf = svm_model_tf.predict(X_validate_tf)
score_model(y_validate_tf, y_pred_validate_tf)

SVM Model Accuracy Score: 0.853214
SVM Model Precision Score: 0.804429
SVM Model Recall Score: 0.811429
SVM Model FScore: 0.807805
SVM Model Confusion Matrix
[[1875  218]
 [ 193  514]]


## Naive Bayes Model with CountVectorizer Data

### Training Naive Bayes Model with CountVectorizer Data

In [21]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

### Scoring Naive Bayes Model with CountVectorizer Data

In [22]:
y_nb_pred_train = nb_model.predict(X_train)
score_model(y_train, y_nb_pred_train, False)

NaiveBayes Model Accuracy Score: 0.906162
NaiveBayes Model Precision Score: 0.879836
NaiveBayes Model Recall Score: 0.893411
NaiveBayes Model FScore: 0.886184
NaiveBayes Model Confusion Matrix
[[14312  1200]
 [  827  5262]]


In [23]:
y_nb_pred_test = nb_model.predict(X_test)
score_model(y_test, y_nb_pred_test, False)

NaiveBayes Model Accuracy Score: 0.866071
NaiveBayes Model Precision Score: 0.830640
NaiveBayes Model Recall Score: 0.802702
NaiveBayes Model FScore: 0.815016
NaiveBayes Model Confusion Matrix
[[1948  145]
 [ 230  477]]


In [24]:
y_nb_pred_validate = nb_model.predict(X_validate)
score_model(y_validate, y_nb_pred_validate, False)

NaiveBayes Model Accuracy Score: 0.866071
NaiveBayes Model Precision Score: 0.830640
NaiveBayes Model Recall Score: 0.802702
NaiveBayes Model FScore: 0.815016
NaiveBayes Model Confusion Matrix
[[1948  145]
 [ 230  477]]


## Naive Bayes Model with TfIdfVectorizer Data

### Training Naive Bayes Model with TfIdfVectorizer Data

In [104]:
nb_model_tf = MultinomialNB()
nb_model_tf.fit(X_train_tf, y_train_tf)

### Scoring Naive Bayes Model with TfIdfVectorizer Data

In [111]:
y_nb_pred_train_tf = nb_model_tf.predict(X_train_tf)
score_model(y_train_tf, y_nb_pred_train_tf, False)

NaiveBayes Model Accuracy Score: 0.896394
NaiveBayes Model Precision Score: 0.908763
NaiveBayes Model Recall Score: 0.831889
NaiveBayes Model FScore: 0.859825
NaiveBayes Model Confusion Matrix
[[15198   314]
 [ 1924  4165]]


In [112]:
y_nb_pred_test_tf = nb_model_tf.predict(X_test_tf)
score_model(y_test_tf, y_nb_pred_test_tf, False)

NaiveBayes Model Accuracy Score: 0.842143
NaiveBayes Model Precision Score: 0.863144
NaiveBayes Model Recall Score: 0.704740
NaiveBayes Model FScore: 0.740190
NaiveBayes Model Confusion Matrix
[[2056   37]
 [ 405  302]]


In [113]:
y_nb_pred_validate_tf = nb_model_tf.predict(X_validate_tf)
score_model(y_validate_tf, y_nb_pred_validate_tf, False)

NaiveBayes Model Accuracy Score: 0.842143
NaiveBayes Model Precision Score: 0.863144
NaiveBayes Model Recall Score: 0.704740
NaiveBayes Model FScore: 0.740190
NaiveBayes Model Confusion Matrix
[[2056   37]
 [ 405  302]]


## RBF SVM Model using CountVectorizer Data

### Training RBF SVM Model using CountVectorizer Data

In [78]:
svm_model_rbf = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
svm_model_rbf.fit(X_train, y_train)

### Scoring RBF SVM Model using CountVectorizer Data

In [82]:
y_pred_train_rbf = svm_model_rbf.predict(X_train)
score_model(y_train, y_pred_train_rbf)

SVM Model Accuracy Score: 0.718115
SVM Model Precision Score: 0.859057
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.417967
SVM Model Confusion Matrix
[[15512     0]
 [ 6089     0]]


In [80]:
y_pred_test_rbf = svm_model_rbf.predict(X_test)
score_model(y_test, y_pred_test_rbf)

SVM Model Accuracy Score: 0.747500
SVM Model Precision Score: 0.873750
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.427754
SVM Model Confusion Matrix
[[2093    0]
 [ 707    0]]


In [81]:
y_pred_validate_rbf = svm_model_rbf.predict(X_validate)
score_model(y_validate, y_pred_validate_rbf)

SVM Model Accuracy Score: 0.747500
SVM Model Precision Score: 0.873750
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.427754
SVM Model Confusion Matrix
[[2093    0]
 [ 707    0]]


## RBF SVM Model using TfIdfVectorizer Data

### Training RBF SVM Model using TfIdfVectorizer Data

In [117]:
svm_model_rbf_tf = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
svm_model_rbf_tf.fit(X_train_tf, y_train_tf)

### Scoring RBF SVM Model using TfIdfVectorizer Data

In [118]:
y_pred_train_rbf_tf = svm_model_rbf_tf.predict(X_train_tf)
score_model(y_train_tf, y_pred_train_rbf_tf)

SVM Model Accuracy Score: 0.718115
SVM Model Precision Score: 0.859057
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.417967
SVM Model Confusion Matrix
[[15512     0]
 [ 6089     0]]


In [119]:
y_pred_test_rbf_tf = svm_model_rbf_tf.predict(X_test_tf)
score_model(y_test_tf, y_pred_test_rbf_tf)

SVM Model Accuracy Score: 0.747500
SVM Model Precision Score: 0.873750
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.427754
SVM Model Confusion Matrix
[[2093    0]
 [ 707    0]]


In [120]:
y_pred_validate_rbf_tf = svm_model_rbf_tf.predict(X_validate_tf)
score_model(y_validate_tf, y_pred_validate_rbf_tf)

SVM Model Accuracy Score: 0.747500
SVM Model Precision Score: 0.873750
SVM Model Recall Score: 0.500000
SVM Model FScore: 0.427754
SVM Model Confusion Matrix
[[2093    0]
 [ 707    0]]


## Uniform Naive Bayes Model using CountVectorizer Data

### Training Uniform Naive Bayes Model using CountVectorizer Data

In [29]:
nb_model_uni = MultinomialNB(fit_prior=False)
nb_model_uni.fit(X_train, y_train)

### Scoring Uniform Naive Bayes Model using CountVectorizer Data

In [75]:
y_nb_pred_train_uni = nb_model_uni.predict(X_train)
score_model(y_train, y_nb_pred_train_uni, False)

NaiveBayes Model Accuracy Score: 0.891440
NaiveBayes Model Precision Score: 0.858623
NaiveBayes Model Recall Score: 0.893137
NaiveBayes Model FScore: 0.872465
NaiveBayes Model Confusion Matrix
[[13794  1718]
 [  627  5462]]


In [31]:
y_nb_pred_test_uni = nb_model_uni.predict(X_test)
score_model(y_test, y_nb_pred_test_uni, False)

NaiveBayes Model Accuracy Score: 0.852857
NaiveBayes Model Precision Score: 0.803495
NaiveBayes Model Recall Score: 0.814000
NaiveBayes Model FScore: 0.808464
NaiveBayes Model Confusion Matrix
[[1868  225]
 [ 187  520]]


In [32]:
y_nb_pred_validate_uni = nb_model_uni.predict(X_validate)
score_model(y_validate, y_nb_pred_validate_uni, False)

NaiveBayes Model Accuracy Score: 0.852857
NaiveBayes Model Precision Score: 0.803495
NaiveBayes Model Recall Score: 0.814000
NaiveBayes Model FScore: 0.808464
NaiveBayes Model Confusion Matrix
[[1868  225]
 [ 187  520]]


## Oversampling

In [33]:
over_sampler = RandomOverSampler()
X_train_res, y_train_res = over_sampler.fit_resample(X_train, y_train)
X_train_res.shape, y_train_res.shape

((31024, 43643), (31024,))

## Linear SVM Model with Oversampled Data

### Training Linear SVM Model with Oversampled Data

In [34]:
svm_model_res = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model_res.fit(X_train_res, y_train_res)

### Scoring Linear SVM Model with Oversampled Data

In [35]:
y_pred_train_res = svm_model_res.predict(X_train_res)
score_model(y_train_res, y_pred_train_res)

SVM Model Accuracy Score: 0.990233
SVM Model Precision Score: 0.990269
SVM Model Recall Score: 0.990233
SVM Model FScore: 0.990233
SVM Model Confusion Matrix
[[15294   218]
 [   85 15427]]


In [36]:
y_pred_test_res = svm_model_res.predict(X_test)
score_model(y_test, y_pred_test_res)

SVM Model Accuracy Score: 0.817500
SVM Model Precision Score: 0.760053
SVM Model Recall Score: 0.788945
SVM Model FScore: 0.771619
SVM Model Confusion Matrix
[[1772  321]
 [ 190  517]]


In [37]:
y_pred_validate_res = svm_model_res.predict(X_validate)
score_model(y_validate, y_pred_validate_res)

SVM Model Accuracy Score: 0.817500
SVM Model Precision Score: 0.760053
SVM Model Recall Score: 0.788945
SVM Model FScore: 0.771619
SVM Model Confusion Matrix
[[1772  321]
 [ 190  517]]


## Naive Bayes Model with Oversampled Data

### Training Naive Bayes Model with Oversampled Data

In [38]:
nb_model_res = MultinomialNB()
nb_model_res.fit(X_train_res, y_train_res)

### Scoring Naive Bayes Model with Oversampled Data

In [76]:
y_nb_pred_train_res = nb_model_res.predict(X_train_res)
score_model(y_train_res, y_nb_pred_train_res, False)

NaiveBayes Model Accuracy Score: 0.915453
NaiveBayes Model Precision Score: 0.916231
NaiveBayes Model Recall Score: 0.915453
NaiveBayes Model FScore: 0.915413
NaiveBayes Model Confusion Matrix
[[13865  1647]
 [  976 14536]]


In [40]:
y_nb_pred_test_res = nb_model_res.predict(X_test)
score_model(y_test, y_nb_pred_test_res, False)

NaiveBayes Model Accuracy Score: 0.836071
NaiveBayes Model Precision Score: 0.782344
NaiveBayes Model Recall Score: 0.815417
NaiveBayes Model FScore: 0.795513
NaiveBayes Model Confusion Matrix
[[1794  299]
 [ 160  547]]


In [41]:
y_nb_pred_validate_res = nb_model_res.predict(X_validate)
score_model(y_validate, y_nb_pred_validate_res, False)

NaiveBayes Model Accuracy Score: 0.836071
NaiveBayes Model Precision Score: 0.782344
NaiveBayes Model Recall Score: 0.815417
NaiveBayes Model FScore: 0.795513
NaiveBayes Model Confusion Matrix
[[1794  299]
 [ 160  547]]


## Preprocess Dataset with StopWords

In [42]:
df_train_wsw = pd.read_csv('../data/train.csv')[['text_a','label']] # drop row numbers column
df_test_wsw = pd.read_csv('../data/test.csv')
df_validate_wsw = pd.read_csv('../data/dev.csv')

In [43]:
df_train_wsw.text_a = df_train_wsw.text_a.apply(normalize_text, args=(False, False))
df_test_wsw.text_a = df_test_wsw.text_a.apply(normalize_text, args=(False, False))
df_validate_wsw.text_a = df_validate_wsw.text_a.apply(normalize_text, args=(False, False))

In [44]:
vectorizer_wsw = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
X_train_wsw, y_train_wsw = vectorizer_wsw.fit_transform(df_train_wsw.text_a), df_train_wsw.label
X_test_wsw, y_test_wsw = vectorizer_wsw.transform(df_test_wsw.text_a), df_test_wsw.label
X_validate_wsw, y_validate_wsw = vectorizer_wsw.transform(df_validate_wsw.text_a), df_validate_wsw.label

## Linear SVM Model using Data with StopWords

### Training Linear SVM Model using Data with StopWords

In [45]:
svm_model_wsw = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model_wsw.fit(X_train_wsw, y_train_wsw)

### Scoring Linear SVM Model using Data with StopWords

In [46]:
y_pred_train_wsw = svm_model_wsw.predict(X_train_wsw)
score_model(y_train_wsw, y_pred_train_wsw)

SVM Model Accuracy Score: 0.988658
SVM Model Precision Score: 0.985224
SVM Model Recall Score: 0.986815
SVM Model FScore: 0.986015
SVM Model Confusion Matrix
[[15373   139]
 [  106  5983]]


In [47]:
y_pred_test_wsw = svm_model_wsw.predict(X_test_wsw)
score_model(y_test_wsw, y_pred_test_wsw)

SVM Model Accuracy Score: 0.833929
SVM Model Precision Score: 0.779140
SVM Model Recall Score: 0.802276
SVM Model FScore: 0.789092
SVM Model Confusion Matrix
[[1813  280]
 [ 185  522]]


In [48]:
y_pred_validate_wsw = svm_model_wsw.predict(X_validate_wsw)
score_model(y_validate_wsw, y_pred_validate_wsw)

SVM Model Accuracy Score: 0.833929
SVM Model Precision Score: 0.779140
SVM Model Recall Score: 0.802276
SVM Model FScore: 0.789092
SVM Model Confusion Matrix
[[1813  280]
 [ 185  522]]


## Naive Bayes Model using Data with StopWords

### Training Naive Bayes Model using Data with StopWords

In [49]:
nb_model_wsw = MultinomialNB()
nb_model_wsw.fit(X_train_wsw, y_train_wsw)

### Scoring Naive Bayes Model using Data with StopWords

In [77]:
y_nb_pred_train_wsw = nb_model_wsw.predict(X_train_wsw)
score_model(y_train_wsw, y_nb_pred_train_wsw)

SVM Model Accuracy Score: 0.901116
SVM Model Precision Score: 0.872708
SVM Model Recall Score: 0.890446
SVM Model FScore: 0.880793
SVM Model Confusion Matrix
[[14192  1320]
 [  816  5273]]


In [51]:
y_nb_pred_test_wsw = nb_model_wsw.predict(X_test_wsw)
score_model(y_test_wsw, y_nb_pred_test_wsw)

SVM Model Accuracy Score: 0.869643
SVM Model Precision Score: 0.834547
SVM Model Recall Score: 0.809774
SVM Model FScore: 0.820874
SVM Model Confusion Matrix
[[1948  145]
 [ 220  487]]


In [52]:
y_nb_pred_validate_wsw = nb_model_wsw.predict(X_validate_wsw)
score_model(y_validate_wsw, y_nb_pred_validate_wsw)

SVM Model Accuracy Score: 0.869643
SVM Model Precision Score: 0.834547
SVM Model Recall Score: 0.809774
SVM Model FScore: 0.820874
SVM Model Confusion Matrix
[[1948  145]
 [ 220  487]]


## Bigram-Trigram

In [53]:
df_train_bg = pd.read_csv('../data/train.csv')[['text_a','label']] # drop row numbers column
df_test_bg = pd.read_csv('../data/test.csv')
df_validate_bg = pd.read_csv('../data/dev.csv')

In [54]:
df_train_bg.text_a = df_train_bg.text_a.apply(normalize_text, args=(False, True))
df_test_bg.text_a = df_test_bg.text_a.apply(normalize_text, args=(False, True))
df_validate_bg.text_a = df_validate_bg.text_a.apply(normalize_text, args=(False, True))

In [55]:
vectorizer_bg = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, ngram_range=(2,2))
X_train_bg, y_train_bg = vectorizer_bg.fit_transform(df_train_bg.text_a), df_train_bg.label
X_test_bg, y_test_bg = vectorizer_bg.transform(df_test_bg.text_a), df_test_bg.label
X_validate_bg, y_validate_bg = vectorizer_bg.transform(df_validate_bg.text_a), df_validate_bg.label
print(len(vectorizer_bg.get_feature_names_out()))

153416


## Linear SVM Model using Bigram Data

### Training Linear SVM Model using Bigram Data

In [56]:
svm_model_bg = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model_bg.fit(X_train_bg, y_train_bg)

### Scoring Linear SVM Model using Bigram Data

In [57]:
y_pred_train_bg = svm_model_bg.predict(X_train_bg)
score_model(y_train_bg, y_pred_train_bg)

SVM Model Accuracy Score: 0.993010
SVM Model Precision Score: 0.990298
SVM Model Recall Score: 0.992489
SVM Model FScore: 0.991386
SVM Model Confusion Matrix
[[15414    98]
 [   53  6036]]


In [58]:
y_pred_test_bg = svm_model_bg.predict(X_test_bg)
score_model(y_test_bg, y_pred_test_bg)

SVM Model Accuracy Score: 0.828929
SVM Model Precision Score: 0.778714
SVM Model Recall Score: 0.747416
SVM Model FScore: 0.760457
SVM Model Confusion Matrix
[[1909  184]
 [ 295  412]]


In [59]:
y_pred_validate_bg = svm_model_bg.predict(X_validate_bg)
score_model(y_validate_bg, y_pred_validate_bg)

SVM Model Accuracy Score: 0.828929
SVM Model Precision Score: 0.778714
SVM Model Recall Score: 0.747416
SVM Model FScore: 0.760457
SVM Model Confusion Matrix
[[1909  184]
 [ 295  412]]


## Naive Bayes Model with Bigram Data

### Training Naive Bayes Model with Bigram Data

In [60]:
nb_model_bg = MultinomialNB()
nb_model_bg.fit(X_train_bg, y_train_bg)

### Scoring Naive Bayes Model with Bigram Data

In [61]:
y_nb_pred_train_bg = nb_model_bg.predict(X_train_bg)
score_model(y_train_bg, y_nb_pred_train_bg)

SVM Model Accuracy Score: 0.977084
SVM Model Precision Score: 0.965134
SVM Model Recall Score: 0.979954
SVM Model FScore: 0.972151
SVM Model Confusion Matrix
[[15099   413]
 [   82  6007]]


In [62]:
y_nb_pred_test_bg = nb_model_bg.predict(X_test_bg)
score_model(y_test_bg, y_nb_pred_test_bg)

SVM Model Accuracy Score: 0.834643
SVM Model Precision Score: 0.780784
SVM Model Recall Score: 0.782616
SVM Model FScore: 0.781691
SVM Model Confusion Matrix
[[1858  235]
 [ 228  479]]


In [63]:
y_nb_pred_validate_bg = nb_model_bg.predict(X_validate_bg)
score_model(y_validate_bg, y_nb_pred_validate_bg)

SVM Model Accuracy Score: 0.834643
SVM Model Precision Score: 0.780784
SVM Model Recall Score: 0.782616
SVM Model FScore: 0.781691
SVM Model Confusion Matrix
[[1858  235]
 [ 228  479]]


In [64]:
df_train_tg = pd.read_csv('../data/train.csv')[['text_a','label']] # drop row numbers column
df_test_tg = pd.read_csv('../data/test.csv')
df_validate_tg = pd.read_csv('../data/dev.csv')

In [65]:
df_train_tg.text_a = df_train_tg.text_a.apply(normalize_text, args=(False, True))
df_test_tg.text_a = df_test_tg.text_a.apply(normalize_text, args=(False, True))
df_validate_tg.text_a = df_validate_tg.text_a.apply(normalize_text, args=(False, True))

In [66]:
vectorizer_tg = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, ngram_range=(3,3))
X_train_tg, y_train_tg = vectorizer_tg.fit_transform(df_train_tg.text_a), df_train_tg.label
X_test_tg, y_test_tg = vectorizer_tg.transform(df_test_tg.text_a), df_test_tg.label
X_validate_tg, y_validate_tg = vectorizer_tg.transform(df_validate_tg.text_a), df_validate_tg.label
print(len(vectorizer_tg.get_feature_names_out()))

175324


## Linear SVM Model using Trigram Data

### Training Linear SVM Model using Trigram Data

In [67]:
svm_model_tg = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_model_tg.fit(X_train_tg, y_train_tg)

### Scoring Linear SVM Model using Trigram Data

In [68]:
y_pred_train_tg = svm_model_tg.predict(X_train_tg)
score_model(y_train_tg, y_pred_train_tg)

SVM Model Accuracy Score: 0.991158
SVM Model Precision Score: 0.989056
SVM Model Recall Score: 0.989105
SVM Model FScore: 0.989080
SVM Model Confusion Matrix
[[15416    96]
 [   95  5994]]


In [69]:
y_pred_test_tg = svm_model_tg.predict(X_test_tg)
score_model(y_test_tg, y_pred_test_tg)

SVM Model Accuracy Score: 0.816786
SVM Model Precision Score: 0.818723
SVM Model Recall Score: 0.662489
SVM Model FScore: 0.689918
SVM Model Confusion Matrix
[[2039   54]
 [ 459  248]]


In [70]:
y_pred_validate_tg = svm_model_tg.predict(X_validate_tg)
score_model(y_validate_tg, y_pred_validate_tg)

SVM Model Accuracy Score: 0.816786
SVM Model Precision Score: 0.818723
SVM Model Recall Score: 0.662489
SVM Model FScore: 0.689918
SVM Model Confusion Matrix
[[2039   54]
 [ 459  248]]


## Naive Bayes Model using Trigram Data

### Training Naive Bayes Model using Trigram Data

In [71]:
nb_model_tg = MultinomialNB()
nb_model_tg.fit(X_train_tg, y_train_tg)

### Scoring Naive Bayes Model using Trigram Data

In [72]:
y_nb_pred_train_tg = nb_model_tg.predict(X_train_tg)
score_model(y_train_tg, y_nb_pred_train_tg, False)

NaiveBayes Model Accuracy Score: 0.984121
NaiveBayes Model Precision Score: 0.977503
NaiveBayes Model Recall Score: 0.983657
NaiveBayes Model FScore: 0.980516
NaiveBayes Model Confusion Matrix
[[15275   237]
 [  106  5983]]


In [73]:
y_nb_pred_test_tg = nb_model_tg.predict(X_test_tg)
score_model(y_test_tg, y_nb_pred_test_tg, False)

NaiveBayes Model Accuracy Score: 0.816429
NaiveBayes Model Precision Score: 0.764842
NaiveBayes Model Recall Score: 0.715639
NaiveBayes Model FScore: 0.733489
NaiveBayes Model Confusion Matrix
[[1924  169]
 [ 345  362]]


In [74]:
y_nb_pred_validate_tg = nb_model_tg.predict(X_validate_tg)
score_model(y_validate_tg, y_nb_pred_validate_tg, False)

NaiveBayes Model Accuracy Score: 0.816429
NaiveBayes Model Precision Score: 0.764842
NaiveBayes Model Recall Score: 0.715639
NaiveBayes Model FScore: 0.733489
NaiveBayes Model Confusion Matrix
[[1924  169]
 [ 345  362]]
