# To Identify Terrorist Events using Event Triggers

## Imports

### Import Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Pre-defined Libraries/Packages/Modules/Methods

In [0]:
import pandas as pd
import numpy as np
import spacy as sp
import pickle
import os
from collections import OrderedDict
from collections import defaultdict
from collections import Counter
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import xml.etree.ElementTree as ET

### Installations/Loading

In [3]:
!pip3 install contractions
import contractions



In [4]:
!python -m spacy download en_core_web_lg
!pip3 install spacy-wordnet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [5]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
!python -m nltk.downloader wordnet

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Import Custom Modules

## Global Objects


In [0]:
nlp = sp.load('en_core_web_lg')

In [0]:
HOME_PATH = './drive/My Drive/Colab Notebooks/201916007_PROJECT_IT550'
DATABASES_PATH = HOME_PATH + '/DATABASES'
INTERMEDIATE_FILES_PATH = HOME_PATH + '/INTERMEDIATE_FILES'
FIRE_DATABASE_TRAIN_PATH = DATABASES_PATH + '/data/English/Train'
FIRE_DATABASE_TEST_PATH = DATABASES_PATH + '/data/English/Test'
LEM = WordNetLemmatizer()
Encoder = LabelEncoder()

## Prepare data for classification

### Preprocessing

In [0]:
def preprocessing(data):
    """Function to preprocess the raw data"""
    
    global LEM

    # Remove contractions
    contractions_removed = contractions.fix(data.lower(), leftovers=True, slang=True)
    
    # Remove urls
    url_regex = r'\b((?:https?://)+(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b'
    urls_extracted = re.findall(url_regex, contractions_removed)
    contractions_removed = re.sub(url_regex, " ", contractions_removed, flags=re.MULTILINE)
    
    # Remove punctuations and digits
    punc_num_to_be_removed = string.punctuation+string.digits
    punctuation_removed = ''.join([char for char in contractions_removed if char not in punc_num_to_be_removed])
    
    # Remove multiple whitespace
    multi_whitespace_removed = re.sub(r'\s+', ' ', punctuation_removed)

    # Tokenization
    tokens = word_tokenize(multi_whitespace_removed)

    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    Final_words = []
    for word, tag in pos_tag(tokens):
      if word not in stopwords.words('english') and word.isalpha():
        word_Final = LEM.lemmatize(word,tag_map[tag[0]])
        Final_words.append(word_Final)


    return Final_words
    

### FIRE DATABASE

In [0]:
train_file_list = sorted(os.listdir(FIRE_DATABASE_TRAIN_PATH))
test_file_list = sorted(os.listdir(FIRE_DATABASE_TEST_PATH))
train_file_dict = OrderedDict()
test_file_dict = OrderedDict()

In [0]:
def find_rec(node, element, result):
  for item in node.iter(element):
    result.append(item)
  return result

def get_label(label_elements):
  if label_elements:
    return  Counter(
        [(label.attrib['ID'],label.attrib['TYPE'])
        for label in label_elements])\
        .most_common(1)[0][0]

def get_data(node):
  data = ''
  for elem in node.iter():
    if elem.text:
      data += ' ' + elem.text
  return data

def extract_data_label_from_xml(xmlfile, idx, file_dict):

  tree = ET.parse(xmlfile)
  root = tree.getroot()
  
  label = get_label(find_rec(tree, 'MAN_MADE_EVENT', []))
  if label:
    file_dict[idx] = {'label':'', 'data':''}
    file_dict[idx]['label'] = label
    file_dict[idx]['data'] = get_data(root)

In [0]:
def extract_data_label_from_xml_file_list(file_list, file_dict, path):
  for idx,xmlfile in enumerate(file_list):
    extract_data_label_from_xml(xmlfile=path+'/'+xmlfile,
                                idx=idx,file_dict=file_dict)

## DONE!!!
# extract_data_label_from_xml_file_list(train_file_list,train_file_dict,
#                                       FIRE_DATABASE_TRAIN_PATH)
# extract_data_label_from_xml_file_list(test_file_list,test_file_dict,
#                                       FIRE_DATABASE_TEST_PATH)

In [0]:
def create_list_of_preprocessed_data(file_dict):
  file_data_list = list()
  file_label_list = list()
  for key, val in file_dict.items():
    file_data_list.append(preprocessing(val['data']))
    file_label_list.append(val['label'])
  return file_data_list, file_label_list

In [0]:
## DONE!!!
# train_file_data_list, train_file_label_list = create_list_of_preprocessed_data(train_file_dict)
# test_file_data_list, test_file_label_list = create_list_of_preprocessed_data(test_file_dict)

In [0]:
# pickle.dump(train_file_dict, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_dict.txt', 'wb'))
# pickle.dump(train_file_data_list, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_data_list.txt', 'wb'))
# pickle.dump(train_file_label_list, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_label_list.txt', 'wb'))

In [0]:
# pickle.dump(test_file_dict, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_dict.txt', 'wb'))
# pickle.dump(test_file_data_list, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_data_list.txt', 'wb'))
# pickle.dump(test_file_label_list, open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_label_list.txt', 'wb'))

In [0]:
train_file_dict = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_dict.txt', 'rb'))
train_file_data_list = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_data_list.txt', 'rb'))
train_file_label_list = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_train_file_label_list.txt', 'rb'))

test_file_dict = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_dict.txt', 'rb'))
test_file_data_list = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_data_list.txt', 'rb'))
test_file_label_list = pickle.load(open(INTERMEDIATE_FILES_PATH+'/FIREDATASET_test_file_label_list.txt', 'rb'))

In [0]:
train_labels = list(zip(*train_file_label_list))[1]
test_labels = list(zip(*test_file_label_list))[1]

## Classification



In [0]:
Train_X = [' '.join(doc) for doc in train_file_data_list]
Test_X = [' '.join(doc) for doc in test_file_data_list]
Train_Y = train_labels
Test_Y = test_labels
corpus = Train_X + Test_X

In [0]:
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [0]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(corpus)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### Naive Bayes Classifier

In [0]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)

In [22]:
print(classification_report(predictions_NB, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.44      0.59       146
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.50      0.05      0.08        22
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          15       0.00      0.00      0.00        17

    accuracy                           0.35       187
   macro avg       0.09      0.03      0.04       187
weighted avg       0.75   

In [23]:
print(precision_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))

0.7528223410576351
0.34759358288770054
0.4682251549493859


### SVM Classifier

In [0]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [25]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.61      0.47      0.53        94
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        17
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         4
          14       0.00      0.00      0.00        13
          15       0.00      0.00      0.00        44

    accuracy                           0.24       187
   macro avg       0.04   

In [26]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

0.30718954248366015
0.23529411764705882
0.26647767540751244


In [0]:
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [28]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.39      0.56       187
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0

    accuracy                           0.39       187
   macro avg       0.07      0.03      0.04       187
weighted avg       1.00      0.39      0.56       187



In [29]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

1.0
0.3850267379679144
0.555984555984556


### Random Forest

In [0]:
RF = RandomForestClassifier()
RF.fit(Train_X_Tfidf,Train_Y)
predictions_RF = RF.predict(Test_X_Tfidf)

In [31]:
print(classification_report(predictions_RF, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.62      0.58      0.60        78
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         0
           4       0.50      0.03      0.05        37
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         8
          15       0.00      0.00      0.00        46

    accuracy                           0.25       187
   macro avg       0.07   

In [32]:
print(precision_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))

0.35962566844919786
0.24598930481283418
0.26041409570821333


### XGBoost Classifier

In [0]:
XGB = XGBClassifier()
XGB.fit(Train_X_Tfidf,Train_Y)
predictions_XGB = XGB.predict(Test_X_Tfidf)

In [34]:
print(classification_report(predictions_XGB, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.62      0.61      0.62        74
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         6
           4       0.50      0.05      0.10        19
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00        13
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00        24
          15       0.00      0.00      0.00        34

    accuracy                           0.25       187
   macro avg       0.07   

In [35]:
print(precision_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))

0.29812834224598933
0.24598930481283424
0.253614770939509


## Classification N-Gram (2,3)

In [0]:
Tfidf_vect = TfidfVectorizer(ngram_range=(2,3))
Tfidf_vect.fit(corpus)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### Naive Bayes Classifier

In [0]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)

In [38]:
print(classification_report(predictions_NB, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.96      0.41      0.57       169
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        12
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         5

    accuracy                           0.37       187
   macro avg       0.06   

In [39]:
print(precision_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))

0.8660873440285205
0.3689839572192513
0.517496172365589


### SVM Classifier

In [0]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [41]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.41      0.57       158
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00        14

    accuracy                           0.35       187
   macro avg       0.06   

In [42]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

0.7627748068924539
0.34759358288770054
0.4775633573587538


In [0]:
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [44]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.39      0.56       187
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0

    accuracy                           0.39       187
   macro avg       0.07      0.03      0.04       187
weighted avg       1.00      0.39      0.56       187



In [45]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

1.0
0.3850267379679144
0.555984555984556


### Random Forest

In [0]:
RF = RandomForestClassifier()
RF.fit(Train_X_Tfidf,Train_Y)
predictions_RF = RF.predict(Test_X_Tfidf)

In [47]:
print(classification_report(predictions_RF, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.41      0.57       157
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         3
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         9

    accuracy                           0.35       187
   macro avg       0.06   

In [48]:
print(precision_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_RF, Test_Y, zero_division=0, average='weighted'))

0.757947118241236
0.34759358288770054
0.47661303505125746


### XGBoost Classifier

In [0]:
XGB = XGBClassifier()
XGB.fit(Train_X_Tfidf,Train_Y)
predictions_XGB = XGB.predict(Test_X_Tfidf)

In [50]:
print(classification_report(predictions_XGB, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.62      0.46      0.53        97
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        17
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00        18
          15       0.00      0.00      0.00        39

    accuracy                           0.24       187
   macro avg       0.04   

In [51]:
print(precision_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_XGB, Test_Y, zero_division=0, average='weighted'))

0.32419786096256686
0.24064171122994651
0.2762395975065658


## Classification N-gram (3,4)

In [0]:
Tfidf_vect = TfidfVectorizer(ngram_range=(3,4))
Tfidf_vect.fit(corpus)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### Naive Bayes Classifier

In [0]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)

In [54]:
print(classification_report(predictions_NB, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      0.39      0.55       184
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1

    accuracy                           0.38       187
   macro avg       0.06   

In [55]:
print(precision_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_NB, Test_Y, zero_division=0, average='weighted'))

0.9702911467617351
0.37967914438502676
0.5457887700534759


### SVM Classifier

In [0]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [57]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      0.39      0.56       182
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2

    accuracy                           0.38       187
   macro avg       0.06   

In [58]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

0.9597445038621509
0.37967914438502676
0.5441071202998021


In [0]:
SVM = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [60]:
print(classification_report(predictions_SVM, Test_Y, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.39      0.56       187
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0

    accuracy                           0.39       187
   macro avg       0.07      0.03      0.04       187
weighted avg       1.00      0.39      0.56       187



In [61]:
print(precision_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(recall_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))
print(f1_score(predictions_SVM, Test_Y, zero_division=0, average='weighted'))

1.0
0.3850267379679144
0.555984555984556
