In [None]:
# Read and Split the given Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Text Normalizatin Libraries

import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Feature Extraction Using Feature Engineering Models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Models Library

# Classifier Types
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Support Vector Machines
from sklearn.svm import SVC

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Evaluation metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# read data from the json files

train_df = pd.read_json("train.jsonl", lines = True)
validation_df = pd.read_json("validation.jsonl", lines = True)

In [None]:
print("train data shape:",train_df.shape)
print("validation data shape:",validation_df.shape)

train data shape: (3200, 14)
validation data shape: (800, 14)


In [None]:
# Merge the complete data to divide the data train, test and validation

total_df = pd.concat([train_df, validation_df], ignore_index = True)

In [None]:
print("total data shape:",total_df.shape)

total data shape: (4000, 14)


In [None]:
print(total_df.columns)

Index(['uuid', 'postId', 'postText', 'postPlatform', 'targetParagraphs',
       'targetTitle', 'targetDescription', 'targetKeywords', 'targetMedia',
       'targetUrl', 'provenance', 'spoiler', 'spoilerPositions', 'tags'],
      dtype='object')


In [None]:
print("uuid:", total_df.loc[0, "uuid"])
print("postId:", total_df.loc[0, "postId"])
print("postText:", total_df.loc[0, "postPlatform"])
print("targetParagraphs:", total_df.loc[0, "targetParagraphs"])
print("targetTitle:", total_df.loc[0, "targetTitle"])
print("targetDescription:", total_df.loc[0, "targetDescription"])
print("targetKeywords:", total_df.loc[0, "targetKeywords"])
print("tragetMedia:", total_df.loc[0, "targetMedia"])
print("targetUrl:", total_df.loc[0, "targetUrl"])
print("provenance:", total_df.loc[0, "provenance"])
print("spoiler:", total_df.loc[0, "spoiler"])
print("spoilerPositions:", total_df.loc[0, "spoilerPositions"])
print("tags:", total_df.loc[0, "tags"])

uuid: 0af11f6b-c889-4520-9372-66ba25cb7657
postId: 532quh
postText: reddit
targetParagraphs: ['It’ll be just like old times this weekend for Tom Brady and Wes Welker.', 'Welker revealed Friday morning on a Miami radio station that he contacted Brady because he’ll be in town for Sunday’s game between the New England Patriots and Miami Dolphins at Gillette Stadium. It seemed like a perfect opportunity for the two to catch up.', 'But Brady’s definition of "catching up" involves far more than just a meal. In fact, it involves some literal "catching" as the Patriots quarterback looks to stay sharp during his four-game Deflategate suspension.', '"I hit him up to do dinner Saturday night. He’s like, ‘I’m going to be flying in from Ann Arbor later (after the Michigan-Colorado football game), but how about that morning we go throw?’ " Welker said on WQAM, per The Boston Globe. "And I’m just sitting there, I’m like, ‘I was just thinking about dinner, but yeah, sure. I’ll get over there early and

In [None]:
# Drop unwanted columns

total_df = total_df[["targetTitle", "targetDescription", "tags"]]

In [None]:
print(total_df.columns)

Index(['targetTitle', 'targetDescription', 'tags'], dtype='object')


In [None]:
total_df.rename(columns={"targetTitle":"Post", "targetDescription":"Content", "tags":"Spoiler_Type"}, inplace = True)
total_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_df.rename(columns={"targetTitle":"Post", "targetDescription":"Content", "tags":"Spoiler_Type"}, inplace = True)


Unnamed: 0,Post,Content,Spoiler_Type
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",It'll be just like old times this weekend for ...,[passage]
1,Hole In Ozone Layer Expected To Make Full Reco...,2070 is shaping up to be a great year for Moth...,[phrase]
2,Intellectual Stimulation Trumps Money For Empl...,By: Chad Brooks \r\nPublished: 09/18/2013 06:4...,[phrase]
3,"‘Follow your passion’ is wrong, here are 7 hab...",There's a lot more to work that loving your job,[multi]
4,Revealed: The perfect way to cook rice so that...,The question 'How does one cook rice properly?...,[phrase]


In [None]:
# Function to Make values in Spoiler_Type(list to string)
def list_to_string(spoiler_type):
  if spoiler_type[0] == "passage":
    return "passage"
  elif spoiler_type[0] == "phrase":
    return "phrase"
  else:
    return "multi"

In [None]:
total_df = total_df.values.tolist()

In [None]:
val = 0
for each in total_df:
  each[1] = str(each[0]) + " ? "+ str(each[1])
  total_df[val][1] = each[1]
  val += 1

In [None]:
total_df = pd.DataFrame(total_df, columns = ["Post", "Content", "Spoiler_Type"])

In [None]:
total_df["Spoiler_Type"] = total_df["Spoiler_Type"].apply(list_to_string)

In [None]:
total_df.head(5)

Unnamed: 0,Post,Content,Spoiler_Type
0,"Wes Welker Wanted Dinner With Tom Brady, But P...","Wes Welker Wanted Dinner With Tom Brady, But P...",passage
1,Hole In Ozone Layer Expected To Make Full Reco...,Hole In Ozone Layer Expected To Make Full Reco...,phrase
2,Intellectual Stimulation Trumps Money For Empl...,Intellectual Stimulation Trumps Money For Empl...,phrase
3,"‘Follow your passion’ is wrong, here are 7 hab...","‘Follow your passion’ is wrong, here are 7 hab...",multi
4,Revealed: The perfect way to cook rice so that...,Revealed: The perfect way to cook rice so that...,phrase


In [None]:
total_df.drop(["Post"], inplace = True, axis = 1)

In [None]:
total_df.head(5)

Unnamed: 0,Content,Spoiler_Type
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",passage
1,Hole In Ozone Layer Expected To Make Full Reco...,phrase
2,Intellectual Stimulation Trumps Money For Empl...,phrase
3,"‘Follow your passion’ is wrong, here are 7 hab...",multi
4,Revealed: The perfect way to cook rice so that...,phrase


# Text Normalization

In [None]:
# define a function for text normalization
def normalize_text(text):
    # convert to lowercase
    # print(text)

    text = text.lower()
    
    # replace contractions
    text = contractions.fix(text)
    
    # remove special characters
    text = re.sub(r'\W', ' ', text)
    
    # remove digits
    text = re.sub(r'\d', '', text)
    
    # tokenize the text
    tokens = word_tokenize(text)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    normalized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # join tokens back into a string
    normalized_text = ' '.join(normalized_tokens)
    
    return normalized_text

In [None]:
total_df["Content"] = total_df["Content"].apply(normalize_text)

Wes Welker Wanted Dinner With Tom Brady, But Patriots QB Had A Better Idea ? It'll be just like old times this weekend for Tom Brady and Wes Welker. Welker revealed Friday morning on a Miami radio station that he contacted Brady because he'll be in town for Sunday's game between the New England Patriots and Miami Dolphins at Gillette Stadium.
Hole In Ozone Layer Expected To Make Full Recovery By 2070: NASA ? 2070 is shaping up to be a great year for Mother Earth.

That's when NASA scientists are predicting the
Intellectual Stimulation Trumps Money For Employee Happiness, Survey Finds ? By: Chad Brooks 
Published: 09/18/2013 06:40 AM EDT on BusinessNewsDaily

 Despite common belief, money isn't the key to employee happiness, new re...
‘Follow your passion’ is wrong, here are 7 habits you need instead ? There's a lot more to work that loving your job
Revealed: The perfect way to cook rice so that it's perfectly fluffy and NEVER sticks to the bottom of the pan  ? The question 'How does on

In [None]:
total_df.head(5)

Unnamed: 0,Content,Spoiler_Type
0,wes welker wanted dinner tom brady patriot qb ...,passage
1,hole ozone layer expected make full recovery n...,phrase
2,intellectual stimulation trump money employee ...,phrase
3,follow passion wrong habit need instead lot wo...,multi
4,revealed perfect way cook rice perfectly fluff...,phrase


In [None]:
total_df.rename(columns = {"Content": "Tokenized_Content"}, inplace=True)

In [None]:
def spoiler_type_to_num(text):
  if text == "phrase":
    return 0
  elif text == "passage":
    return 1
  elif text == "multi":
    return 2

In [None]:
total_df["Spoiler_Type"] = total_df["Spoiler_Type"].apply(spoiler_type_to_num)

In [None]:
total_df["Spoiler_Type"].head(5)

0    1
1    0
2    0
3    2
4    0
Name: Spoiler_Type, dtype: int64

In [None]:
# Split the data into train, validation, test 

X, X_test, Y, Y_test = train_test_split(total_df["Tokenized_Content"], total_df["Spoiler_Type"], random_state = 4, test_size = 0.20, stratify = total_df["Spoiler_Type"])
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.10, random_state = 4, stratify = Y)

In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

print(total_df["Spoiler_Type"].shape)

print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)

(2880,)
(320,)
(800,)
(4000,)
(2880,)
(320,)
(800,)


In [None]:
print(type(X_train))
print(type(X_val))
print(type(X_test))

print(type(Y_train))
print(type(Y_val))
print(type(Y_test))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [None]:
Y_test

2429    0
1181    2
3149    1
2179    1
3422    1
       ..
1111    0
1025    0
2370    2
536     1
2716    0
Name: Spoiler_Type, Length: 800, dtype: int64

In [None]:
X_train = pd.DataFrame(X_train, columns = ["Tokenized_Content"])
X_val = pd.DataFrame(X_val, columns = ["Tokenized_Content"])
X_test = pd.DataFrame(X_test, columns = ["Tokenized_Content"])

Y_train = pd.DataFrame(Y_train, columns = ["Spoiler_Type"])
Y_val = pd.DataFrame(Y_val, columns = ["Spoiler_Type"])
Y_test = pd.DataFrame(Y_test, columns = ["Spoiler_Type"])

In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

print(total_df["Spoiler_Type"].shape)

print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)

(2880, 1)
(320, 1)
(800, 1)
(4000,)
(2880, 1)
(320, 1)
(800, 1)


# Feature Engineering

In [None]:
# Feature Engineering
  # Extract Features based on model
def get_features(train_data, val_data, test_data, model="bag_of_words"):
  if model == "bag_of_words":
    vectorizer = CountVectorizer()
    vectorizer.fit(train_data["Tokenized_Content"])
    return vectorizer.transform(train_data["Tokenized_Content"]), vectorizer.transform(val_data["Tokenized_Content"]), vectorizer.transform(test_data["Tokenized_Content"])
  elif model == "tf_idf_model":
    vectorizer = TfidfVectorizer(stop_words = "english", max_features=10000)
    return vectorizer.fit_transform(train_data["Tokenized_Content"]), vectorizer.transform(val_data["Tokenized_Content"]), vectorizer.transform(test_data["Tokenized_Content"])    
  elif model == "****":
    pass

In [None]:
Y_test.shape

(800, 1)

# Logistic Regression

In [None]:
# Using Bag Of Words Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "bag_of_words")

# 1. Multi-Class
model = LogisticRegression(max_iter = 1000, multi_class = "multinomial", class_weight = "balanced")
model.fit(train_x_bag, Y_train)

predict_val_multi_bag = model.predict(val_x_bag)
predict_test_multi_bag = model.predict(test_x_bag)

print("Y_val shape:", Y_val.shape)
print("Predict_val_multi_bag:", predict_val_multi_bag.shape)

print("Y_test shape:", Y_test.shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)
  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for Multi Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_multi_bag, average="weighted")))
print("F1-Score for Multi Class Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_multi_bag, average="weighted")))
print("\n")
print("Accuracy-Score for Multi Class Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_multi_bag)))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs Rest \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = OneVsOneClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs One \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


  y = column_or_1d(y, warn=True)


Y_val shape: (320, 1)
Predict_val_multi_bag: (320,)
Y_test shape: (800, 1)
Predict_test_multi_bag: (800,)
F1-Score Multi Class 

F1-Score for Multi Class Validation Data:0.48%
F1-Score for Multi Class Test Data:0.46%


Accuracy-Score for Multi Class Validation Data:0.48%
Accuracy-Score for Multi Class Test Data:0.46%
F1-Score One Vs Rest 

F1-Score for One Vs Rest Class Validation Data:0.47%
F1-Score for One Vs Rest Test Data:0.47%


Accuracy-Score for One Vs Rest Validation Data:0.47%
Accuracy-Score for One Vs Rest Test Data:0.47%


  y = column_or_1d(y, warn=True)


F1-Score One Vs One 

F1-Score for One Vs One Class Validation Data:0.48%
F1-Score for One Vs One Test Data:0.48%


Accuracy-Score for One Vs One Validation Data:0.48%
Accuracy-Score for One Vs One Test Data:0.48%


In [None]:
# Using TF-IDF Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "tf_idf_model")

# 1. Multi-Class
model = LogisticRegression(max_iter = 1000, multi_class = "multinomial", class_weight = "balanced")
model.fit(train_x_bag, Y_train)

predict_val_multi_bag = model.predict(val_x_bag)
predict_test_multi_bag = model.predict(test_x_bag)

print("Y_val shape:", Y_val.shape)
print("Predict_val_multi_bag:", predict_val_multi_bag.shape)

print("Y_test shape:", Y_test.shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)
  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for Multi Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_multi_bag, average="weighted")))
print("F1-Score for Multi Class Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_multi_bag, average="weighted")))
print("\n")
print("Accuracy-Score for Multi Class Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_multi_bag)))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = OneVsOneClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


  y = column_or_1d(y, warn=True)


Y_val shape: (320, 1)
Predict_val_multi_bag: (320,)
Y_test shape: (800, 1)
Predict_test_multi_bag: (800,)
F1-Score Multi Class 

F1-Score for Multi Class Validation Data:0.49%
F1-Score for Multi Class Test Data:0.49%


Accuracy-Score for Multi Class Validation Data:0.49%
Accuracy-Score for Multi Class Test Data:0.49%
F1-Score Multi Class 

F1-Score for One Vs Rest Class Validation Data:0.50%
F1-Score for One Vs Rest Test Data:0.49%


Accuracy-Score for One Vs Rest Validation Data:0.50%
Accuracy-Score for One Vs Rest Test Data:0.49%


  y = column_or_1d(y, warn=True)


F1-Score Multi Class 

F1-Score for One Vs One Class Validation Data:0.50%
F1-Score for One Vs One Test Data:0.49%


Accuracy-Score for One Vs One Validation Data:0.49%
Accuracy-Score for One Vs One Test Data:0.49%


In [None]:
# Using **** Model
    # 1. Multi-Class
    # 2. OneVsRest
    # 3. OneVsOne

# Support Vector Machines

In [None]:
# Using Bag Of Words Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "bag_of_words")

# 1. Multi-Class
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovr")
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs Rest Class \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovo")
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs One \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


  y = column_or_1d(y, warn=True)


F1-Score One Vs Rest Class 

F1-Score for One Vs Rest Class Validation Data:0.36%
F1-Score for One Vs Rest Test Data:0.38%


Accuracy-Score for One Vs Rest Validation Data:0.44%
Accuracy-Score for One Vs Rest Test Data:0.44%


  y = column_or_1d(y, warn=True)


F1-Score One Vs One 

F1-Score for One Vs One Class Validation Data:0.36%
F1-Score for One Vs One Test Data:0.38%


Accuracy-Score for One Vs One Validation Data:0.44%
Accuracy-Score for One Vs One Test Data:0.44%


In [None]:
# Using TF-IDF Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "tf_idf_model")

# 1. Multi-Class
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovr")
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs Rest Class \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovo")
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs One \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


  y = column_or_1d(y, warn=True)


F1-Score One Vs Rest Class 

F1-Score for One Vs Rest Class Validation Data:0.35%
F1-Score for One Vs Rest Test Data:0.33%


Accuracy-Score for One Vs Rest Validation Data:0.44%
Accuracy-Score for One Vs Rest Test Data:0.43%


  y = column_or_1d(y, warn=True)


F1-Score One Vs One 

F1-Score for One Vs One Class Validation Data:0.35%
F1-Score for One Vs One Test Data:0.33%


Accuracy-Score for One Vs One Validation Data:0.44%
Accuracy-Score for One Vs One Test Data:0.43%


In [None]:
# Using **** Model
    # 1. Multi-Class
    # 2. OneVsRest
    # 3. OneVsOne

# Naive Bayes

In [None]:
# Using Bag Of Words Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "bag_of_words")

# 1. Multi-Class
model = MultinomialNB()
model.fit(train_x_bag, Y_train)

predict_val_multi_bag = model.predict(val_x_bag)
predict_test_multi_bag = model.predict(test_x_bag)

print("Y_val shape:", Y_val.shape)
print("Predict_val_multi_bag:", predict_val_multi_bag.shape)

print("Y_test shape:", Y_test.shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)
  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for Multi Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_multi_bag, average="weighted")))
print("F1-Score for Multi Class Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_multi_bag, average="weighted")))
print("\n")
print("Accuracy-Score for Multi Class Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_multi_bag)))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(MultinomialNB())
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs Rest \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = OneVsOneClassifier(MultinomialNB())
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score One Vs One \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


Y_val shape: (320, 1)
Predict_val_multi_bag: (320,)
Y_test shape: (800, 1)
Predict_test_multi_bag: (800,)
F1-Score Multi Class 

F1-Score for Multi Class Validation Data:0.45%
F1-Score for Multi Class Test Data:0.46%


Accuracy-Score for Multi Class Validation Data:0.46%
Accuracy-Score for Multi Class Test Data:0.48%
F1-Score One Vs Rest 

F1-Score for One Vs Rest Class Validation Data:0.45%
F1-Score for One Vs Rest Test Data:0.46%


Accuracy-Score for One Vs Rest Validation Data:0.45%
Accuracy-Score for One Vs Rest Test Data:0.47%
F1-Score One Vs One 

F1-Score for One Vs One Class Validation Data:0.45%
F1-Score for One Vs One Test Data:0.46%


Accuracy-Score for One Vs One Validation Data:0.46%
Accuracy-Score for One Vs One Test Data:0.48%


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# Using TF-IDF Model
train_x_bag, val_x_bag, test_x_bag = get_features(X_train, X_val, X_test, "tf_idf_model")

# 1. Multi-Class
model = MultinomialNB()
model.fit(train_x_bag, Y_train)

predict_val_multi_bag = model.predict(val_x_bag)
predict_test_multi_bag = model.predict(test_x_bag)

print("Y_val shape:", Y_val.shape)
print("Predict_val_multi_bag:", predict_val_multi_bag.shape)

print("Y_test shape:", Y_test.shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)
  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for Multi Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_multi_bag, average="weighted")))
print("F1-Score for Multi Class Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_multi_bag, average="weighted")))
print("\n")
print("Accuracy-Score for Multi Class Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_multi_bag)))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(MultinomialNB())
model.fit(train_x_bag, Y_train)

predict_val_ovr_bag = model.predict(val_x_bag)
predict_test_ovr_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for One Vs Rest Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovr_bag, average="weighted")))
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovr_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs Rest Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovr_bag)))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovr_bag)))


# 3. OneVsOne

model = OneVsOneClassifier(MultinomialNB())
model.fit(train_x_bag, Y_train)

predict_val_ovo_bag = model.predict(val_x_bag)
predict_test_ovo_bag = model.predict(test_x_bag)

  # Evaluation Metrics:

print("F1-Score Multi Class \n")
print("F1-Score for One Vs One Class Validation Data:{:.2f}%".format(f1_score(Y_val, predict_val_ovo_bag, average="weighted")))
print("F1-Score for One Vs One Test Data:{:.2f}%".format(f1_score(Y_test, predict_test_ovo_bag, average="weighted")))
print("\n")
print("Accuracy-Score for One Vs One Validation Data:{:.2f}%".format(accuracy_score(Y_val, predict_val_ovo_bag)))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(accuracy_score(Y_test, predict_test_ovo_bag)))


Y_val shape: (320, 1)
Predict_val_multi_bag: (320,)
Y_test shape: (800, 1)
Predict_test_multi_bag: (800,)
F1-Score Multi Class 

F1-Score for Multi Class Validation Data:0.45%
F1-Score for Multi Class Test Data:0.43%


Accuracy-Score for Multi Class Validation Data:0.49%
Accuracy-Score for Multi Class Test Data:0.48%
F1-Score Multi Class 

F1-Score for One Vs Rest Class Validation Data:0.44%
F1-Score for One Vs Rest Test Data:0.44%


Accuracy-Score for One Vs Rest Validation Data:0.49%
Accuracy-Score for One Vs Rest Test Data:0.48%
F1-Score Multi Class 

F1-Score for One Vs One Class Validation Data:0.45%
F1-Score for One Vs One Test Data:0.43%


Accuracy-Score for One Vs One Validation Data:0.49%
Accuracy-Score for One Vs One Test Data:0.48%


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# Using **** Model
    # 1. Multi-Class
    # 2. OneVsRest
    # 3. OneVsOne