In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn import svm
import matplotlib.pyplot as plt



# Fetch the NLTK 'punkt' tokenizer models and the 'stopwords' corpus into the
# local nltk_data directory (each call returns True on success, and the last
# call's return value is what the cell displays).
# NOTE(review): no NLTK tokenizer or stop-word list is used in the cells
# visible here - confirm these downloads are still required.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the source CSV into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(
    filepath_or_buffer='summarised_weighted_discourse.csv',
)

# Classifier Input: Original Records

In [None]:
# Keep only the text column and the class label. `.copy()` detaches the
# result from `df`, so later edits to `use_df` cannot touch the source frame.
use_df = df[['clean_lower', 'Label']].copy()

# Print column dtypes and non-null counts as a quick sanity check.
use_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010 entries, 0 to 2009
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   clean_lower  2010 non-null   object
 1   Label        2010 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.5+ KB


In [None]:
# Work on an independent (deep) copy so `use_df` itself is never modified,
# then show the frame as the cell's output.
df_build = use_df.copy(deep=True)
df_build

Unnamed: 0,clean_lower,Label
0,clandestine industries hi i note you have ...,0
1,sailor moon musicals i tried to add the clari...,0
2,that was my point i wanted to rewrite the enti...,0
3,channel 4 documentary nice work prioryman i...,0
4,is this species named after sir david attenbor...,0
...,...,...
2005,series scrapped on 4th july on a few ppl hav l...,2
2006,no actually she has an economics degree,2
2007,liar liar pants on fire seriously i looked at ...,2
2008,i love to eat rectal yoghurt,2


In [None]:
# Features stay a one-column DataFrame (not a Series) so the text column can
# be selected by name downstream; the target is the 'Label' Series.
X = df_build.loc[:, ['clean_lower']]
y = df_build.loc[:, 'Label']

In [None]:
# Hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only
# 30% - confirm this is intended and not a swapped train/test fraction.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X, y,
    test_size=0.7,
    random_state=460,
)

In [None]:
# Initialise the candidate classifiers. All of them are trained on the same
# TF-IDF representation of the 'clean_lower' text column.
# NOTE: per the scikit-learn SVC documentation, `degree` only applies to the
# 'poly' kernel and `gamma` to 'rbf'/'poly'/'sigmoid'; with kernel='linear'
# both are ignored. They are kept so the estimator parameters stay as
# originally configured.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
NB = MultinomialNB()
RF = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=450)
XGB = XGBClassifier()
LGBM = lgb.LGBMClassifier()
LR = LogisticRegression(random_state=450)


def build_text_features():
    """Return a new, unfitted ColumnTransformer that TF-IDF encodes 'clean_lower'.

    A fresh instance is created on every call. Pipeline.fit fits its steps in
    place, so two pipelines that hold the *same* transformer object would
    overwrite each other's fitted state whenever either one is (re)fitted.

    Returns:
        ColumnTransformer: applies a TfidfVectorizer to the 'clean_lower'
        column and passes any remaining columns through unchanged.
    """
    return ColumnTransformer(
        [('tfidf1', TfidfVectorizer(), 'clean_lower')],
        remainder='passthrough')


def build_text_pipeline(classifier):
    """Wrap `classifier` in a Pipeline behind its own TF-IDF feature step.

    Args:
        classifier: an unfitted scikit-learn compatible estimator.

    Returns:
        Pipeline: steps 'tfidf' (a fresh transformer from
        build_text_features) followed by 'classify' (the given estimator).
    """
    return Pipeline([
        ('tfidf', build_text_features()),
        ('classify', classifier),
    ])


# Construct the column transformer. `vectorizer1` and `column_transformer`
# stay defined because other cells may reference them; the pipelines built
# below no longer share this instance.
vectorizer1 = TfidfVectorizer()
column_transformer = ColumnTransformer(
    [('tfidf1', vectorizer1, 'clean_lower')],
    remainder='passthrough')

# One independent pipeline per classifier. (NB is only instantiated here; no
# Naive Bayes pipeline is built or fitted in this cell.)
SVM_pipe = build_text_pipeline(SVM)
RF_pipe = build_text_pipeline(RF)
XGB_pipe = build_text_pipeline(XGB)
LGBM_pipe = build_text_pipeline(LGBM)
LR_pipe = build_text_pipeline(LR)

# Fit the models in the original order.
for fitted_pipe in (SVM_pipe, RF_pipe, XGB_pipe, LGBM_pipe):
    fitted_pipe.fit(X_train, y_train)

# Left as the cell's last expression so the fitted pipeline is displayed.
LR_pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidf1', TfidfVectorizer(),
                                                  'clean_lower')])),
                ('classify', LogisticRegression(random_state=450))])

In [None]:
# Separate copy of the text/label columns for the Naive Bayes run, so that
# changes to `NB_df` cannot affect `df_build`.
NB_df = df_build.loc[:, ['clean_lower', 'Label']].copy()

In [None]:
# Display the frame as the cell output (pandas shows only the first and last
# rows of a long frame).
NB_df

Unnamed: 0,clean_lower,Label
0,clandestine industries hi i note you have ...,0
1,sailor moon musicals i tried to add the clari...,0
2,that was my point i wanted to rewrite the enti...,0
3,channel 4 documentary nice work prioryman i...,0
4,is this species named after sir david attenbor...,0
...,...,...
2005,series scrapped on 4th july on a few ppl hav l...,2
2006,no actually she has an economics degree,2
2007,liar liar pants on fire seriously i looked at ...,2
2008,i love to eat rectal yoghurt,2


In [None]:
# Feature frame (single text column, kept as a DataFrame) and label Series
# for the Naive Bayes model.
X_NB = NB_df.loc[:, ['clean_lower']]
y_NB = NB_df.loc[:, 'Label']

In [None]:
# Hold-out split for the Naive Bayes data, using a fixed seed.
# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only
# 30% - confirm this is intended and not a swapped train/test fraction.
(X_NB_train, X_NB_test,
 y_NB_train, y_NB_test) = train_test_split(
    X_NB, y_NB,
    test_size=0.7,
    random_state=460,
)

In [None]:
# Give the Naive Bayes pipeline its own TF-IDF transformer rather than
# reusing the module-level `column_transformer`: Pipeline.fit fits its steps
# in place, so fitting here with a shared instance would also re-fit the
# feature step of every other pipeline holding that same object.
NB_features = ColumnTransformer(
    [('tfidf1', TfidfVectorizer(), 'clean_lower')],
    remainder='passthrough')

NB_pipe = Pipeline([
    ('tfidf', NB_features),
    ('classify', NB),
])

# Left as the cell's last expression so the fitted pipeline is displayed.
NB_pipe.fit(X_NB_train, y_NB_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidf1', TfidfVectorizer(),
                                                  'clean_lower')])),
                ('classify', MultinomialNB())])

In [None]:
# Evaluate the Naive Bayes pipeline on its own hold-out split.
predictions_NB = NB_pipe.predict(X_NB_test)

# Score against y_NB_test (the labels that belong to X_NB_test) rather than
# the `y_test` of the other split, and pass arguments in scikit-learn's
# documented (y_true, y_pred) order. Accuracy is reported as a percentage.
nb_accuracy = accuracy_score(y_NB_test, predictions_NB) * 100

# Per-class precision / recall / F1 for labels 0, 1 and 2.
NB_matrix = classification_report(y_NB_test, predictions_NB, labels=[0, 1, 2])
print("Accuracy:", nb_accuracy)
print('Classification report : \n', NB_matrix)

Accuracy: 60.83866382373845
Classification report : 
               precision    recall  f1-score   support

           0       0.67      0.73      0.70       475
           1       0.54      0.62      0.58       463
           2       0.61      0.46      0.53       469

    accuracy                           0.61      1407
   macro avg       0.61      0.61      0.60      1407
weighted avg       0.61      0.61      0.60      1407



In [None]:
# Evaluate the SVM pipeline on the hold-out of the split it was trained on
# (X_train/X_test), not on the Naive Bayes split: if the two splits ever
# diverge, scoring on X_NB_test could include rows the SVM was trained on.
predictions_SVM = SVM_pipe.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy in percent.
svm_accuracy = accuracy_score(y_test, predictions_SVM) * 100

# Per-class precision / recall / F1 for labels 0, 1 and 2.
SVM_matrix = classification_report(y_test, predictions_SVM, labels=[0, 1, 2])
print("Accuracy:", svm_accuracy)
print('Classification report : \n', SVM_matrix)

Accuracy: 60.056858564321246
Classification report : 
               precision    recall  f1-score   support

           0       0.69      0.67      0.68       475
           1       0.56      0.53      0.55       463
           2       0.55      0.60      0.58       469

    accuracy                           0.60      1407
   macro avg       0.60      0.60      0.60      1407
weighted avg       0.60      0.60      0.60      1407



In [None]:
# Evaluate the LightGBM pipeline on the hold-out of the split it was trained
# on (X_train/X_test), not on the Naive Bayes split: if the two splits ever
# diverge, scoring on X_NB_test could include rows the model was trained on.
predictions_LGBM = LGBM_pipe.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy in percent.
lgbm_accuracy = accuracy_score(y_test, predictions_LGBM) * 100

# Labels are listed explicitly so the report always covers classes 0, 1, 2,
# matching the NB / SVM / XGB evaluation cells.
LGBM_matrix = classification_report(y_test, predictions_LGBM, labels=[0, 1, 2])
print("Accuracy:", lgbm_accuracy)
print('Classification report : \n', LGBM_matrix)

Accuracy: 54.44207533759773
Classification report : 
               precision    recall  f1-score   support

           0       0.64      0.59      0.62       475
           1       0.46      0.50      0.48       463
           2       0.54      0.54      0.54       469

    accuracy                           0.54      1407
   macro avg       0.55      0.54      0.55      1407
weighted avg       0.55      0.54      0.55      1407



In [None]:
# Evaluate the XGBoost pipeline on the hold-out of the split it was trained
# on (X_train/X_test), not on the Naive Bayes split: if the two splits ever
# diverge, scoring on X_NB_test could include rows the model was trained on.
predictions_XGB = XGB_pipe.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy in percent.
xgb_accuracy = accuracy_score(y_test, predictions_XGB) * 100

# Per-class precision / recall / F1 for labels 0, 1 and 2.
XGB_matrix = classification_report(y_test, predictions_XGB, labels=[0, 1, 2])
print("Accuracy:", xgb_accuracy)
print('Classification report : \n', XGB_matrix)

Accuracy: 58.42217484008528
Classification report : 
               precision    recall  f1-score   support

           0       0.67      0.59      0.62       475
           1       0.55      0.52      0.53       463
           2       0.55      0.65      0.59       469

    accuracy                           0.58      1407
   macro avg       0.59      0.58      0.58      1407
weighted avg       0.59      0.58      0.58      1407



In [None]:
# Evaluate the logistic-regression pipeline on the hold-out of the split it
# was trained on (X_train/X_test), not on the Naive Bayes split: if the two
# splits ever diverge, scoring on X_NB_test could include training rows.
predictions_LR = LR_pipe.predict(X_test)

# Arguments follow scikit-learn's (y_true, y_pred) order; accuracy in percent.
lr_accuracy = accuracy_score(y_test, predictions_LR) * 100

# Labels are listed explicitly so the report always covers classes 0, 1, 2,
# matching the NB / SVM / XGB evaluation cells.
LR_matrix = classification_report(y_test, predictions_LR, labels=[0, 1, 2])
print("Accuracy:", lr_accuracy)
print('Classification report : \n', LR_matrix)

Accuracy: 59.985785358919685
Classification report : 
               precision    recall  f1-score   support

           0       0.68      0.68      0.68       475
           1       0.56      0.53      0.55       463
           2       0.56      0.59      0.57       469

    accuracy                           0.60      1407
   macro avg       0.60      0.60      0.60      1407
weighted avg       0.60      0.60      0.60      1407

