Step 1: Import Essential Libraries

In [1]:
import pandas as pd 
import numpy as np

Step 2: Load Dataset

In [2]:
df = pd.read_csv('df_file.csv')
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


Step03: Understand Data frame

In [3]:
df.shape

(2225, 2)

In [4]:
df['Label'].value_counts()

Label
1    511
4    510
0    417
2    401
3    386
Name: count, dtype: int64

In [5]:
df.isnull().sum()

Text     0
Label    0
dtype: int64

In [6]:
df.duplicated().sum()

98

Step 4: Data cleaning

In [7]:
df.drop_duplicates(inplace=True)
df['Label'].value_counts()

Label
1    505
4    503
0    403
3    369
2    347
Name: count, dtype: int64

Step 5: pre processing

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [9]:
nlp = spacy.load("en_core_web_lg")

df['vector'] = df['Text'].apply(lambda text: nlp(text).vector) 
df.head() 

Unnamed: 0,Text,Label,vector
0,Budget to set scene for election\n \n Gordon B...,0,"[-1.9447731, 0.952523, -1.6278499, 0.81113255,..."
1,Army chiefs in regiments decision\n \n Militar...,0,"[-2.6137152, 0.84850717, -1.4028622, 0.4991233..."
2,Howard denies split over ID cards\n \n Michael...,0,"[-1.8860146, 1.6810012, -1.4558353, 0.5083195,..."
3,Observers to monitor UK election\n \n Minister...,0,"[-2.3990412, 0.34461203, -1.3932669, 0.8179685..."
4,Kilroy names election seat target\n \n Ex-chat...,0,"[-2.113165, 0.64551234, -0.9113166, 0.48627117..."


Step 6: Modeling

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = df['vector']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer


Attempt 1 

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Train model (Random Forest as example)
clf = RandomForestClassifier()
clf.fit(X_train_2d, y_train)

# Predict
y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88        79
           1       0.99      0.98      0.99       110
           2       0.88      0.95      0.91        62
           3       0.96      0.88      0.91        73
           4       0.94      0.91      0.93       102

    accuracy                           0.93       426
   macro avg       0.92      0.93      0.92       426
weighted avg       0.93      0.93      0.93       426



Attempt 2

In [16]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.82      0.78        79
           1       0.89      0.85      0.87       110
           2       0.75      0.82      0.78        62
           3       0.78      0.68      0.73        73
           4       0.84      0.83      0.84       102

    accuracy                           0.81       426
   macro avg       0.80      0.80      0.80       426
weighted avg       0.81      0.81      0.81       426

