Step 1: Import Essential Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy


Step 2: Load Dataset

In [2]:
# Load the dataset from a CSV file; the path is relative to the working directory.
DATA_PATH = 'df_file.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


Step 3: Understand the DataFrame

In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape  

(2225, 2)

In [4]:
# Count missing values per column (`isna` is the same check as `isnull`).
missing_per_column = df.isna().sum()
missing_per_column

Text     0
Label    0
dtype: int64

In [5]:
# Flag rows that exactly repeat an earlier row, then count the flagged rows.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

98

In [6]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

# Number of remaining rows per class label.
label_counts = df['Label'].value_counts()
label_counts

Label
1    505
4    503
0    403
3    369
2    347
Name: count, dtype: int64

Step 4: Data cleaning

In [7]:
# Drop exact duplicate rows in place, keeping the first occurrence.
# The same call already appears in an earlier cell, so when the notebook is
# run top to bottom this is a harmless no-op (the operation is idempotent).
df.drop_duplicates(keep='first', inplace=True)

Step 5: Preprocessing

In [8]:
# Load the small English spaCy pipeline.
# preprocess() only reads token.lemma_, token.is_alpha and token.is_stop, so the
# dependency parser and the named-entity recognizer are disabled: they cost
# runtime on every document but do not feed the lemmatizer or the lexical flags.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess(text):
    """Turn one raw document into a list of lemmas.

    The input is coerced to ``str`` and lower-cased before being run through
    the module-level spaCy pipeline ``nlp``. Tokens that are stop words or
    that are not purely alphabetic are discarded; the lemma of every
    remaining token is returned, in document order.
    """
    lemmas = []
    for tok in nlp(str(text).lower()):
        # Skip stop words and anything containing digits/punctuation.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Tokenize every document; each cell of the new column holds a list of lemmas.
df['tokens'] = df['Text'].map(preprocess)


In [None]:
from gensim.models import Word2Vec

# Materialize the token lists once so the corpus can be iterated several times
# (once for the vocabulary scan, then once per training epoch).
corpus = df['tokens'].tolist()

# Build the model WITHOUT a corpus: passing `sentences=` to the constructor
# already builds the vocabulary and trains for the default number of epochs,
# so a following .train() call would be a second, unintended training pass
# with a restarted learning-rate schedule.
w2v_model = Word2Vec(vector_size=100, window=5, min_count=2, workers=4)

# Scan the corpus once to collect the vocabulary (words seen fewer than
# `min_count` times are dropped) and to record corpus_count.
w2v_model.build_vocab(corpus)

# Single explicit training run of 10 epochs. As the last expression of the
# cell, the returned (effective word count, raw word count) tuple is displayed.
w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=10)


(3998528, 4191590)

In [10]:
def document_vector(tokens):
    """Embed a tokenized document as the mean of its word vectors.

    Only tokens present in the vocabulary of the module-level ``w2v_model``
    contribute. If none of the tokens are known, a zero vector of length
    ``w2v_model.vector_size`` is returned instead.
    """
    keyed_vectors = w2v_model.wv
    known = [tok for tok in tokens if tok in keyed_vectors]

    # No in-vocabulary token: fall back to an all-zero embedding.
    if not known:
        return np.zeros(w2v_model.vector_size)

    # Average across tokens (rows) to get one vector per document.
    return np.mean(keyed_vectors[known], axis=0)

# One embedding per document, stored as an array in each cell of the new column.
df['doc_vector'] = df['tokens'].map(document_vector)

Step 6: Modeling

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [12]:
# Target: the class label of each document.
y = df['Label']

# Features: stack the per-document vectors row-wise into a single 2-D array.
X = np.vstack(df['doc_vector'].to_numpy())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Train and evaluate a Logistic Regression classifier

In [13]:
# Fit the classifier on the training vectors (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.90      0.91      0.91        79
           1       0.99      0.99      0.99       110
           2       0.94      0.94      0.94        62
           3       0.94      0.92      0.93        73
           4       0.95      0.96      0.96       102

    accuracy                           0.95       426
   macro avg       0.94      0.94      0.94       426
weighted avg       0.95      0.95      0.95       426



Step 7: Save the model

In [14]:
import joblib

# Persist both fitted artifacts. The classifier was trained on Word2Vec
# document vectors, so the pickled classifier alone cannot score new text:
# the embedding model is needed to turn tokens into the same feature space.
w2v_model.save('word2vec_model.model')

# Kept as the last expression so the cell still displays the written file name.
joblib.dump(clf, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']