Step 1: Import Essential Libraries

In [58]:
import pandas as pd 

Step 2: Load Dataset

In [59]:
df = pd.read_csv('df_file.csv')
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


Step03: Understand Data frame

In [60]:
df.shape

(2225, 2)

In [61]:
df['Label'].value_counts()

Label
1    511
4    510
0    417
2    401
3    386
Name: count, dtype: int64

In [62]:
df.isnull().sum()

Text     0
Label    0
dtype: int64

In [63]:
df.duplicated().sum()

98

Step 4: Data cleaning

In [64]:
df.drop_duplicates(inplace=True)
df['Label'].value_counts()


Label
1    505
4    503
0    403
3    369
2    347
Name: count, dtype: int64

Step 5: pre processing

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [66]:
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [67]:
preprocess("Thor ate pizza")

'thor eat pizza'

In [68]:
df['Cleaned_Text'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Text,Label,Cleaned_Text
0,Budget to set scene for election\n \n Gordon B...,0,budget set scene election \n \n Gordon Brown ...
1,Army chiefs in regiments decision\n \n Militar...,0,army chief regiment decision \n \n military c...
2,Howard denies split over ID cards\n \n Michael...,0,Howard deny split ID card \n \n Michael Howar...
3,Observers to monitor UK election\n \n Minister...,0,observer monitor UK election \n \n Ministers ...
4,Kilroy names election seat target\n \n Ex-chat...,0,kilroy name election seat target \n \n ex cha...


Step 6: Modeling

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
X = df['Text']
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Attempt 1 : Use 1-gram which is nothing but a Bag Of Words (BOW) model

In [71]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [72]:

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),       
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96        79
           1       1.00      1.00      1.00       110
           2       0.90      0.98      0.94        62
           3       0.99      0.93      0.96        73
           4       0.98      0.95      0.97       102

    accuracy                           0.97       426
   macro avg       0.96      0.97      0.96       426
weighted avg       0.97      0.97      0.97       426



Attempt 2 : Use 1-gram and bigrams

In [73]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))), 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93        79
           1       1.00      1.00      1.00       110
           2       0.87      0.98      0.92        62
           3       1.00      0.81      0.89        73
           4       0.99      0.95      0.97       102

    accuracy                           0.95       426
   macro avg       0.95      0.95      0.94       426
weighted avg       0.96      0.95      0.95       426



Attempt 3 : Use 1-gram to trigrams

In [74]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer_1_3_grams', CountVectorizer(ngram_range = (1, 3))),    
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92        79
           1       1.00      1.00      1.00       110
           2       0.86      1.00      0.93        62
           3       1.00      0.77      0.87        73
           4       1.00      0.95      0.97       102

    accuracy                           0.95       426
   macro avg       0.94      0.94      0.94       426
weighted avg       0.95      0.95      0.95       426



Use text pre-processing to remove stop words, punctuations and apply lemmatization

In [75]:
X_cleaned = df['Cleaned_Text']
y_cleaned = df['Label']

X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

Attempt 4

In [76]:
# 1. Create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),       
     ('Multi NB', MultinomialNB())         
])

# 2. Fit with X_train and y_train
clf.fit(X_train_cleaned, y_train_cleaned)

# 3. Get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test_cleaned)

# 4. Print the classification report
print(classification_report(y_test_cleaned, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        79
           1       1.00      1.00      1.00       110
           2       0.91      0.98      0.95        62
           3       1.00      0.92      0.96        73
           4       0.98      0.95      0.97       102

    accuracy                           0.97       426
   macro avg       0.96      0.97      0.96       426
weighted avg       0.97      0.97      0.97       426



Attempt 4

In [77]:
# 1. Create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),       
     ('Multi NB', MultinomialNB())         
])

# 2. Fit with X_train and y_train
clf.fit(X_train_cleaned, y_train_cleaned)

# 3. Get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test_cleaned)

# 4. Print the classification report
print(classification_report(y_test_cleaned, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95        79
           1       1.00      1.00      1.00       110
           2       0.92      0.98      0.95        62
           3       1.00      0.89      0.94        73
           4       0.99      0.96      0.98       102

    accuracy                           0.97       426
   macro avg       0.96      0.96      0.96       426
weighted avg       0.97      0.97      0.97       426



Attempt 5

In [78]:
# 1. Create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range=(1, 3))),       
     ('Multi NB', MultinomialNB())         
])

# 2. Fit with X_train and y_train
clf.fit(X_train_cleaned, y_train_cleaned)

# 3. Get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test_cleaned)

# 4. Print the classification report
print(classification_report(y_test_cleaned, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95        79
           1       1.00      1.00      1.00       110
           2       0.90      0.98      0.94        62
           3       1.00      0.86      0.93        73
           4       0.99      0.96      0.98       102

    accuracy                           0.96       426
   macro avg       0.96      0.96      0.96       426
weighted avg       0.97      0.96      0.96       426

