In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df=pd.read_csv("news_dataset.csv")

In [3]:
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
df=df.dropna()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18285 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      18285 non-null  int64 
 1   title   18285 non-null  object
 2   author  18285 non-null  object
 3   text    18285 non-null  object
 4   label   18285 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 857.1+ KB


In [8]:
max_lengths = df.apply(lambda col: col.astype(str).apply(len).max())

# Print or use the max_lengths as needed
print(max_lengths)


id             5
title        456
author       158
text      142961
label          1
dtype: int64


In [9]:
df["title"][5]

'Jackie Mason: Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart'

In [10]:
df=df.drop(["id","text","author"],axis=1)

In [11]:
df.reset_index(drop=True, inplace=True)


In [12]:
class preprocessing():
    
    def __init__(self):
        self.stopword = set(stopwords.words('english'))
        self.data = None
        self.data2 = None

    def text_preproceesing_train(self,data1):
        self.data=data1
        lm=WordNetLemmatizer()
        corpus=[]
        for i in range(len(df)):
            review=re.sub(r'^\w*$'," ",self.data["title"][i] )
            review=review.lower()
            review=review.split()
            review=[lm.lemmatize(x) for x in review if x not in stopword]
            review=" ".join(review)
            
            corpus.append(review)
        return corpus
    def text_preproceesing_pred(self,data2):
        self.data2=data2
        lm=WordNetLemmatizer()
        processed=[]
        for i in range(len(df)):
            review=re.sub(r'^\w*$'," ",self.data2 )
            review=review.lower()
            review=review.split()
            review=[lm.lemmatize(x) for x in review if x not in stopword]
            review=" ".join(review)
            
            processed.append(review)
        return processed
    def corpus_gen(self):
        t1=self.text_preproceesing_train(self.data)
        tf=TfidfVectorizer()
        x=tf.fit_transform(t1).toarray()
        y=self.data['label']
        return x,y
    
    def corpus_pred(self):
        t2=self.text_preproceesing_pred(self.data2)
        tf=TfidfVectorizer()
        x=tf.transform(t2).toarray()
        

In [14]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK stopwords
nltk.download('stopwords')

# Create a set of stopwords
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\awscl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
proc1=preprocessing()
p2=proc1.text_preproceesing_train(df)
x, y = proc1.corpus_gen()


In [None]:
y

0        1
1        0
2        1
3        1
4        1
        ..
18280    0
18281    0
18282    0
18283    1
18284    1
Name: label, Length: 18285, dtype: int64

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=34, stratify=y)




In [17]:
class model_gen():
    def  __init__(self,x_train,x_test,y_train,y_test):
        self.x_train = x_train
        self.x_test= x_test
        self.y_train = y_train
        self.y_test=y_test
        self.model = None  

    def model_build(self):
        rf=RandomForestClassifier()
        self.model = rf  # Store the model instance
        self.model.fit(self.x_train, self.y_train)

    # def train_evaluation(self):
    #     y_pred_train = self.model.predict(self.x_train)
        
    #     acc_scr_train = (self.y_train.astype(int), y_pred_train.astype(int))
    #     print("Accuracy Score On Training Data Set :",acc_scr_train)
    #     print()
        
    #     con_mat_train = confusion_matrix(self.y_train,y_pred_train)
    #     print("Confusion Matrix On Training Data Set :\n",con_mat_train)
    #     print()
        
    #     class_rep_train = classification_report(self.y_train,y_pred_train)
    #     print("Classification Report On Training Data Set :\n",class_rep_train)
        
    # def test_evaluation(self):
    #     y_pred_test = self.model.predict(self.x_test)
        
    #     acc_scr_test = accuracy_score(self.y_test,y_pred_test)
    #     print("Accuracy Score On Testing Data Set :",acc_scr_train)
    #     print()
        
    #     con_mat_test = confusion_matrix(self.y_test,y_pred_test)
    #     print("Confusion Matrix On Testing Data Set :\n",con_mat_train)
    #     print()
        
    #     class_rep_test = classification_report(self.y_test,y_pred_test)
    #     print("Classification Report On Testing Data Set :\n",class_rep_train)
        

        




In [18]:
m1=model_gen(x_train,x_test,y_train,y_test)
m1.model_build()


In [19]:
class CustomModel():
    def __init__(self, parent_instance):
        self.parent_instance = parent_instance

    def train_evaluation(self):
        y_pred_train = self.parent_instance.model.predict(self.parent_instance.x_train)

        acc_scr_train = accuracy_score(self.parent_instance.y_train, y_pred_train)
        print("Custom Accuracy Score On Training Data Set :", acc_scr_train)
        print()

        con_mat_train = confusion_matrix(self.parent_instance.y_train, y_pred_train)
        print("Custom Confusion Matrix On Training Data Set :\n", con_mat_train)
        print()

        class_rep_train = classification_report(self.parent_instance.y_train, y_pred_train)
        print("Custom Classification Report On Training Data Set :\n", class_rep_train)

    def test_evaluation(self):
        y_pred_test = self.parent_instance.model.predict(self.parent_instance.x_test)

        acc_scr_test = accuracy_score(self.parent_instance.y_test, y_pred_test)
        print("Custom Accuracy Score On Testing Data Set :", acc_scr_test)
        print()

        con_mat_test = confusion_matrix(self.parent_instance.y_test, y_pred_test)
        print("Custom Confusion Matrix On Testing Data Set :\n", con_mat_test)
        print()

        class_rep_test = classification_report(self.parent_instance.y_test, y_pred_test)
        print("Custom Classification Report On Testing Data Set :\n", class_rep_test)



In [20]:
m2 = CustomModel(m1)
m2.train_evaluation()
m2.test_evaluation()

Custom Accuracy Score On Training Data Set : 0.9999218688960075

Custom Confusion Matrix On Training Data Set :
 [[7251    1]
 [   0 5547]]

Custom Classification Report On Training Data Set :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7252
           1       1.00      1.00      1.00      5547

    accuracy                           1.00     12799
   macro avg       1.00      1.00      1.00     12799
weighted avg       1.00      1.00      1.00     12799

Custom Accuracy Score On Testing Data Set : 0.9360189573459715

Custom Confusion Matrix On Testing Data Set :
 [[2809  300]
 [  51 2326]]

Custom Classification Report On Testing Data Set :
               precision    recall  f1-score   support

           0       0.98      0.90      0.94      3109
           1       0.89      0.98      0.93      2377

    accuracy                           0.94      5486
   macro avg       0.93      0.94      0.94      5486
weighted avg    