#### 2. Classification (Supervised ML)

Project: Fake News / Misinformation Detection

Dataset: Fake News Dataset

Goal: Classify whether a news article is fake or real

Techniques:

Baseline: Logistic Regression, Random Fores.

Evaluate with accuracy, precision, recall, F1-score.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import re

In [3]:
true=pd.read_csv("True.csv")
false=pd.read_csv("Fake.csv")

In [4]:
true.shape,false.shape

((21417, 4), (23481, 4))

In [5]:
true.info(), false.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


(None, None)

In [6]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [7]:
false.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
true['target']=1
false['target']=0

In [9]:
df = pd.concat([true, false])

In [10]:
df.head()

Unnamed: 0,title,text,subject,date,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   target   44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [12]:
df['target'].unique

<bound method Series.unique of 0        1
1        1
2        1
3        1
4        1
        ..
23476    0
23477    0
23478    0
23479    0
23480    0
Name: target, Length: 44898, dtype: int64>

In [13]:
df =df.drop(['subject', 'date'], axis = 1)

In [14]:
df.head()

Unnamed: 0,title,text,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [15]:
df.isnull().sum()

title     0
text      0
target    0
dtype: int64

In [16]:
df.duplicated().sum()

5793

In [17]:
df.drop_duplicates(keep = 'first', inplace = True)

In [18]:
df.duplicated().sum()

0

In [19]:
df['target'].value_counts()

target
1    21197
0    17908
Name: count, dtype: int64

In [20]:
def clean_text(text):
    text = text.lower() 
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) ## for url 
    text = re.sub(r'[^a-z\s]', '', text)    ## Remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()    ## for space
    return text

In [21]:
df['text_clean'] = df['text'].apply(clean_text)
print(df[['text', 'text_clean']].head())

                                                text  \
0  WASHINGTON (Reuters) - The head of a conservat...   
1  WASHINGTON (Reuters) - Transgender people will...   
2  WASHINGTON (Reuters) - The special counsel inv...   
3  WASHINGTON (Reuters) - Trump campaign adviser ...   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...   

                                          text_clean  
0  washington reuters the head of a conservative ...  
1  washington reuters transgender people will be ...  
2  washington reuters the special counsel investi...  
3  washington reuters trump campaign adviser geor...  
4  seattlewashington reuters president donald tru...  


In [22]:
df['title_clean'] = df['title'].apply(clean_text)
print(df[['title', 'title_clean']].head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                         title_clean  
0  as us budget fight looms republicans flip thei...  
1  us military to accept transgender recruits on ...  
2  senior us republican senator let mr mueller do...  
3  fbi russia probe helped by australian diplomat...  
4  trump wants postal service to charge much more...  


In [23]:
df

Unnamed: 0,title,text,target,text_clean,title_clean
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1,washington reuters the head of a conservative ...,as us budget fight looms republicans flip thei...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1,washington reuters transgender people will be ...,us military to accept transgender recruits on ...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1,washington reuters the special counsel investi...,senior us republican senator let mr mueller do...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1,washington reuters trump campaign adviser geor...,fbi russia probe helped by australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1,seattlewashington reuters president donald tru...,trump wants postal service to charge much more...
...,...,...,...,...,...
22698,The White House and The Theatrics of ‘Gun Cont...,21st Century Wire says All the world s a stage...,0,st century wire says all the world s a stage a...,the white house and the theatrics of gun control
22699,Activists or Terrorists? How Media Controls an...,Randy Johnson 21st Century WireThe majority ...,0,randy johnson st century wirethe majority of m...,activists or terrorists how media controls and...
22700,"BOILER ROOM – No Surrender, No Retreat, Heads ...",Tune in to the Alternate Current Radio Network...,0,tune in to the alternate current radio network...,boiler room no surrender no retreat heads will...
22701,Federal Showdown Looms in Oregon After BLM Abu...,21st Century Wire says A new front has just op...,0,st century wire says a new front has just open...,federal showdown looms in oregon after blm abu...


In [24]:
df.drop(['title','text'], axis=1)

Unnamed: 0,target,text_clean,title_clean
0,1,washington reuters the head of a conservative ...,as us budget fight looms republicans flip thei...
1,1,washington reuters transgender people will be ...,us military to accept transgender recruits on ...
2,1,washington reuters the special counsel investi...,senior us republican senator let mr mueller do...
3,1,washington reuters trump campaign adviser geor...,fbi russia probe helped by australian diplomat...
4,1,seattlewashington reuters president donald tru...,trump wants postal service to charge much more...
...,...,...,...
22698,0,st century wire says all the world s a stage a...,the white house and the theatrics of gun control
22699,0,randy johnson st century wirethe majority of m...,activists or terrorists how media controls and...
22700,0,tune in to the alternate current radio network...,boiler room no surrender no retreat heads will...
22701,0,st century wire says a new front has just open...,federal showdown looms in oregon after blm abu...


In [25]:
df['combined_text'] = df['title_clean'] + ' ' + df['text_clean']

X = df['combined_text']   
y = df['target']  

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
X_train.shape

(31284,)

In [28]:
y_train.shape

(31284,)

In [29]:
tf= TfidfVectorizer()

In [44]:
tf = TfidfVectorizer()  # limit features for speed
X_train_tf = tf.fit_transform(X_train)   # fit only on training data
X_test_tf = tf.transform(X_test)  

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
model = LogisticRegression()
model.fit(X_train_tf, y_train)

In [46]:
y_pred = model.predict(X_test_tf)
y_pred


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [47]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.987853215701317

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      3564
           1       0.99      0.99      0.99      4257

    accuracy                           0.99      7821
   macro avg       0.99      0.99      0.99      7821
weighted avg       0.99      0.99      0.99      7821


Confusion Matrix:
 [[3503   61]
 [  34 4223]]


In [56]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train
rf_model.fit(X_train_tf, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test_tf)


In [57]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Accuracy: 0.9828666410944892

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      3564
           1       0.98      0.99      0.98      4257

    accuracy                           0.98      7821
   macro avg       0.98      0.98      0.98      7821
weighted avg       0.98      0.98      0.98      7821


Confusion Matrix:
 [[3466   98]
 [  36 4221]]
