In [20]:
import pandas as pd
import numpy as np

#to import data from drive
from google.colab import drive

#data exploration and visualization
import seaborn as sns
import matplotlib.pyplot as plt

#model building and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import re
import string

In [2]:
# Mount Google Drive inside the Colab runtime at the path the dataset is read from
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
# Load the fake-news and true-news articles from the mounted Drive folder
fake_csv_path = "/content/drive/MyDrive/Dataset/Fake.csv"
true_csv_path = "/content/drive/MyDrive/Dataset/True.csv"
data_fake = pd.read_csv(fake_csv_path)
data_true = pd.read_csv(true_csv_path)

EDA

In [4]:
# Preview the first rows of the fake-news frame to see which columns are available
fake_preview = data_fake.head()
fake_preview

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
# Count fully duplicated rows in the fake-news frame.
# The boolean mask is summed with the vectorized Series.sum() rather than the
# Python builtin sum(), which would iterate over the rows one element at a time.
print(data_fake.duplicated().sum())

# Remove the duplicated entries
data_fake = data_fake.drop_duplicates()

3


In [6]:
# Count fully duplicated rows in the true-news frame.
# The boolean mask is summed with the vectorized Series.sum() rather than the
# Python builtin sum(), which would iterate over the rows one element at a time.
print(data_true.duplicated().sum())

# Remove the duplicated entries
data_true = data_true.drop_duplicates()

206


In [7]:
# Per-column count of missing values, first for the fake-news frame, then the true-news frame
missing_fake = data_fake.isna().sum()
missing_true = data_true.isna().sum()

print(missing_fake)
print('\n')
print(missing_true)

title      0
text       0
subject    0
date       0
dtype: int64


title      0
text       0
subject    0
date       0
dtype: int64


In [8]:
# Tag every article with a binary target: fake news -> 1, true news -> 0
for frame, label_value in ((data_fake, 1), (data_true, 0)):
    frame['label'] = label_value

In [9]:
# Stack the two labelled frames (fake first, then true) and renumber the rows from 0
labelled_frames = [data_fake, data_true]
data = pd.concat(labelled_frames, axis=0, ignore_index=True)

In [10]:
# Preview the combined frame to confirm the new 'label' column is present
data_preview = data.head()
data_preview

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [11]:
# Class balance: number of articles per label value
label_counts = data['label'].value_counts()
label_counts

1    23478
0    21211
Name: label, dtype: int64

In [12]:
# Model input: each article's title and body text joined by a single space
headline = data['title']
body = data['text']
X = headline + ' ' + body

# Target: the binary label column
y = data['label']

In [13]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# TF-IDF vectorizer configured with the built-in English stop-word list and max_df=0.7
tfidf_options = {'stop_words': 'english', 'max_df': 0.7}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [15]:
# Fit the vectorizer on the training text only, then apply that fitted vectorizer to the test text
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [16]:
# Random forest of 100 trees; the fixed seed keeps training repeatable
rf_classifier = RandomForestClassifier(
    random_state=1,
    n_estimators=100,
)

In [17]:
# Fit the forest on the TF-IDF training matrix and its labels
rf_classifier.fit(X=X_train_tfidf, y=y_train)

In [18]:
# Predict a label for every held-out article
y_pred = rf_classifier.predict(X=X_test_tfidf)

In [22]:
# Evaluate the test-set predictions: overall accuracy, confusion matrix and per-class report
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the metrics in this cell so that re-running it from a fresh kernel
# reproduces the output saved beneath it (the saved output shows these three
# printed sections, which the cell's code previously did not emit).
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9913850973372119
Confusion Matrix:
 [[4168   35]
 [  42 4693]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4203
           1       0.99      0.99      0.99      4735

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938



In [23]:
# Show the overall test accuracy
accuracy_heading = "Accuracy:"
print(accuracy_heading, accuracy)

Accuracy: 0.9913850973372119


In [24]:
# Show the confusion matrix of true vs. predicted labels
confusion_heading = "Confusion Matrix:\n"
print(confusion_heading, confusion_mat)

Confusion Matrix:
 [[4168   35]
 [  42 4693]]


In [25]:
# Show per-class precision, recall, F1 and support
report_heading = "Classification Report:\n"
print(report_heading, classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4203
           1       0.99      0.99      0.99      4735

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938

