# **Fake News Detection**

### Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, accuracy_score

### Load news dataset

In [None]:
import os

# Location of the labelled news CSV. Defaults to the original Google Drive
# path, so existing Colab runs behave as before; set the NEWS_CSV_PATH
# environment variable to load the file from somewhere else.
NEWS_CSV_PATH = os.environ.get(
    'NEWS_CSV_PATH',
    '/content/drive/MyDrive/Data Analysis and Machine Learning with python Project/(Assignment)Fake_news_detection/news.csv',
)
df = pd.read_csv(NEWS_CSV_PATH)

In [None]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(6335, 4)

In [None]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6335 non-null   int64 
 1   title   6335 non-null   object
 2   text    6335 non-null   object
 3   label   6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
id,0
title,0
text,0
label,0


### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation; the fixed seed keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=20,
)

In [None]:
# First five training articles (the index shows their original row numbers).
x_train.head(n=5)

Unnamed: 0,text
4741,"NAIROBI, Kenya — President Obama spoke out Sun..."
2089,"Killing Obama administration rules, dismantlin..."
4074,"Dean Obeidallah, a former attorney, is the hos..."
5376,WashingtonsBlog \nCNN’s Jake Tapper hit the ...
6028,Some of the biggest issues facing America this...


In [None]:
# Labels for the same five training rows shown by x_train.head().
y_train.head(n=5)

Unnamed: 0,label
4741,REAL
2089,REAL
4074,REAL
5376,FAKE
6028,REAL


### Convert text into numeric features


| Term                           | Purpose                                                               | Use Case                                                   |
|-------------------------------|------------------------------------------------------------------------|------------------------------------------------------------|
| `TfidfVectorizer`              | Converts text into numerical features, emphasizing important words     | Text classification, NLP tasks                            |
| `PassiveAggressiveClassifier` | Learns only from mistakes, updating aggressively when wrong            | Fast online learning, real-time classification, fake news |


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop English stop words and any term that appears in more than 70% of the
# documents.
vectorize = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Fit the vocabulary on the training text only, then reuse it for the test
# text so no information from the test set leaks into the features.
x_train_v = vectorize.fit_transform(x_train)
x_test_v = vectorize.transform(x_test)

## Using a Naive Bayes classifier: import MultinomialNB and fit it to the training data

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
bayes_model = MultinomialNB()
bayes_model.fit(X=x_train_v, y=y_train)

In [29]:
# confusion_matrix is imported in this cell because no earlier cell brings it
# into scope; without it the cell raises NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
y_pred = bayes_model.predict(x_test_v)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8232044198895028

 Classification Report:
               precision    recall  f1-score   support

        FAKE       0.97      0.68      0.80       648
        REAL       0.74      0.98      0.84       619

    accuracy                           0.82      1267
   macro avg       0.86      0.83      0.82      1267
weighted avg       0.86      0.82      0.82      1267


 Confusion Matrix:
 [[438 210]
 [ 14 605]]


## Using PassiveAggressiveClassifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# The classifier shuffles the training data between epochs, so seed it (same
# value as the train/test split) to make the fitted model, and the file
# pickled from it later, reproducible across runs.
# NOTE: metrics recorded from earlier unseeded runs may differ slightly.
pac_model = PassiveAggressiveClassifier(max_iter=50, random_state=20)
pac_model.fit(x_train_v, y_train)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the Passive-Aggressive model on the held-out articles.
y_pred = pac_model.predict(x_test_v)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Accuracy is shown as a percentage rounded to two decimals.
print(f'Accuracy: {round(score * 100, 2)}%')
print("\n Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 94.95%

 Classification Report:
               precision    recall  f1-score   support

        FAKE       0.94      0.96      0.95       648
        REAL       0.96      0.94      0.95       619

    accuracy                           0.95      1267
   macro avg       0.95      0.95      0.95      1267
weighted avg       0.95      0.95      0.95      1267


 Confusion Matrix:
 [[623  25]
 [ 39 580]]


## Using LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Logistic regression with scikit-learn's default settings, trained on the
# same TF-IDF features as the other models.
lr_model = LogisticRegression()
lr_model.fit(X=x_train_v, y=y_train)

In [31]:
# Score the logistic-regression model on the held-out articles.
y_pred = lr_model.predict(x_test_v)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9226519337016574

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.90      0.96      0.93       648
        REAL       0.95      0.89      0.92       619

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267


Confusion Matrix:
 [[619  29]
 [ 69 550]]


## Using LinearSVC

In [32]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC

# Linear support-vector classifier trained on the TF-IDF features.
clf = LinearSVC(C=1.0)
clf.fit(X=x_train_v, y=y_train)

# Score it on the held-out articles.
y_pred = clf.predict(x_test_v)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9494869771112865

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.93      0.97      0.95       648
        REAL       0.97      0.93      0.95       619

    accuracy                           0.95      1267
   macro avg       0.95      0.95      0.95      1267
weighted avg       0.95      0.95      0.95      1267

Confusion Matrix:
 [[630  18]
 [ 46 573]]


# Build and save the final model

**LinearSVC and PassiveAggressiveClassifier delivered the highest accuracy (about 95% each), so I will proceed with the PassiveAggressiveClassifier as the final model.**


In [33]:
import pickle

# Persist the trained Passive-Aggressive model. The context manager makes
# sure the file handle is flushed and closed even if pickling fails.
# NOTE(review): only the classifier is saved here; scoring new text also
# needs the fitted TfidfVectorizer (`vectorize`) — confirm it is persisted
# elsewhere.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pac_model, model_file)