<a href="https://colab.research.google.com/github/omkarakolkar/Machine-Learning-projects/blob/main/P3_Fake_News_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("algord/fake-news")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/fake-news


In [16]:
import numpy as np
import pandas as pd
import re #regular expression lib for searching a words in a text or a paragraph
from nltk.corpus import stopwords #natural lang toolkit stopwords for removing words which don't add value to our prediction
#eg. where what as
from nltk.stem.porter import PorterStemmer #removes the prefix and suffix and give the root word
from sklearn.feature_extraction.text import TfidfVectorizer #converting text into feature vectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Pre-processing

In [18]:
import os
print(os.listdir(path))
df = pd.read_csv(os.path.join(path, "FakeNewsNet.csv"))

['FakeNewsNet.csv']


In [19]:
df.shape

(23196, 5)

In [20]:
df.head(10)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1
5,Gwen Stefani Got Dumped by Blake Shelton Over ...,www.intouchweekly.com/posts/gwen-stefani-dumpe...,www.intouchweekly.com,45,0
6,Broward County Sheriff Fired For Lying About P...,https://yournewswire.com/broward-county-sherif...,yournewswire.com,124,0
7,Amber Rose Shuts Down French Montana Dating Ru...,www.etonline.com/news/214798_amber_rose_shuts_...,www.etonline.com,4,0
8,Mindy Kaling makes first post-baby appearance ...,https://www.aol.com/article/entertainment/2018...,www.aol.com,59,1
9,Katharine McPhee Butchers Tony Nominations: “I...,https://www.98online.com/2018/05/02/katharine-...,www.98online.com,10,1


In [21]:
df.isnull().sum()

Unnamed: 0,0
title,0
news_url,330
source_domain,330
tweet_num,0
real,0


In [22]:
dfnew = df.fillna('') # filling empty data with null string using fillna

In [23]:
dfnew.isnull().sum()

Unnamed: 0,0
title,0
news_url,0
source_domain,0
tweet_num,0
real,0


In [40]:
# seperating labels and data
y = dfnew['real']

# Stemming

a process of reducing a word to its root word

In [25]:
port_stem = PorterStemmer()

In [26]:
def stemming(title):
  stemmed_content = re.sub('[^a-zA-Z]',' ', title) # removing symbols
  stemmed_content = stemmed_content.lower() # converting all letters to lower case, as uppercase can mean significance
  stemmed_content = stemmed_content.split() #splitted and converted to list
  # for removing stopwords and stemming the non stopword words
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content) # joining all words
  return stemmed_content

In [27]:
dfnew['title'] = dfnew['title'].apply(stemming)

In [28]:
print(dfnew['title'])

0        kandi burruss explod rape accus real housew at...
1                   peopl choic award best red carpet look
2        sophia bush send sweet birthday messag one tre...
3        colombian singer maluma spark rumour inappropr...
4        gossip girl year later upper east sider shock ...
                               ...                        
23191    pippa middleton wed case miss pippa marri lace...
23192    zayn malik gigi hadid shock split chanc reunit...
23193    jessica chastain recal moment mother boyfriend...
23194    tristan thompson feel dump khlo kardashian ref...
23195    kelli clarkson perform medley kendrick lamar h...
Name: title, Length: 23196, dtype: object


# Tf-idf-vectorization


In [37]:
vectorizer = TfidfVectorizer() #TF-IDF stands for Term Frequency - Inverse Document Frequency. It’s a numerical statistic
# used in text processing to reflect how important a word is to a document in a collection (or corpus).

vectorizer.fit(dfnew['title'])

title = vectorizer.transform(dfnew['title']) ######### title column preprocessed

In [39]:
print(title)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 182971 stored elements and shape (23196, 12546)>
  Coords	Values
  (0, 53)	0.2721149519149911
  (0, 576)	0.32205761515065545
  (0, 1524)	0.4060028523437122
  (0, 3739)	0.37202135470266046
  (0, 5246)	0.27372476382132127
  (0, 5895)	0.4060028523437122
  (0, 8975)	0.3165911519884338
  (0, 9020)	0.24249280188561428
  (0, 9287)	0.26647854102470714
  (0, 11959)	0.2217815349642914
  (1, 643)	0.31690546637483147
  (1, 986)	0.34762560771735407
  (1, 1710)	0.3978429114527965
  (1, 1975)	0.43378159796649224
  (1, 6573)	0.3420111696667716
  (1, 8315)	0.4067948402953306
  (1, 9079)	0.387076508000102
  (2, 1069)	0.19438975915734968
  (2, 1361)	0.37698546923989756
  (2, 1528)	0.3313932058546551
  (2, 1529)	0.2679606830015344
  (2, 2132)	0.22767646060985314
  (2, 3657)	0.25142880383632443
  (2, 5100)	0.22534648956135603
  (2, 5104)	0.2418360754414745
  :	:
  (23193, 9046)	0.32490265953129527
  (23193, 10239)	0.39707689058614987
  (23194, 3

# One hot encoding for source domain

In [34]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
domain = encoder.fit_transform(dfnew[['source_domain']]) ######### source domain column preprocessed

In [35]:
print(domain)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [48]:
from scipy.sparse import csr_matrix, issparse

print(f"Type of domain: {type(domain)}")

if not issparse(domain):
    print("Converting X_domain to sparse matrix...")
    domain = csr_matrix(domain)
else:
    print("X_domain is already sparse.")

print(f"Shape: {domain.shape}")
print(f"Number of non-zero elements: {domain.nnz}")

Type of X_domain: <class 'numpy.ndarray'>
Converting X_domain to sparse matrix...
Shape of X_domain: (23196, 2442)
Number of non-zero elements in X_domain: 23196


# Sparsing for number of tweets data.

In [47]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack

#Extract tweet_num as 2D array
tweet_num_values = dfnew[['tweet_num']].values

#Scale
scaler = StandardScaler()
tweet_num_scaled = scaler.fit_transform(tweet_num_values)

#Convert to sparse matrix
tweets = csr_matrix(tweet_num_scaled)

print(tweets)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23196 stored elements and shape (23196, 1)>
  Coords	Values
  (0, 0)	-0.0960882643054275
  (1, 0)	-0.18203336388292424
  (2, 0)	-0.05311571451667912
  (3, 0)	-0.14110712598887817
  (4, 0)	-0.10427351188423671
  (5, 0)	-0.08994932862132059
  (6, 0)	0.07170931106016139
  (7, 0)	-0.17384811630411504
  (8, 0)	-0.06130096209548833
  (9, 0)	-0.1615702449359012
  (10, 0)	-0.1451997497782828
  (11, 0)	0.012366266113794593
  (12, 0)	-0.17384811630411504
  (13, 0)	-0.18203336388292424
  (14, 0)	-0.18203336388292424
  (15, 0)	-0.05311571451667912
  (16, 0)	-0.07153252156899985
  (17, 0)	-0.05106940262197682
  (18, 0)	-0.17998705198822193
  (19, 0)	-0.14315343788358048
  (20, 0)	0.06352406348135219
  (21, 0)	-0.1656628687253058
  (22, 0)	0.004181018534985378
  (23, 0)	-0.153384997357092
  (24, 0)	-0.08585670483191597
  :	:
  (23171, 0)	-0.13292187841006894
  (23172, 0)	0.012366266113794593
  (23173, 0)	-0.1001808880948321
  (23174, 0)	-

# hstack

In [49]:
from scipy.sparse import hstack

x = hstack([title, domain, tweets])

In [50]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 229363 stored elements and shape (23196, 14989)>
  Coords	Values
  (0, 53)	0.2721149519149911
  (0, 576)	0.32205761515065545
  (0, 1524)	0.4060028523437122
  (0, 3739)	0.37202135470266046
  (0, 5246)	0.27372476382132127
  (0, 5895)	0.4060028523437122
  (0, 8975)	0.3165911519884338
  (0, 9020)	0.24249280188561428
  (0, 9287)	0.26647854102470714
  (0, 11959)	0.2217815349642914
  (0, 13519)	1.0
  (0, 14988)	-0.0960882643054275
  (1, 643)	0.31690546637483147
  (1, 986)	0.34762560771735407
  (1, 1710)	0.3978429114527965
  (1, 1975)	0.43378159796649224
  (1, 6573)	0.3420111696667716
  (1, 8315)	0.4067948402953306
  (1, 9079)	0.387076508000102
  (1, 14806)	1.0
  (1, 14988)	-0.18203336388292424
  (2, 1069)	0.19438975915734968
  (2, 1361)	0.37698546923989756
  (2, 1528)	0.3313932058546551
  (2, 1529)	0.2679606830015344
  :	:
  (23194, 3907)	0.2822937698497612
  (23194, 5165)	0.2766025175314641
  (23194, 5908)	0.18341672952604757
  (2

# train test splitting

In [51]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, stratify =y, random_state = 2)

# Model training

In [54]:
model = LogisticRegression()

In [56]:
model.fit(xtrain, ytrain)

# Evaluation

In [57]:
xtrain_prediction = model.predict(xtrain)
training_data_acc = accuracy_score(xtrain_prediction, ytrain)

In [58]:
print(training_data_acc)

0.8975533520155206


In [59]:
xtest_prediction = model.predict(xtest)
test_data_acc = accuracy_score(xtest_prediction, ytest)
print(test_data_acc)

0.865948275862069
