<a href="https://colab.research.google.com/github/man1sh1409/fakeNewsSocialMediaProspective/blob/main/fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [11]:
# read all the datasets
df1=pd.read_csv('/content/gossipcop_real.csv',usecols=['title'])

In [12]:
df1

Unnamed: 0,title
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...
1,Kylie Jenner refusing to discuss Tyga on Life ...
2,Quinn Perkins
3,I Tried Kim Kardashian's Butt Workout & Am For...
4,Celine Dion donates concert proceeds to Vegas ...
...,...
16812,2017 Hollywood Film Awards: The Complete List ...
16813,Jada Pinkett Smith explains why son Jaden move...
16814,Tinsley Mortimer Reacts to Luann de Lesseps' R...
16815,Prince Harry Carries on Princess Diana’s Legac...


In [13]:
df2=pd.read_csv('/content/politifact_real.csv',usecols=['title'])
df2

Unnamed: 0,title
0,National Federation of Independent Business
1,comments in Fayetteville NC
2,"Romney makes pitch, hoping to close deal : Ele..."
3,Democratic Leaders Say House Democrats Are Uni...
4,"Budget of the United States Government, FY 2008"
...,...
619,Flake: “Religious tests should have no place i...
620,Change We Can Believe In
621,deputy director of national health statistics ...
622,Romneys ProLife Conversion Myth or Reality Jun...


In [14]:
df3=pd.read_csv('/content/gossipcop_fake.csv',usecols=['title'])
df3

Unnamed: 0,title
0,Did Miley Cyrus and Liam Hemsworth secretly ge...
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...
2,Celebrities Join Tax March in Protest of Donal...
3,Cindy Crawford's daughter Kaia Gerber wears a ...
4,Full List of 2018 Oscar Nominations – Variety
...,...
5318,September 11: Celebrities Remember 9/11 (TWEETS)
5319,NASCAR owners threaten to fire drivers who pro...
5320,The 7 signs that David Beckham is definitely h...
5321,Ryan Gosling and Eva Mendes Did Not Get Marrie...


In [15]:
df4=pd.read_csv('/content/politifact_fake.csv',usecols=['title'])
df4

Unnamed: 0,title
0,BREAKING: First NFL Team Declares Bankruptcy O...
1,Court Orders Obama To Pay $400 Million In Rest...
2,UPDATE: Second Roy Moore Accuser Works For Mic...
3,Oscar Pistorius Attempts To Commit Suicide
4,Trump Votes For Death Penalty For Being Gay
...,...
427,Who is affected by the government shutdown?
428,Lindsey Graham Threatens To Convert To Democra...
429,ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A...
430,Sarah Palin Calls To Boycott Mall Of America B...


In [5]:
df1=df1.drop(labels=['id','news_url','tweet_ids'],axis=1)
df2=df2.drop(labels=['id','news_url','tweet_ids'],axis=1)
df3=df3.drop(labels=['id','news_url','tweet_ids'],axis=1)
df4=df4.drop(labels=['id','news_url','tweet_ids'],axis=1)


In [16]:
#combine the real and fake datasets
df_real=pd.concat([df1,df2],axis=0)
df_fake=pd.concat([df3,df4],axis=0)

In [7]:
# check for null and empty value
df_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5947 entries, 0 to 623
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5947 non-null   object
dtypes: object(1)
memory usage: 92.9+ KB


In [8]:
# check data sample of real
df_real.sample(5)

Unnamed: 0,title
3358,Connecting People Through News
243,Barack Obama’s Remarks in St. Paul
3894,Keith in heat
2790,Jennifer Aniston And Justin Theroux Finally Re...
4031,Emma Stone


In [9]:
df_real.shape

(5947, 1)

In [17]:
# remove title having less than 5 words
df_real=df_real[~df_real.title.str.count('\s+').lt(4)]

In [11]:
# check shape of real datasets
df_real.shape

(5501, 1)

In [18]:
# add label column for real news
df_real['label']=0
df_real

Unnamed: 0,title,label
0,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,0
1,Kylie Jenner refusing to discuss Tyga on Life ...,0
3,I Tried Kim Kardashian's Butt Workout & Am For...,0
4,Celine Dion donates concert proceeds to Vegas ...,0
5,"Chris Evans, Millie Bobby Brown, Snoop Dogg an...",0
...,...,...
618,"Trump asking Congress, not Mexico, to pay for ...",0
619,Flake: “Religious tests should have no place i...,0
620,Change We Can Believe In,0
621,deputy director of national health statistics ...,0


In [20]:
# similarly for fake news also
df_fake=df_fake[~ df_fake.title.str.count('\s+').lt(4)]
# add label column
df_fake['label']=1
df_fake

Unnamed: 0,title,label
0,Did Miley Cyrus and Liam Hemsworth secretly ge...,1
1,Paris Jackson & Cara Delevingne Enjoy Night Ou...,1
2,Celebrities Join Tax March in Protest of Donal...,1
3,Cindy Crawford's daughter Kaia Gerber wears a ...,1
4,Full List of 2018 Oscar Nominations – Variety,1
...,...,...
426,BUSTED: Russian Mansions Obama Seized Were Mea...,1
427,Who is affected by the government shutdown?,1
428,Lindsey Graham Threatens To Convert To Democra...,1
429,ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A...,1


In [21]:
# make a dataset as a whole so that we can train our model
df=pd.concat([df_real,df_fake])

In [22]:
# shuffle the final dataset to ovecome the over fitting 
df=df.sample(frac=1)
df

Unnamed: 0,title,label
2547,Candice Swanepoel Welcomes Her Second Child,0
12522,Emmy Nominations 2018: The 20 Biggest Snubs an...,0
248,All Citizens Who Want to Receive Gvt Benefits ...,1
15225,Jessie James Decker's Baby Niece Has Sweet Hom...,0
9499,10 Best-Dressed Stars at the 2018 Kids’ Choice...,0
...,...,...
4584,Blake Shelton: What He Really Thinks of The Gw...,1
9343,Who Killed JonBenet Ramsey? 8 Possible Suspects,0
4126,Prince Louis Christening Outfit: All the Detai...,0
5107,Anna Wintour Responds to Mario Testino and Bru...,0


In [23]:
df.shape

(21893, 2)

In [24]:
# count the distinct label value
df.label.value_counts()

0    16460
1     5433
Name: label, dtype: int64

In [25]:
# download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# download wordNet for english language from laxical database
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
#Text preprocessing
def preprocessing(tweet):
  text=BeautifulSoup(tweet).get_text() # Remove html tags
  text=re.sub("[^a-zA-Z#]"," ",text) # remove special characters
  test=re.sub('((www.[^s]+) | (https?://[^s]+))',' ',text) # remove urls
  text=text.lower().split() # convert to lowercase and split each words
  
  stop_w=set(stopwords.words("english")) # set of all stopwords for fast search
  text=[w for w in text if not w in stop_w] # reomve stop words
  text=[WordNetLemmatizer().lemmatize(w) for w in text] # Lemmatization if each word

  return (" ".join(text)) # return words after joining with each word separator



In [28]:
# clean the title of dataset
df['title']=df['title'].apply(lambda text: preprocessing(text))
df['title'].head()

  text=BeautifulSoup(tweet).get_text() # Remove html tags


2547                candice swanepoel welcome second child
12522                emmy nomination biggest snub surprise
248      citizen want receive gvt benefit must agree mi...
15225    jessie james decker baby niece sweet homecomin...
9499              best dressed star kid choice sport award
Name: title, dtype: object

In [29]:
# spliting the data into train and test 

train,test=train_test_split(df,test_size=0.2,stratify=df['label'],random_state=42)
train.shape,test.shape

((17514, 2), (4379, 2))

In [30]:
# Create a TF-IDF vectorizer object
tfidf_vec = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

# Data fitting and transformation
train_df = tfidf_vec.fit_transform(train.title)
test_df  = tfidf_vec.transform(test.title)

In [31]:
# before over-sampling the minority class
train['label'].value_counts()

0    13168
1     4346
Name: label, dtype: int64

In [32]:
# Use SMOTE (Synthetic Minority Oversampling Technique) for dealing with class imbalance
smt = SMOTE(random_state = 18, sampling_strategy = 1.0)
smt_xtrain_df, smt_ytrain = smt.fit_resample(train_df, train.label)
smt_ytrain.value_counts()

0    13168
1    13168
Name: label, dtype: int64

In [33]:
# fit into model
# MOdel 1: Logistic Regressor
logis_reg=LogisticRegression()
logis_reg.fit(smt_xtrain_df,smt_ytrain)

In [34]:
y_pred=logis_reg.predict(test_df)

In [35]:
# evaluation metrics
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.91      0.85      0.88      3292
    Positive       0.62      0.73      0.67      1087

    accuracy                           0.82      4379
   macro avg       0.76      0.79      0.78      4379
weighted avg       0.84      0.82      0.83      4379



In [36]:
# Model 2 - Linear Support Vector Classifier
linear_svc = LinearSVC()
linear_svc.fit(smt_xtrain_df, smt_ytrain)

y_pred = linear_svc.predict(test_df)

# Evaluation metrics
print(classification_report(test.label, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.89      0.85      0.87      3292
    Positive       0.60      0.68      0.64      1087

    accuracy                           0.81      4379
   macro avg       0.74      0.76      0.75      4379
weighted avg       0.82      0.81      0.81      4379

