# Notebook for Fake and Real News Dataset
link: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset/data


## 1. Import General Libraries

In [66]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## 2. Import Dataset

In [67]:
# Load the two source files: True.csv into `true`, Fake.csv into `fake`.
true, fake = (pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv'))

In [68]:
# Preview the first rows of the frame loaded from True.csv.
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [69]:
# Preview the first rows of the frame loaded from Fake.csv.
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## 3. General Information

### Data size

In [70]:
# Report row/column counts for both frames in a single print call; the empty
# string produces the blank separator line between the two groups.
print(
    f"true rows: {true.shape[0]}",
    f"true columns: {true.shape[1]}",
    "",
    f"fake rows: {fake.shape[0]}",
    f"fake columns: {fake.shape[1]}",
    sep="\n",
)

true rows: 21417
true columns: 4

fake rows: 23481
fake columns: 4


### Data Information

In [71]:
# Column names, non-null counts and dtypes of the True.csv frame.
true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [72]:
# Column names, non-null counts and dtypes of the Fake.csv frame.
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


## 4. Preprocessing and Data Cleaning

### Preprocessing

#### Add new Feature

True news as 1<br>
Fake news as 0

In [73]:
# Target column: 1 marks rows from `true`, 0 marks rows from `fake`.
true['label'], fake['label'] = 1, 0

In [74]:
# Check that the new `label` column is present on the True.csv frame.
true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [75]:
# Check that the new `label` column is present on the Fake.csv frame.
fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


#### Drop Unnecessary Columns

In [76]:
# Columns removed from both frames before modelling; only title, text and
# label are kept.
# NOTE(review): presumably dropped because `subject` values differ between the
# two files (e.g. politicsNews vs News in the previews) and would leak the
# label — confirm.
drop_columns = ['subject', 'date'] 

In [77]:
# Reassign rather than drop in place, and ignore columns that are already
# gone, so that re-running this cell does not raise KeyError.
true = true.drop(columns=drop_columns, errors='ignore')
fake = fake.drop(columns=drop_columns, errors='ignore')

#### Make new dataset

In [78]:
# Combined dataset used for the rest of the notebook: the rows of `true`
# come first, followed by the rows of `fake`, under a fresh 0..n-1 index
# (ignore_index=True).
news = pd.concat([true, fake], ignore_index=True)

In [79]:
# First rows of the combined frame that carry label 1.
news[news['label'] == 1].head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [80]:
# First rows of the combined frame that carry label 0.
news[news['label'] == 0].head()

Unnamed: 0,title,text,label
21417,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
21418,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
21419,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
21420,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
21421,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


### Data Cleaning

#### Missing Values

In [81]:
# Count missing values per column of the combined frame.
news.isna().sum()

title    0
text     0
label    0
dtype: int64

#### Drop Duplicates

In [82]:
# Remove duplicated rows from `news` in place. The index is not reset, so it
# keeps gaps where rows were removed.
news.drop_duplicates(inplace=True)

#### Drop NA

In [83]:
# Remove rows containing missing values, in place. The missing-value check
# earlier in this section reported none, so this is expected to remove nothing.
news.dropna(inplace=True)

#### New Data Info

In [84]:
# Summary of the combined frame after duplicate and missing-value removal.
news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39105 entries, 0 to 44119
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   39105 non-null  object
 1   text    39105 non-null  object
 2   label   39105 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


### 4.2. Preprocessing

### Stored Label

In [85]:
# Separate the target column from the remaining feature columns.
y = news['label']
x = news.drop(columns='label')

### Words Counting

In [86]:
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch every NLTK resource this notebook relies on:
#   punkt / punkt_tab -> word_tokenize (which of the two is required depends
#                        on the installed NLTK version, so request both)
#   wordnet           -> WordNetLemmatizer
#   stopwords         -> stopwords.words("english")
# Without the Punkt models, word_tokenize raises LookupError on a machine
# that has not downloaded them before.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gabrielmarcellinojoestiawan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabrielmarcellinojoestiawan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
# Lazily-built NLTK helpers shared by every process_text call, so the
# lemmatizer and the stop-word list are created once, not once per document.
_PROCESS_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _PROCESS_TEXT_TOOLS:
        _PROCESS_TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _PROCESS_TEXT_TOOLS['stop_words'] = frozenset(stopwords.words("english"))
    return _PROCESS_TEXT_TOOLS['lemmatizer'], _PROCESS_TEXT_TOOLS['stop_words']


def process_text(text):
    """Turn a raw string into an ordered list of unique, cleaned tokens.

    Steps, in order:
      1. coerce the input to ``str`` and collapse whitespace runs;
      2. replace non-word characters with spaces, remove isolated single
         letters, then remove everything that is not an ASCII letter or
         whitespace;
      3. lowercase and tokenize with ``word_tokenize``;
      4. lemmatize each token with ``WordNetLemmatizer``;
      5. drop English stop words and tokens of three characters or fewer;
      6. drop repeated tokens, keeping the first occurrence of each.

    Parameters
    ----------
    text : object
        Text to clean. Non-string values are converted with ``str()`` first.

    Returns
    -------
    list of str
        Cleaned tokens in order of first appearance (empty if nothing survives).
    """
    # Coerce before the first regex so a non-string cell does not raise TypeError.
    text = str(text)

    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove isolated single letters
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove anything that isn't alphabetical
    text = text.lower()

    lemmatizer, stop_words = _get_text_tools()

    # Lemmatize first, then filter, so the filters see the lemmatized form.
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    words = [word for word in words if word not in stop_words and len(word) > 3]

    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(words))

### Cleaned the sentence in both text and title

In [88]:
# Run the cleaning function over every article body and every headline.
cleaned_texts = [process_text(body) for body in news['text']]
cleaned_title = [process_text(headline) for headline in news['title']]

### 4.3 Data Split

In [89]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2) with a fixed
# random_state.
# Active variant: predict from the cleaned article text.
x_train,x_test,y_train,y_test = train_test_split(cleaned_texts,y,test_size=0.2,random_state=42)
# Alternative variant: predict from the cleaned title instead (use this line
# in place of the one above).
# x_train,x_test,y_train,y_test = train_test_split(cleaned_title,y,test_size=0.2,random_state=42)

In [90]:
# Number of samples in each split.
print(f"Train data: {len(x_train)}")
print(f"Test data: {len(x_test)}")

Train data: 31284
Test data: 7821


### 4.4. Label Encoded and Tokenized

In [91]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training labels only, then
# apply that same mapping to the test labels. Re-fitting on the test labels
# (fit_transform) could give a different mapping if the test split does not
# contain exactly the same set of classes.
labelEncoder = LabelEncoder()
y_train_encoded = labelEncoder.fit_transform(y_train)
y_test_encoded = labelEncoder.transform(y_test)

In [92]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training split only, then convert both splits to
# integer sequences with that fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train, x_test = (tokenizer.texts_to_sequences(split) for split in (x_train, x_test))

Limiting the sequence length to `maxlen`

The model needs all inputs to have the same size, which is why padding is necessary. <br> <br>
Not every text has the same number of words, so shorter sequences are padded with zeros. We also define a maximum number of words per sequence; if a sequence is longer than that, the extra words are dropped. <br><br>
The padding code is shown below:

source: https://medium.com/@canerkilinc/padding-for-nlp-7dd8598c916a


In [95]:
# Bring every sequence to `maxlen` entries.
# padding='post' adds the zeros after the real tokens. `truncating` is not
# passed, so over-long sequences are cut with the default setting, which the
# parameter table below describes as dropping words from the beginning.
# NOTE(review): confirm that cutting from the beginning is intended.
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)


| Parameter | Function | 
| --- | --- |
| padding = 'post' | adds zeros at the end of each sequence so that all samples have the same size |
| maxlen | defines the maximum number of words kept in each sequence |
| truncating | setting this to 'post' drops the last words of a sequence that exceeds the maximum length; the default setting drops words from the beginning instead. The padding cell above does not set it, so the default applies. |

In [103]:
# len(x_train) is the number of padded sequences (one per article), not a
# vocabulary size; the vocabulary learned from the training split is held in
# tokenizer.word_index.
print(f"there are {len(x_train)} samples in the training set")
print(f"there are {len(tokenizer.word_index)} unique words in the training vocabulary")

there are 31284 vocabs in the training set


In [102]:
# len(x_test) is the number of padded sequences in the test split, not a
# vocabulary size.
print(f"there are {len(x_test)} samples in the test set")

there are 7821 vocabs in the test set


## 5. Start to Predict

### 5.1. Build a Model

### 5.2.