# Fake News Classifier
## NL1 project - Matteo Santelmo

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from platform import python_version
import nltk
from keras.utils.vis_utils import plot_model
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np

In [2]:
df=pd.read_csv('train.csv') #import dataset

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df=df.dropna() #drop missing values (nan values)
x=df.drop('label',axis=1) #drop also the label 
y=df['label'] #the label that tells me wheter the news is fake or not will be the output

In [5]:
print("Tensorflow -> ",tf.__version__) # 2.7.0
print("Python -> ",python_version())   # 3.8.12

Tensorflow ->  2.6.2
Python ->  3.6.13


In [6]:
voc_size=5000 #vocabulary size

## Data cleaning  

In [7]:
messages=x.copy()
messages.reset_index(inplace=True)

In [8]:
messages['title'][2]
# messages['text'][3]

'Why the Truth Might Get You Fired'

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data preprocessing

In [10]:
#stemming is the process of reducing words to their word stem, base or root form
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i]) #replace non literal characters with a space
    review = review.lower() #lowercase
    review = review.split() #split into a list of words
                            #in the following line I apply the stemming process to every single word
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
# through one hot every word is represented as a number that is specific for that single word
onehot_repr=[one_hot(words,voc_size)for words in corpus] 


## Creating model

In [52]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length) # fix sentences' lentgh
#embedded_docs=embedded_docs/voc_size

In [53]:
# the embedding layer converts the input into a vector with a specific number of features
#embedding_vector_features=50
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape=(20,),activation='relu'),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 64)                1344      
_________________________________________________________________
dense_16 (Dense)             (None, 24)                1560      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 25        
Total params: 2,929
Trainable params: 2,929
Non-trainable params: 0
_________________________________________________________________


In [54]:
len(embedded_docs),y.shape

(18285, (18285,))

In [55]:
x_final=np.array(embedded_docs)
y_final=np.array(y)

In [56]:
x_final.shape,y_final.shape

((18285, 20), (18285,))

In [57]:
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [58]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,verbose=2)

Epoch 1/10
383/383 - 2s - loss: 27.4625 - accuracy: 0.6937 - val_loss: 11.5313 - val_accuracy: 0.7423
Epoch 2/10
383/383 - 1s - loss: 9.0764 - accuracy: 0.7230 - val_loss: 5.9548 - val_accuracy: 0.7387
Epoch 3/10
383/383 - 1s - loss: 6.7148 - accuracy: 0.7221 - val_loss: 6.2463 - val_accuracy: 0.7259
Epoch 4/10
383/383 - 1s - loss: 4.8650 - accuracy: 0.7282 - val_loss: 4.7941 - val_accuracy: 0.7102
Epoch 5/10
383/383 - 1s - loss: 3.8988 - accuracy: 0.7265 - val_loss: 8.4253 - val_accuracy: 0.5051
Epoch 6/10
383/383 - 1s - loss: 3.9016 - accuracy: 0.7274 - val_loss: 3.0615 - val_accuracy: 0.7548
Epoch 7/10
383/383 - 1s - loss: 3.3671 - accuracy: 0.7270 - val_loss: 5.6387 - val_accuracy: 0.7039
Epoch 8/10
383/383 - 1s - loss: 3.2880 - accuracy: 0.7238 - val_loss: 2.9333 - val_accuracy: 0.7306
Epoch 9/10
383/383 - 1s - loss: 2.5751 - accuracy: 0.7263 - val_loss: 1.9618 - val_accuracy: 0.7524
Epoch 10/10
383/383 - 1s - loss: 2.3609 - accuracy: 0.7256 - val_loss: 2.8126 - val_accuracy: 0.76

<keras.callbacks.History at 0x21ac75ddc88>

### Performance Metrics And Accuracy

In [66]:
predict_x=model.predict(x_test)
y_pred=(model.predict(x_test) > 0.5).astype("int32")
#y_pred=np.argmax(model.predict(x_test),axis=1)

In [67]:
confusion_matrix(y_test,y_pred)

array([[2324, 1095],
       [ 331, 2285]], dtype=int64)

In [68]:
accuracy_score(y_test,y_pred)

0.7637116818558409