# Fake News Classifier
## NL1 project - Matteo Santelmo

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from platform import python_version
import nltk
from keras.utils.vis_utils import plot_model
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np

In [2]:
df=pd.read_csv('train.csv') # load the training split of the dataset; train.csv must sit in the working directory

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Discard every row holding at least one missing (NaN) value, then separate
# the input columns from the target column.
df = df.dropna()
x = df.drop(columns='label')  # inputs: every column except the target
y = df['label']               # output: the label that tells whether the news is fake or not

In [5]:
# Log the TensorFlow and Python versions used for this run.
print("Tensorflow -> ",tf.__version__) # 2.6.2 in the run recorded below
print("Python -> ",python_version())   # 3.6.13 in the run recorded below

Tensorflow ->  2.6.2
Python ->  3.6.13


In [6]:
voc_size=5000 # vocabulary size: upper bound for the integer word ids and input dimension of the Embedding layer

## Data cleaning  

In [7]:
# Work on a separate frame with a fresh 0..n-1 index (dropna left gaps in the
# original one); the previous index is kept as an 'index' column and x is untouched.
messages = x.reset_index()

In [8]:
# Spot-check one sample: the title stored under index label 2.
messages['title'][2]
# messages['text'][3]

'Why the Truth Might Get You Fired'

In [9]:
# Make sure the NLTK stop-word corpus is available locally before it is used for cleaning.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Data preprocessing

In [10]:
# Build one cleaned string per title:
#   1. replace every non-letter character with a space,
#   2. lowercase and split into words,
#   3. drop English stop words,
#   4. stem the remaining words (stemming reduces a word to its stem/root form).
ps = PorterStemmer()
# Load the stop-word list a single time and keep it in a set: calling
# stopwords.words('english') inside the loop rebuilt the list for every word and
# searched it linearly, although its content never changes.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')
corpus = []
for title in messages['title']:
    words = non_letters.sub(' ', title).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in words if word not in english_stopwords))

In [11]:
# Map every word of each cleaned title to an integer id below voc_size using the
# hashing trick. Ids are hash buckets, so two different words may share one id.
# one_hot() hashes with Python's built-in hash(), which is salted per process for
# strings: the word -> id mapping would change on every kernel restart, so results
# could not be reproduced and a saved model could not be reused. 'md5' is stable.
onehot_repr=[tf.keras.preprocessing.text.hashing_trick(words,voc_size,hash_function='md5') for words in corpus]

## Creating model

In [12]:
# Fix the sentences' length: every title becomes a sequence of exactly
# sent_length ids, padded at the front ('pre') when it is shorter.
sent_length = 20
embedded_docs = pad_sequences(
    sequences=onehot_repr,
    maxlen=sent_length,
    padding='pre',
)

In [13]:
# Network: Embedding -> GlobalMaxPooling1D -> Dense(32, relu) -> Dense(1, sigmoid).
# The embedding layer turns each word id into a dense vector with
# embedding_vector_features components.
embedding_vector_features = 64

model = Sequential()
model.add(Embedding(input_dim=voc_size,
                    output_dim=embedding_vector_features,
                    input_length=sent_length))
model.add(tf.keras.layers.GlobalMaxPooling1D())  # keep the max of each feature over the sequence
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))  # single probability output

#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 64)            320000    
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Sanity check: the number of padded sequences should equal the number of labels.
len(embedded_docs),y.shape

(18285, (18285,))

In [15]:
# Turn the padded sequences and the label column into NumPy arrays.
x_final, y_final = np.array(embedded_docs), np.array(y)

In [16]:
# Confirm the array shapes: (samples, sent_length) for the inputs, (samples,) for the labels.
x_final.shape,y_final.shape

((18285, 20), (18285,))

In [17]:
# Hold out 33% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

### Model Training

In [18]:
# Train for 10 epochs with one log line per epoch (verbose=2).
# Note: the held-out test split is also used as validation data here.
model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=2,
)

Epoch 1/10
383/383 - 3s - loss: 0.3504 - accuracy: 0.8380 - val_loss: 0.2148 - val_accuracy: 0.9072
Epoch 2/10
383/383 - 2s - loss: 0.1473 - accuracy: 0.9429 - val_loss: 0.1996 - val_accuracy: 0.9198
Epoch 3/10
383/383 - 2s - loss: 0.0828 - accuracy: 0.9736 - val_loss: 0.2245 - val_accuracy: 0.9173
Epoch 4/10
383/383 - 2s - loss: 0.0419 - accuracy: 0.9895 - val_loss: 0.2584 - val_accuracy: 0.9158
Epoch 5/10
383/383 - 2s - loss: 0.0198 - accuracy: 0.9963 - val_loss: 0.2839 - val_accuracy: 0.9152
Epoch 6/10
383/383 - 2s - loss: 0.0099 - accuracy: 0.9990 - val_loss: 0.3196 - val_accuracy: 0.9137
Epoch 7/10
383/383 - 2s - loss: 0.0060 - accuracy: 0.9994 - val_loss: 0.3416 - val_accuracy: 0.9132
Epoch 8/10
383/383 - 2s - loss: 0.0040 - accuracy: 0.9994 - val_loss: 0.3599 - val_accuracy: 0.9133
Epoch 9/10
383/383 - 2s - loss: 0.0025 - accuracy: 0.9997 - val_loss: 0.3775 - val_accuracy: 0.9120
Epoch 10/10
383/383 - 2s - loss: 0.0020 - accuracy: 0.9997 - val_loss: 0.3916 - val_accuracy: 0.9112

<keras.callbacks.History at 0x1594796fbe0>

In [19]:
#model.save('float_model')

### Performance Metrics And Accuracy

In [20]:
# Run inference once and reuse the predicted probabilities: the second
# model.predict(x_test) call repeated the same work while predict_x went unused.
predict_x=model.predict(x_test)
# The model ends in a single sigmoid unit, so class labels come from thresholding
# the probability at 0.5 (np.argmax over axis 1 would always give 0 here).
y_pred=(predict_x > 0.5).astype("int32")
y_pred[0]

array([1])

In [21]:
# Confusion matrix of true labels (y_test) against predicted labels (y_pred) on the test split.
confusion_matrix(y_test,y_pred)

array([[3118,  301],
       [ 235, 2381]], dtype=int64)

In [22]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test,y_pred)

0.9111847555923778