<a href="https://colab.research.google.com/github/naman9810/Fake-News-Detection/blob/main/major_project_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import tensorflow as tf
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from tensorflow.keras import regularizers
import os

In [None]:
def parse(filename):
  """Load a file of concatenated JSON values (e.g. JSON Lines) into a list.

  The file is read as UTF-8 and decoded one JSON value at a time, so the
  result does not depend on how the values are separated: a missing trailing
  newline, blank lines between records and records spanning several lines
  are all accepted.

  Args:
    filename: Path of the file to read.

  Returns:
    A list holding the decoded values in file order. An empty or
    whitespace-only file yields an empty list.

  Raises:
    json.JSONDecodeError: If the file contains text that is not valid JSON.
  """
  with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

  decoder = json.JSONDecoder()
  items = []
  pos = 0
  end = len(text)
  while True:
    # raw_decode() does not accept leading whitespace, so step over the
    # separators between values by hand.
    while pos < end and text[pos].isspace():
      pos += 1
    if pos >= end:
      break
    item, pos = decoder.raw_decode(text, pos)
    items.append(item)
  return items

In [None]:
# Mount Google Drive (google.colab helper) at /content/gdrive so the project
# folder used by the following cells is reachable from this runtime.
drive.mount("/content/gdrive")

In [None]:
# Work from the project folder on Drive: the relative paths used later
# ('train.jsonl', 'train.csv', 'saved_model/my_model') resolve against it.
os.chdir("/content/gdrive/MyDrive/fake_news")

In [None]:
# Decode the raw records with parse() (defined earlier in this notebook),
# tabulate them as a DataFrame, and write a CSV copy without the index column.
data_list=parse('train.jsonl')
data=pd.DataFrame(data_list)
data.to_csv('train.csv',index=False)

In [None]:
# Number of rows per label value; the 'NOT ENOUGH INFO' rows counted here are
# filtered out in the next step.
data['label'].value_counts()

In [None]:
# Keep only the rows whose label is something other than 'NOT ENOUGH INFO'.
has_verdict = data['label'] != 'NOT ENOUGH INFO'
data = data.loc[has_verdict]

In [None]:
# Column dtypes and non-null counts of the frame after the label filter.
data.info()

In [None]:
# One-hot encode the label. drop_first=True discards the first category's
# column; the 'SUPPORTS' column that remains is used as the target later on.
d = pd.get_dummies(data['label'], drop_first=True)

# Swap the metadata columns and the raw label for the encoded label column.
data = pd.concat(
    [data.drop(columns=['id', 'verifiable', 'evidence', 'label']), d],
    axis=1,
)
print(data)

In [None]:
# Build the word index from every claim, then replace each claim's text with
# its list of integer token ids.
# NOTE(review): the vocabulary is fitted before the train/test split further
# down, so it also covers words that occur only in the held-out rows.
claim_texts = data['claim']
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(claim_texts)
data['claim'] = tokenizer.texts_to_sequences(claim_texts)

In [None]:
# Token count of every claim; the distribution is used to pick the padding
# length in the next cell.
len_text = np.array([len(tokens) for tokens in data['claim']])

# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement for the same figure.
ax = sns.histplot(len_text, kde=True, stat='density')
ax.set(title='Claim length distribution', xlabel='Tokens per claim')
plt.show()

In [None]:
# Padding length: mean claim length plus two standard deviations, truncated
# to an integer.
m = len_text.mean()
std = len_text.std()
MAX_TEXT = int(m + 2 * std)
print(MAX_TEXT)

# Count the claims shorter than MAX_TEXT (False) against those that reach or
# exceed it (True).
is_long = len_text >= MAX_TEXT
np.unique(is_long, return_counts=True)

In [None]:
# The target is the one-hot 'SUPPORTS' column; every other column is a feature.
# Selecting the columns (rather than pop()) leaves `data` unmodified, so this
# cell can be re-run without raising KeyError on the second execution.
labels = data['SUPPORTS']
features = data.loc[:, data.columns != 'SUPPORTS']

# Hold out 5% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.05, random_state=2
)

In [None]:
# Bring every tokenised claim to exactly MAX_TEXT ids; both padding and
# truncation are applied at the end of the sequence ('post').
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

X_train_claim = pad_sequences(
    X_train['claim'], maxlen=MAX_TEXT, padding='post', truncating='post'
)
X_test_claim = pad_sequences(
    X_test['claim'], maxlen=MAX_TEXT, padding='post', truncating='post'
)

In [None]:
# All-zero matrix with one row per tokenizer index (plus one) and 100 columns.
# NOTE(review): no visible cell reads `embeddings_matrix` afterwards; it looks
# like a placeholder for pretrained word vectors that were never loaded - confirm.
embeddings_matrix = np.zeros((len(tokenizer.index_word)+1, 100))

In [None]:
def _l2():
  """Return a fresh L2 activity penalty (factor 2e-3) for one layer."""
  return regularizers.l2(2e-3)


layers = tf.keras.layers
# One embedding row per tokenizer index plus one extra row (presumably for the
# 0 id used by padding - confirm against the Keras Tokenizer docs).
vocab_size = len(tokenizer.index_word) + 1

# Embedding -> Conv1D/MaxPooling -> two stacked LSTMs -> dense head ending in a
# single sigmoid unit for the binary label.
# NOTE(review): the embedding is created with trainable=False and no initial
# weights are passed, so it appears to stay at its initial values for the whole
# training run - confirm this is intended.
stack = [
    layers.Embedding(vocab_size, 100, activity_regularizer=_l2(), trainable=False),
    layers.Dropout(0.2),
    layers.Conv1D(64, 5, activation='relu', activity_regularizer=_l2()),
    layers.MaxPooling1D(pool_size=4),
    layers.LSTM(20, return_sequences=True, activity_regularizer=_l2()),
    layers.LSTM(20, activity_regularizer=_l2()),
    layers.Dropout(0.2),
    layers.Dense(1024, activation='relu', activity_regularizer=_l2()),
    layers.Dropout(0.3),
    layers.Dense(512, activation='relu', activity_regularizer=_l2()),
    layers.Dense(1, activation="sigmoid", activity_regularizer=_l2()),
]
model = tf.keras.Sequential(stack)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 100 epochs in batches of 256, scoring the held-out split after
# every epoch; `history` keeps the per-epoch metrics used by the plots.
history = model.fit(
    x=X_train_claim,
    y=Y_train,
    batch_size=256,
    epochs=100,
    validation_data=(X_test_claim, Y_test),
)

# Persist the trained network without optimizer state. The path is relative
# to the current working directory.
model.save('saved_model/my_model', include_optimizer=False)

In [None]:
# Reload the network written after training. It was saved with
# include_optimizer=False, so there is no training configuration to restore:
# compile=False skips that lookup (and the warning it produces) and the model
# is compiled explicitly below before it is evaluated.
model = tf.keras.models.load_model(
    '/content/gdrive/MyDrive/fake_news/saved_model/my_model', compile=False
)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# evaluate() returns [loss, accuracy] for the loss/metrics compiled above, so
# index 1 is the accuracy.
acc_train = model.evaluate(X_train_claim, Y_train)
acc_test = model.evaluate(X_test_claim, Y_test)
print('Training Accuracy:', acc_train[1])
print('Test Accuracy:', acc_test[1])

In [None]:
def _plot_history(train_key, val_key, title, ylabel, legend_loc):
  """Draw one figure from `history`: the training curve, then the validation curve."""
  plt.plot(history.history[train_key])
  plt.plot(history.history[val_key])
  plt.title(title)
  plt.ylabel(ylabel)
  plt.xlabel('Epoch')
  # The validation curve is labelled 'Test' because the held-out split was
  # passed to fit() as validation data.
  plt.legend(['Train', 'Test'], loc=legend_loc)
  plt.show()


_plot_history('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy', 'upper left')
_plot_history('loss', 'val_loss', 'Model loss', 'Loss', 'upper right')