In [None]:
!pip install tensorflow-addons
import pandas as pd
import tensorflow as tf
import numpy as np
from gensim.utils import simple_preprocess
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPooling1D, Dense, Flatten
from keras.initializers import Constant
import keras.layers as L
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, GRU, Input, Flatten, Bidirectional, SimpleRNN, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import accuracy_score, \
                            precision_score, recall_score, \
                            f1_score

 
def cnn_model2():
    """Build and return the uncompiled 1-D CNN used as a binary classifier.

    The stack is: Conv1D (128 filters, kernel width 16, ReLU) -> max pooling
    (pool size 2) -> flatten -> Dense(10, ReLU) -> Dense(1, sigmoid).
    No input shape is declared here, and the caller is responsible for
    compiling the model.
    """
    layer_stack = [
        Conv1D(filters=128, kernel_size=16, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        # Single sigmoid unit: one probability-like score per sample.
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layer_stack)

# Notebook shell commands: clone the Urdu fake-news (FIRE 2021) repository into
# ./UrduFake, then unpack its test and training archives.
# `-o` makes unzip overwrite existing files without prompting, so the cell can
# be re-run.
# NOTE(review): the clone target is relative but the unzip paths are absolute
# under /content — this assumes the working directory is /content; confirm.
!git clone https://github.com/MaazAmjad/Urdu-Fake-news-detection-FIRE2021 UrduFake
!unzip -o "/content/UrduFake/Test Dataset @ FIRE 2021".zip 
!unzip -o "/content/UrduFake/Training Dataset@FIRE2021".zip

# Split name -> directory holding that split's raw .txt files.
datasets = {
    "Train": "/content/Training Dataset@FIRE2021/Train",
    "Validation": "/content/Training Dataset@FIRE2021/Test",
    "Test": "/content/Test Dataset",
}


def build_dataset_frame(directory, is_test=False):
    """Read every file under *directory* into a two-column DataFrame.

    Args:
        directory: Root folder that is walked recursively.
        is_test: When False, each file is tagged with the name of the folder
            that contains it (column ``Label``). When True, each file is
            tagged with its file name with ``.txt`` removed (column
            ``File No``).

    Returns:
        A DataFrame with columns ``['Text', 'Label']`` or
        ``['Text', 'File No']``. It is empty, but still has both columns,
        when no files are found.
    """
    tag_column = 'File No' if is_test else 'Label'
    rows = []
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            path = os.path.join(root, file_name)
            if is_test:
                tag = file_name.replace('.txt', '')
            else:
                # The containing folder's name is the class label.
                tag = os.path.basename(os.path.dirname(path))
            # `with` closes each handle promptly instead of leaking it.
            with open(path, mode='r', encoding="UTF-8") as handle:
                rows.append({'Text': handle.read(), tag_column: tag})
    # Build the frame once: DataFrame.append in a loop is quadratic and was
    # removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['Text', tag_column])


# Write one CSV per split (Train.csv, Validation.csv, Test.csv) in the
# current working directory.
for dataset in datasets:
    df = build_dataset_frame(datasets[dataset], is_test=(dataset == 'Test'))
    df.to_csv(dataset + '.csv', index=False)

# Reload the three splits from the CSV files in the working directory.
dfTrain = pd.read_csv('Train.csv')
dfValid = pd.read_csv('Validation.csv')
dfTest = pd.read_csv('Test.csv')

dataFrames = [dfTrain, dfValid, dfTest]
for dataFrame in dataFrames:
    # Missing text becomes the literal 'Null' and every entry is coerced to str.
    # The result is assigned back to the column: a chained
    # `df[col].replace(..., inplace=True)` does not reliably update the frame
    # under pandas Copy-on-Write. `np.nan` is used because the `np.NaN` alias
    # was removed in NumPy 2.0.
    dataFrame['Text'] = dataFrame['Text'].replace(np.nan, 'Null').apply(str)
    if 'Label' in dataFrame.columns:
        # Binary target: 1 for 'Fake', 0 for anything else (including missing
        # labels, so no separate NaN fill is needed).
        dataFrame['Label'] = np.where(dataFrame['Label'] == 'Fake', 1, 0)

# Learn the TF-IDF vocabulary and weights from the training text only; the
# same fitted vectorizer is then applied to all three splits.
featureVectors = TfidfVectorizer(
    input='content',
    decode_error='ignore',
    analyzer='word',
    binary=False,
    norm='l2',
    sublinear_tf=True,
    max_features=None,
)
featureVectors.fit(dfTrain['Text'])
tfLen = len(featureVectors.vocabulary_)

# Densify each sparse matrix and append a trailing length-1 axis, giving
# arrays of shape (samples, vocabulary size, 1).
XTrainVectorized = featureVectors.transform(dfTrain['Text']).toarray()[..., np.newaxis]
XValidVectorized = featureVectors.transform(dfValid['Text']).toarray()[..., np.newaxis]
XTestVectorized = featureVectors.transform(dfTest['Text']).toarray()[..., np.newaxis]

# Integer targets for the two labelled splits.
labelsTrain = dfTrain['Label'].values
labelsValid = dfValid['Label'].values
model = cnn_model2()

# Stop once validation loss has not improved for 10 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# Keep only the checkpoint with the best validation accuracy. The original
# call also passed `min_delta=1`, which is not a ModelCheckpoint option
# (silently ignored or rejected depending on the Keras version), so it is
# dropped here.
mc = ModelCheckpoint('/content/best_model.h5', monitor='val_acc', mode='max', save_best_only=True)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[
        'acc',
        # threshold=0.5 binarises the single sigmoid output. With the default
        # threshold (None) the metric takes the argmax, which for one output
        # column counts every sample as positive.
        tfa.metrics.F1Score(num_classes=1, threshold=0.5),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)
with tf.device('/device:GPU:0'):
    model.fit(
        XTrainVectorized,
        labelsTrain,
        batch_size=10,
        epochs=50,
        validation_data=(XValidVectorized, labelsValid),
        verbose=2,
        shuffle=True,
        callbacks=[es, mc],
    )
    # Evaluate with the best checkpoint rather than the last-epoch weights.
    predict_model = load_model('/content/best_model.h5')

    # Predictions come back as (n, 1); flatten so they line up with the 1-D
    # label vector and can be assigned to a DataFrame column.
    y_pred = np.asarray(predict_model.predict(XValidVectorized)).round().ravel()
    y = labelsValid
    Accuracy = accuracy_score(y, y_pred)
    Precision = precision_score(y, y_pred, zero_division=1)
    Recall = recall_score(y, y_pred, zero_division=1)
    # f1_score handles the case where precision and recall are both zero,
    # where the hand-written harmonic mean would divide by zero.
    f1Score = f1_score(y, y_pred, zero_division=1)
    # These metrics are for the validation split; the leftover loop variable
    # `dataset` would mislabel them as "Test".
    print("=============================================")
    print(f"Accuracy for Validation Dataset:{Accuracy}")
    print(f"Precision for Validation Dataset:{Precision}")
    print(f"Recall for Validation Dataset:{Recall}")
    print(f"F1 Score for Validation Dataset:{f1Score}")

    # Label the unlabelled test split ('F' when the rounded prediction is 1,
    # otherwise 'R') and write the submission file without the raw text.
    y_pred = np.asarray(predict_model.predict(XTestVectorized)).round().ravel()
    dfTest['Label'] = np.where(y_pred == 1, 'F', 'R')
    del dfTest['Text']
    dfTest.to_csv('Submission.csv', index=False)

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.14.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 35.3 MB/s eta 0:00:01[K     |▋                               | 20 kB 36.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 22.0 MB/s eta 0:00:01[K     |█▏                              | 40 kB 18.5 MB/s eta 0:00:01[K     |█▌                              | 51 kB 18.0 MB/s eta 0:00:01[K     |█▊                              | 61 kB 16.2 MB/s eta 0:00:01[K     |██                              | 71 kB 14.3 MB/s eta 0:00:01[K     |██▍                             | 81 kB 15.6 MB/s eta 0:00:01[K     |██▋                             | 92 kB 14.3 MB/s eta 0:00:01[K     |███                             | 102 kB 14.6 MB/s eta 0:00:01[K     |███▎                            | 112 kB 14.6 MB/s eta 0:00:01[K     |███▌                            | 122 kB 14.6 MB/s eta 0:00:01[K