In [None]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
train = pd.read_csv('/content/drive/MyDrive/cleandata.csv')

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  44898 non-null  int64 
 1   text        44897 non-null  object
 2   label       44898 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [None]:
train.dropna(inplace = True)

In [None]:
X = train["text"].values
y = train["label"].values

In [None]:
import tensorflow as tf

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [None]:
tokenizer.fit_on_texts(X)

In [None]:
word_index = tokenizer.word_index

In [None]:
word_index

{'said': 1,
 'trump': 2,
 'state': 3,
 'presid': 4,
 'would': 5,
 'peopl': 6,
 'year': 7,
 'republican': 8,
 'one': 9,
 'say': 10,
 'elect': 11,
 'also': 12,
 'govern': 13,
 'like': 14,
 'new': 15,
 'hous': 16,
 'time': 17,
 'report': 18,
 'clinton': 19,
 'democrat': 20,
 'obama': 21,
 'nation': 22,
 'call': 23,
 'donald': 24,
 'support': 25,
 'american': 26,
 'countri': 27,
 'unit': 28,
 'right': 29,
 'campaign': 30,
 'go': 31,
 'could': 32,
 'parti': 33,
 'make': 34,
 'told': 35,
 'senat': 36,
 'white': 37,
 'vote': 38,
 'offici': 39,
 'two': 40,
 'last': 41,
 'news': 42,
 'get': 43,
 'use': 44,
 'polit': 45,
 'includ': 46,
 'work': 47,
 'offic': 48,
 'want': 49,
 'group': 50,
 'law': 51,
 'first': 52,
 'even': 53,
 'take': 54,
 'back': 55,
 'secur': 56,
 'day': 57,
 'former': 58,
 'u': 59,
 'week': 60,
 'hillari': 61,
 'mani': 62,
 'show': 63,
 'court': 64,
 'medium': 65,
 'attack': 66,
 'plan': 67,
 'come': 68,
 'made': 69,
 'polic': 70,
 'may': 71,
 'need': 72,
 'think': 73,
 'acc

In [None]:
train_sequences = tokenizer.texts_to_sequences(X)
maxlen = max([len(x) for x in train_sequences])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_type = 'post'
trunc_type = 'post'

In [None]:
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

In [None]:
train_padded

array([[   24,     2,  1456, ...,     0,     0,     0],
       [   16,   231,   150, ...,     0,     0,     0],
       [  157,   791,    58, ...,     0,     0,     0],
       ...,
       [ 3359, 20190,  2601, ...,     0,     0,     0],
       [ 3534,   169,     3, ...,     0,     0,     0],
       [ 2804,   943, 17368, ...,     0,     0,     0]], dtype=int32)

# LSTM MODEL

In [None]:
from keras.layers import Dense,LSTM,Embedding,Dropout

In [None]:
embedding_vector_features=45
vocab_size = len(word_index)

In [None]:
model= tf.keras.Sequential()
model.add(Embedding(vocab_size,embedding_vector_features))
model.add(LSTM(128,activation='relu',return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])





In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 45)          4976235   
                                                                 
 lstm (LSTM)                 (None, None, 128)         89088     
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0

In [None]:
history = model.fit(train_padded,y,validation_split = 0.3,batch_size = 64,epochs = 5)

Epoch 1/50
Epoch 2/50
 65/492 [==>...........................] - ETA: 2:28:45 - loss: 0.5710 - accuracy: 0.7466

In [None]:
import matplotlib.pyplot as plt 

In [None]:
epochs = list(range(5))
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
plt.plot(epochs, acc, label='Trainig', linewidth = 5.0)
plt.plot(epochs, val_acc, label='Validation',  linewidth = 5.0)
plt.xlabel('Number of Epochs', fontsize = 15, fontfamily ='serif', weight = 'bold')
plt.ylabel('Accuracy', fontsize = 15, fontfamily = 'serif', weight = 'bold')
plt.yticks(fontsize = 12, fontfamily = 'serif', weight = 'bold')
plt.xticks(fontsize = 12, fontfamily = 'serif', weight = 'bold')
font = font_manager.FontProperties(family='serif',weight='bold',style='normal', size=10)
plt.legend(prop = font)
plt.savefig('CNN_ModelAcc_TESS.png')
plt.show()


In [None]:
epochs = list(5)
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.plot(epochs, loss, label='Trainig', linewidth = 5.0)
plt.plot(epochs, val_loss, label='Validation',  linewidth = 5.0)
plt.xlabel('Number of Epochs', fontsize = 15, fontfamily ='serif', weight = 'bold')
plt.ylabel('Loss', fontsize = 15, fontfamily = 'serif', weight = 'bold')
plt.yticks(fontsize = 12, fontfamily = 'serif', weight = 'bold')
plt.xticks(fontsize = 12, fontfamily = 'serif', weight = 'bold')
plt.legend(prop = font)
plt.savefig('lossforfakenews.png')
plt.show()