<a href="https://www.kaggle.com/code/niramay/obama-quotes-classification?scriptVersionId=112468887" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation
from keras.layers import Input, TextVectorization
from keras.callbacks import *
from keras.metrics import *
from keras.backend import clear_session
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/obama-quotestruths-lies/politifact-obama.csv


In [2]:
data = pd.read_csv('/kaggle/input/obama-quotestruths-lies/politifact-obama.csv')
data.head()

Unnamed: 0,Score,Date,Quote
0,False,"July 22, 2021","""The cost of an automobile, it's kind of back ..."
1,False,"June 23, 2021","""The Second Amendment, from the day it was pas..."
2,False,"May 3, 2021",For vaccine rates among Americans 65 and older...
3,False,"March 25, 2021",“We’re sending back the vast majority of the f...
4,False,"February 16, 2021","""If we kept (the minimum wage) indexed to infl..."


In [3]:
data['Score'].value_counts()

Half True        211
Mostly True      211
True             148
False            107
Mostly False     101
Pants on Fire     15
Score              1
Name: Score, dtype: int64

In [4]:
data.drop(['Date'], axis=1, inplace=True)
data.head()

Unnamed: 0,Score,Quote
0,False,"""The cost of an automobile, it's kind of back ..."
1,False,"""The Second Amendment, from the day it was pas..."
2,False,For vaccine rates among Americans 65 and older...
3,False,“We’re sending back the vast majority of the f...
4,False,"""If we kept (the minimum wage) indexed to infl..."


In [5]:
data['check'] = data['Score'].replace({"Mostly True": 1, "True":1, "Half True":0, "Mostly False":0,"False":0,"Pants on Fire":0,"Score":0})
data['check'].value_counts()

0    435
1    359
Name: check, dtype: int64

In [6]:
X = data['Quote']
y = data['check']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((555,), (239,), (555,), (239,))

In [7]:
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(X_train))
words_to_index = tokenizer.word_index
#converting text into integer sequences
X_train_seq  = tokenizer.texts_to_sequences(X_train) 
X_test_seq = tokenizer.texts_to_sequences(X_test)

#padding to prepare sequences of same length
X_train_seq  = pad_sequences(X_train_seq, maxlen=100)
X_test_seq = pad_sequences(X_test_seq, maxlen=100)

In [8]:
size_of_vocabulary=len(tokenizer.word_index) + 1
print(size_of_vocabulary)

2285


In [9]:
METRICS = [
      Precision(name='precision'),
      Recall(name='recall'),
      AUC(name='auc'),
      AUC(name='prc', curve='PR'), # precision-recall curve  
]

2022-11-29 17:15:40.370207: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip


--2022-11-29 17:15:41--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-29 17:15:41--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-29 17:15:41--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove

In [11]:
size_of_vocabulary=len(tokenizer.word_index) + 1
size_of_vocabulary

2285

In [12]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map

In [13]:
word_to_vec_map = read_glove_vector('/kaggle/working/glove.6B.50d.txt')


In [14]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=size_of_vocabulary, weights = [emb_matrix], trainable=False)

In [15]:
clear_session()

model=Sequential()

#embedding layer
model.add(embedding_layer) 

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=METRICS) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)  
mc=ModelCheckpoint('best_model', monitor='val_loss', mode='min', save_best_only=True, save_weights_only=False, verbose=1)  

#Print summary of model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2285, 50)          114200    
_________________________________________________________________
lstm (LSTM)                  (None, 2285, 128)         91648     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 214,169
Trainable params: 99,969
Non-trainable params: 114,200
_________________________________________________________________
None
