<a href="https://www.kaggle.com/code/niramay/obama-quotes-classification?scriptVersionId=112422439" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/obama-quotestruths-lies/politifact-obama.csv


In [2]:
data = pd.read_csv('/kaggle/input/obama-quotestruths-lies/politifact-obama.csv')
data.head()

Unnamed: 0,Score,Date,Quote
0,False,"July 22, 2021","""The cost of an automobile, it's kind of back ..."
1,False,"June 23, 2021","""The Second Amendment, from the day it was pas..."
2,False,"May 3, 2021",For vaccine rates among Americans 65 and older...
3,False,"March 25, 2021",“We’re sending back the vast majority of the f...
4,False,"February 16, 2021","""If we kept (the minimum wage) indexed to infl..."


In [3]:
data['Score'].value_counts()

Half True        211
Mostly True      211
True             148
False            107
Mostly False     101
Pants on Fire     15
Score              1
Name: Score, dtype: int64

In [4]:
data.drop(['Date'], axis=1, inplace=True)
data.head()

Unnamed: 0,Score,Quote
0,False,"""The cost of an automobile, it's kind of back ..."
1,False,"""The Second Amendment, from the day it was pas..."
2,False,For vaccine rates among Americans 65 and older...
3,False,“We’re sending back the vast majority of the f...
4,False,"""If we kept (the minimum wage) indexed to infl..."


In [5]:
data['check'] = data['Score'].replace({"Mostly True": 1, "True":1, "Half True":0, "Mostly False":0,"False":0,"Pants on Fire":0,"Score":0})
data['check'].value_counts()

0    435
1    359
Name: check, dtype: int64

In [6]:
X = data['Quote']
y = data['check']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((555,), (239,), (555,), (239,))

In [7]:
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(X_train))
words_to_index = tokenizer.word_index
#converting text into integer sequences
X_train_seq  = tokenizer.texts_to_sequences(X_train) 
X_test_seq = tokenizer.texts_to_sequences(X_test)

#padding to prepare sequences of same length
X_train_seq  = pad_sequences(X_train_seq, maxlen=100)
X_test_seq = pad_sequences(X_test_seq, maxlen=100)

In [8]:
size_of_vocabulary=len(tokenizer.word_index) + 1
print(size_of_vocabulary)

2285


In [9]:
# METRICS = [
#       Precision(name='precision'),
#       Recall(name='recall'),
#       AUC(name='auc'),
#       AUC(name='prc', curve='PR'), # precision-recall curve  
# ]

In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip


--2022-11-29 09:44:06--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-29 09:44:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-29 09:44:07--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove

In [11]:
f = open('/kaggle/working/glove.6B.50d.txt')
embedding_values = {}
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_values[word] = coef

400000it [00:05, 77055.71it/s]


In [12]:
all_embs = np.stack(embedding_values.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

  if (await self.run_code(code, result,  async_=asy)):


(0.020940498, 0.6441043)

In [13]:
embedding_matrix = np.random.normal(emb_mean, emb_std, (size_of_vocabulary, 50))
for word,i in tqdm(tokenizer.word_index.items()):
    values = embedding_values.get(word)
    if values is not None:
        embedding_matrix[i] = values

100%|██████████| 2284/2284 [00:00<00:00, 417474.63it/s]
