In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN, Flatten
from tensorflow.keras.regularizers import l2
import re

In [2]:
# Read the article dataset (title, text, label, total) from the raw-data folder and preview it
data = pd.read_csv(filepath_or_buffer='../raw_data/new_df_clean2.csv')
data

Unnamed: 0,title,text,label,total
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1.0,941
1,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0.0,1290
2,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1.0,329
3,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1.0,255
4,Sports Bar Owner Bans NFL Games‚Ä¶Will Show On...,"The owner of the Ringling Bar, located south o...",1.0,270
...,...,...,...,...
56376,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0.0,746
56377,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1.0,612
56378,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0.0,466
56379,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump‚Äôs comba...,0.0,565


In [6]:
# Remove fully duplicated rows.
# Rebinding `data` instead of using `inplace=True` avoids mutating a frame that an
# earlier cell already displayed and keeps the step chainable.
data = data.drop_duplicates()
data.shape

(56076, 4)

In [7]:
# Count the missing values in each column
data.isna().sum()

title    0
text     0
label    0
total    0
dtype: int64

In [8]:
# Regex for the Reuters dateline that prefixes wire articles ('CITY NAME (Reuters) - ').
# The optional city part is one or more upper-case tokens separated by a space or '/'.
# Tokens may contain '.', ',', "'" and '-' so that datelines such as 'ST. LOUIS',
# 'WASHINGTON, D.C.' or 'PORT-AU-PRINCE' are removed whole instead of leaving a
# fragment ('ST. ', 'PORT-AU-') in front of the article text.
_REUTERS_DATELINE = re.compile(
    r"(?:[A-Z][A-Z.,'\-]*(?:[ /][A-Z][A-Z.,'\-]*)* )?\(Reuters\) - "
)


def remove_city_name(text):
    """Remove the '<CITY> (Reuters) - ' dateline from an article body.

    Every occurrence of the dateline is removed, with or without a leading
    upper-case city name. Text that contains no dateline is returned unchanged.

    Args:
        text: The article body as a string.

    Returns:
        The text with all Reuters datelines stripped.

    Raises:
        TypeError: If ``text`` is not a string (e.g. a NaN from pandas).
    """
    return _REUTERS_DATELINE.sub('', text)

In [9]:
# Keep only the article body and its label; 'title' and 'total' are not read by the cells below
data = data.drop(['title', 'total'], axis='columns')

In [13]:
# Strip the Reuters dateline from every article body, then preview the first rows
data['text'] = data['text'].map(remove_city_name)
data.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1.0
1,A dozen politically active pastors came here f...,0.0
2,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1.0
3,All we can say on this one is it s about time ...,1.0
4,"The owner of the Ringling Bar, located south o...",1.0


In [14]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Built once when this cell runs instead of inside `cleaning`: the original re-read the
# stopword corpus on every call and constructed a new lemmatizer for every single word,
# which is repeated, loop-invariant work when the function is applied to each row.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleaning(sentence):
    """Normalise one raw text into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. delete every digit character;
      3. delete every character in ``string.punctuation``;
      4. tokenize with NLTK's ``word_tokenize``;
      5. drop English stopwords;
      6. lemmatize each remaining token as a verb (``pos="v"``).

    Args:
        sentence: The raw text as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Basic cleaning
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Punctuation is deleted (not replaced by a space), exactly as before.
    # NOTE(review): this turns "don't" into "dont", which is not in the stopword
    # list — confirm that is the intended behaviour.
    sentence = sentence.translate(_PUNCTUATION_TABLE)

    tokens = word_tokenize(sentence)

    return ' '.join(
        _LEMMATIZER.lemmatize(token, pos="v")
        for token in tokens
        if token not in _STOP_WORDS
    )

In [15]:
# Run the cleaning function on each article and store the result next to the original text
data['text_clean'] = data['text'].map(cleaning)
data

Unnamed: 0,text,label,text_clean
0,No comment is expected from Barack Obama Membe...,1.0,comment expect barack obama members fyf fukyof...
1,A dozen politically active pastors came here f...,0.0,dozen politically active pastors come private ...
2,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1.0,rs sarmat missile dub satan replace ss fly mil...
3,All we can say on this one is it s about time ...,1.0,say one time someone sue southern poverty law ...
4,"The owner of the Ringling Bar, located south o...",1.0,owner ringling bar locate south white sulphur ...
...,...,...,...
56374,An email released by WikiLeaks on Sunday appea...,1.0,email release wikileaks sunday appear show for...
56376,Hackers believed to be working for the Russian...,0.0,hackers believe work russian government break ...
56377,"You know, because in fantasyland Republicans n...",1.0,know fantasyland republicans never question ci...
56378,Migrants Refuse To Leave Train At Refugee Camp...,0.0,migrants refuse leave train refugee camp hunga...


In [16]:
# Features are the cleaned article text; the target is the label column
X = data['text_clean']
y = data['label']

In [17]:
# Hold out 30% of the rows first, leaving 70% for training
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Cut the hold-out in half: one half for validation, the other for the final test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [18]:
# Build the vocabulary from the training split only, so no validation/test text
# influences the word indices; the tokenizer is capped with num_words=10000
tokenizer = Tokenizer(
    num_words=10000,
)
tokenizer.fit_on_texts(texts=X_train)

# Mapping of word -> integer index learned from the training texts
word_index = tokenizer.word_index

In [19]:
# Convert text data to sequences of integers
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to one fixed length.
# The model cell reads `avg_length_text`, `X_train_pad` and `X_val_pad`, but no visible
# cell defines them, so a fresh "Restart & Run All" would stop with a NameError.
# NOTE(review): the length is taken to be the mean training-sequence length (from the
# variable name) and the padding side to be 'post' (from the model cell's comment) —
# confirm both against the padding cell that used to exist.
avg_length_text = max(
    1,
    round(sum(len(seq) for seq in X_train_sequences) / max(1, len(X_train_sequences))),
)

X_train_pad = pad_sequences(X_train_sequences, maxlen=avg_length_text, padding='post')
X_val_pad = pad_sequences(X_val_sequences, maxlen=avg_length_text, padding='post')
X_test_pad = pad_sequences(X_test_sequences, maxlen=avg_length_text, padding='post')

In [None]:
%%time
from keras.callbacks import EarlyStopping

model_rnn = Sequential([
    Embedding(input_dim = len(word_index), output_dim = 8, 
             input_length=avg_length_text), ## recall that we set the post-padding length to be this value 
    SimpleRNN(16), ## THIS IS THE RECURRENT LAYER 
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dropout(0.2), 
    Dense(1, activation= 'sigmoid') ## final layer for prediction, hence only one node
])

## compile -- add optim, add loss, add metrics 
model_rnn.compile(optimizer = 'adam', loss = ['binary_crossentropy'], metrics = ['accuracy'])

early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

history_rnn = model_rnn.fit(X_train_pad, y_train, epochs=1000, batch_size=32, validation_data=(X_val_pad, y_val), callbacks=[early_stopping])