<a href="https://colab.research.google.com/github/mhmdahmd2422/RNN-LTSM-Twitter_Sentiment/blob/main/Twitter_Sentiment(RNN~LTSM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so files stored there are readable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/gdrive/"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


# Importing libraries

In [None]:
# matplotlib
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# sklearn
from sklearn.model_selection import train_test_split

#tensorflow
import tensorflow as tf

# Keras
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint

# Utility
import pandas as pd
import numpy as np
import re
import string
import pickle

# Loading Dataset

In [None]:
# Load the raw CSV into a pandas DataFrame. Column names are supplied through
# `names=`, so the first line of the file is read as data, not as a header.
# The path is absolute and matches the "/content/gdrive/" mount point, so the
# read no longer depends on the kernel's current working directory.
data = pd.read_csv(
    '/content/gdrive/MyDrive/twitterSentiment/training.csv',
    encoding='latin',
    names=['target', 'id', 'date', 'query_flag', 'user', 'tweet_text'],
)

In [None]:
# Optional subsampling, currently disabled: shuffle all rows, then keep the
# first 200000 of them.
# data = data.sample(frac=1)
# data = data[:200000]

In [None]:
# (rows, columns) of the freshly loaded frame.
dataset_shape = data.shape
print("Dataset shape:", dataset_shape)

Dataset shape: (1600000, 6)


# Data Visualization

In [None]:
# Distinct label values present in the raw data.
data.target.unique()

array([0, 4])

In [None]:
# Recode the label 4 as 1 so the two classes are 0 and 1.
data['target'] = data['target'].replace({4: 1})

In [None]:
# Split the label column by class to compare how many rows each class has.
is_positive = data.target == 1
is_negative = data.target == 0
positives = data['target'][is_positive]
negatives = data['target'][is_negative]

count_report = (
    ('Total length of the data is:         {}', data.shape[0]),
    ('No. of positve tagged sentences is:  {}', len(positives)),
    ('No. of negative tagged sentences is: {}', len(negatives)),
)
for template, count in count_report:
    print(template.format(count))

Total length of the data is:         1600000
No. of positve tagged sentences is:  800000
No. of negative tagged sentences is: 800000


In [None]:
# Keep only the label and the tweet text; drop the four metadata columns.
data.drop(columns=['id', 'date', 'query_flag', 'user'], inplace=True)

In [None]:
# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,target,tweet_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
5,0,@Kwesidei not the whole crew
6,0,Need a hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,@Tatiana_K nope they didn't have it
9,0,@twittera que me muera ?


In [None]:
# Percentage of missing values in each column.
null_counts = data.isnull().sum()
(null_counts / len(data)) * 100

target        0.0
tweet_text    0.0
dtype: float64

In [None]:
# Cast the tweet column to str before any text processing.
data['tweet_text'] = data['tweet_text'].astype(str)

# Data Processing

**NLTK (Natural Language Toolkit) is a Python library used for natural language processing.**

In [None]:
# Fetch the NLTK resources, in the same order as before.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# English stopwords held in a set for fast membership tests.
stopword = set(stopwords.words('english'))
print(stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'ain', 'such', "don't", 'while', 'just', "shouldn't", 'other', 'y', 'they', 'so', 'won', 'doesn', 'didn', 're', 'before', 'each', 'too', 'yourselves', 'below', 'did', 'again', 'same', 'where', 'above', 'my', 'your', "didn't", 'very', 'having', 'those', 'up', 'few', 'because', 'can', "you'll", 'does', 'by', 'been', 'at', 'itself', 'himself', 'to', "hadn't", "you've", 'hadn', 'was', "needn't", 've', "shan't", 'yours', 'during', 'had', 'haven', 'wouldn', 'you', 'ma', 'hasn', "should've", 'hers', "she's", 'this', 'of', 'should', 'don', "mustn't", 'under', 'once', 'why', 't', 'i', 'about', 'own', 'all', "aren't", 'whom', 'will', 'shouldn', 'isn', 'than', 'who', "wouldn't", 'is', 'do', 'being', 'from', 'weren', 'nor', 'a', "wasn't", "that'll", 'and', 'her', 'how', 'his', 'most', 'he', "haven't", 'me', 'theirs', 'yourself', 'through', 'until', "doesn't", 'or', 'for', 'she', 'but', "weren't", 'are', 'd', 'couldn', 'the', 's', 'were', 'down', 'wasn', 'these', 'myself', 'after', 'be', "isn't", 

[nltk_data] Downloading package wordnet to /root/nltk_data...


The Preprocessing steps taken are:

* Lower Casing: Each text is converted to lowercase.

* Removing URLs: Links starting with "http" or "https" or "www" are replaced by "".

* Removing Usernames: @usernames are replaced by "" (e.g. "@XYZ" becomes "").

* Removing Short Words: Words with length less than 2 are removed.

* Removing Stopwords: Stopwords are English words that do not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. (e.g. "the", "he", "have")

* Lemmatizing: Lemmatization is the process of converting a word to its base form. (e.g: “wolves” to “wolf”)

[Tokenization is the process by which a large quantity of text is divided into smaller parts called tokens. These tokens are very useful for finding patterns and are considered as a base step for stemming and lemmatization.]

In [None]:
# Matches http:// and https:// links anywhere, and www. links at the start of
# the tweet or after whitespace. The previous pattern required a literal space
# before "www.", so a tweet that began with a www link kept it.
urlPattern = r"(?:https?://|(?<!\S)www\.)[^ ]*"
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
userPattern = r'@[^\s]+'
# Translation table that deletes every character in string.punctuation.
_punctuation_table = str.maketrans("", "", string.punctuation)
# Lemmatizer instance, created on first use and then reused for every tweet.
_lemmatizer = None


def process_tweets(tweet):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, remove URLs, remove @usernames, delete
    punctuation, tokenize, drop stopwords, drop tokens of length 1, lemmatize.

    Note: punctuation is deleted before the stopword filter runs, so a
    contraction such as "can't" becomes "cant" and is not matched by the
    apostrophe forms in the stopword set.

    Args:
        tweet (str): raw tweet text.

    Returns:
        str: the cleaned tokens joined by single spaces ("" if none remain).
    """
    global _lemmatizer
    if _lemmatizer is None:
        # Built once instead of once per tweet.
        _lemmatizer = WordNetLemmatizer()

    # Lower casing
    tweet = tweet.lower()
    # Removing all URLs
    tweet = re.sub(urlPattern, '', tweet)
    # Removing all @usernames
    tweet = re.sub(userPattern, '', tweet)
    # Removing punctuation
    tweet = tweet.translate(_punctuation_table)
    # Tokenizing words
    tokens = word_tokenize(tweet)
    # Drop stopwords and one-character tokens, then lemmatize what is left
    return ' '.join(
        _lemmatizer.lemmatize(w)
        for w in tokens
        if w not in stopword and len(w) > 1
    )

In [None]:
# Clean every tweet and store the result in a new column.
data['clean_tweets'] = data['tweet_text'].apply(process_tweets)
print('Text Preprocessing complete.')

Text Preprocessing complete.


In [None]:
# Compare the raw and cleaned text for the first ten rows.
data.head(n=10)

Unnamed: 0,target,tweet_text,clean_tweets
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats bummer shoulda got david carr third...
1,0,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,@Kenichan I dived many times for the ball. Man...,dived many time ball managed save 50 rest go b...
3,0,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cant see
5,0,@Kwesidei not the whole crew,whole crew
6,0,Need a hug,need hug
7,0,@LOLTrish hey long time no see! Yes.. Rains a...,hey long time see yes rain bit bit lol im fine...
8,0,@Tatiana_K nope they didn't have it,nope didnt
9,0,@twittera que me muera ?,que muera


In [None]:
# Drop the raw text column; the cleaned text lives in clean_tweets.
data.drop(columns=['tweet_text'], inplace=True)

The tokenizer object can be used to convert any word into a key (a number) in a dictionary.

**tokenizer** creates a token for every word in the data corpus and maps each one to an index using a dictionary.

**word_index** contains the index for each word

**vocab_size** represents the total number of words in the data corpus

Since we are going to build a sequence model, we should feed it a sequence of numbers. We should also ensure there is no variance in the input shapes of the sequences: they should all be of the same length. But the texts in tweets contain different numbers of words. To handle this, we use pad_sequences, which makes every sequence one constant length (`max_len` in the code below).

***!!! Better than masking to handle the variable sequence lengths !!!***

In [None]:
max_words = 5000  # passed to Tokenizer(num_words=...) and used as the Embedding input size
max_len = 200     # passed to pad_sequences(maxlen=...)

corpus = data['clean_tweets']

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(corpus)

# Convert each cleaned tweet to a list of word indices, then pad to max_len.
sequences = tokenizer.texts_to_sequences(corpus)
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

[[   0    0    0 ...  701 1705    2]
 [   0    0    0 ...   11  175 1049]
 [   0    0    0 ...  360    6 2960]
 ...
 [   0    0    0 ...  124  498 1657]
 [   0    0    0 ...  394 4667   13]
 [   0    0    0 ...    0    0   56]]


# Split train - validation and test

In [None]:
# First split: 80% train / 20% held out. Second split: the held-out part is
# halved into validation and test.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    tweets, data.target.values, test_size=0.2, random_state=101
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=101
)

# None marks a blank line between the groups.
shape_report = (
    ("X_train", X_train),
    ("y_train", y_train),
    None,
    ("X_val", X_val),
    ("y_val", y_val),
    None,
    ("X_test", X_test),
    ("y_test", y_test),
)
for entry in shape_report:
    if entry is None:
        print()
    else:
        label, array = entry
        print(label, array.shape)

X_train (1280000, 200)
y_train (1280000,)

X_val (160000, 200)
y_val (160000,)

X_test (160000, 200)
y_test (160000,)


# Model Building

In [None]:
# Recurrent classifier: embedding -> SimpleRNN -> two small dense layers ->
# one sigmoid unit for the binary 0/1 label.
model = Sequential([
    layers.Embedding(max_words, 128),
    layers.SimpleRNN(64, dropout=0.5),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_steps is left at its default so each epoch validates on all of
# (X_val, y_val). The earlier validation_steps=30 stopped validation after
# 30 batches, so val_loss / val_accuracy came from a small slice of the set.
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Recurrent classifier: embedding -> LSTM -> two small dense layers ->
# one sigmoid unit for the binary 0/1 label.
model2 = Sequential([
    layers.Embedding(max_words, 128),
    layers.LSTM(64, dropout=0.5),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_steps is left at its default so each epoch validates on all of
# (X_val, y_val). The earlier validation_steps=30 stopped validation after
# 30 batches, so val_loss / val_accuracy came from a small slice of the set.
history2 = model2.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3

# Model Evaluation

In [None]:
# Evaluate the SimpleRNN model on the test split.
print('\n\n=========RNN Evaluation On Test Set=========')
results_RNN = model.evaluate(x=X_test, y=y_test)
print('\n\n')

# Evaluate the LSTM model on the same test split.
print('\n\n=========LSTM Evaluation On Test Set=========')
results_LSTM = model2.evaluate(x=X_test, y=y_test)
print('\n\n')

In [None]:
# Side-by-side table: element 0 of each result is printed under "Loss",
# element 1 under "Accuracy".
header_fmt = '{:<30} {:<20} {:<20}'
row_fmt = '{:<30} {:<20.4f} {:<20.4f}'

print(header_fmt.format('', 'Loss', 'Accuracy'))
for model_label, scores in (('RNN', results_RNN), ('LSTM', results_LSTM)):
    print(row_fmt.format(model_label, scores[0], scores[1]))

In [None]:
def compare_metrics(history):
  """Plot train vs. validation accuracy, then train vs. validation loss.

  `history` must expose a `history` mapping with the keys "accuracy",
  "val_accuracy", "loss" and "val_loss". Each pair of curves is drawn and
  shown with plt.show(); print('\\n\\n') is called between the two figures.
  """

  def draw_curves(train_key, train_label, val_key, val_label, y_label):
    # One figure: training curve, validation curve, legend and axis labels.
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.show()

  # Accuracy curves
  draw_curves("accuracy", "Train Acc", "val_accuracy", "Val. Acc", "Acc.")
  print('\n\n')
  # Loss curves
  draw_curves("loss", "Train loss", "val_loss", "Val. loss", "Loss")

In [None]:
# Accuracy and loss curves for the SimpleRNN training run.
print('\n\n=========RNN Metrics Plot=========')
compare_metrics(history=history)
print('\n\n')

In [None]:
# Accuracy and loss curves for the LSTM training run.
print('\n\n=========LSTM Metrics Plot=========')
compare_metrics(history=history2)
print('\n\n')