### NLP Model Demonstration (Sentiment Classification)
This model performs sentiment classification on tweets, outputting either a positive or negative sentiment. 
Uses the public twitter_samples dataset from NLTK.

Imports

In [60]:
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import twitter_samples    # NLTK Twitter dataset
import re
import string
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

Downloading the twitter_samples dataset and analyzing its structure

In [61]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\Rohan
[nltk_data]     Parekh\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [62]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [63]:
print(len(pos_tweets))
print(len(neg_tweets))

5000
5000


In [64]:
tweets = pos_tweets + neg_tweets
len(tweets)

10000

In [65]:
tweets

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

Splitting into train and test sets, 90% train and 10% test

In [66]:
trainPos = pos_tweets[:4500]
trainNeg = neg_tweets[:4500]
testPos = pos_tweets[4500:]
testNeg = neg_tweets[4500:]

In [67]:
x_train = trainPos + trainNeg
x_test = testPos + testNeg

# Creating the label sets using the size of the Pos and Neg lists
y_train = np.append(np.ones((len(trainPos),1)), np.zeros((len(trainNeg),1)), axis = 0)
y_test = np.append(np.ones((len(testPos),1)), np.zeros((len(testNeg),1)), axis = 0)

print(len(x_test))

1000


Defining methods to clean tweets of punctuation, URLs, etc. using RegEx and string libraries

In [68]:
def clean(text):
    url = re.compile(r"https?://\S+|www.\.\S+")
    out = url.sub(r"", text)
    translator = str.maketrans("", "", string.punctuation)
    out = text.translate(translator)
    return out

def clean_list(s):
    out = []
    for item in s:
        out.append(clean(item))
    return out

In [69]:
tweets[0:10]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [70]:
x_train = clean_list(x_train)
x_test = clean_list(x_test)
cleaned_tweets = clean_list(tweets)
cleaned_tweets[0:10]

['FollowFriday FranceInte PKuchly57 MilipolParis for being top engaged members in my community this week ',
 'Lamb2ja Hey James How odd  Please call our Contact Centre on 02392441234 and we will be able to assist you  Many thanks',
 'DespiteOfficial we had a listen last night  As You Bleed is an amazing track When are you in Scotland',
 '97sides CONGRATS ',
 'yeaaaah yippppy  my accnt verified rqst has succeed got a blue tick mark on my fb profile  in 15 days',
 'BhaktisBanter PallaviRuhail This one is irresistible \nFlipkartFashionFriday httptcoEbZ0L2VENM',
 'We dont like to keep our lovely customers waiting for long We hope you enjoy Happy Friday  LWWF  httpstcosmyYriipxI',
 'Impatientraider On second thought there’s just not enough time for a DD  But new shorts entering system Sheep must be buying',
 'Jgh  but we have to go to Bayan D bye',
 'As an act of mischievousness am calling the ETL layer of our inhouse warehousing app Katamari\n\nWell… as the name implies p']

Tokenizing unique words to create a vocabulary of all words and their frequencies

In [71]:
from collections import Counter
from nltk.tokenize import word_tokenize
def count_words(tweet_list):
    count = Counter()
    for tweet in tweet_list:
        for word in word_tokenize(tweet):
            count[word.lower()] += 1
    return count

word_dict = count_words(cleaned_tweets)

In [72]:
vocab_size = len(word_dict)
vocab_size

21495

In [102]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_tweets)

In [103]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 20

# Padding the train and test sets so all the inputs are equal length 
train_pad = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen = max_length, padding = "post")
test_pad = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen = max_length, padding = "post")

In [104]:
#Listing the top 5 most common words
word_dict.most_common(5)

[('i', 3275), ('you', 2191), ('to', 2186), ('the', 1999), ('a', 1583)]

Defining the sequential model using TensorFlow Keras layers

In [105]:
model = models.Sequential([
    layers.Embedding(vocab_size, 32, input_length=max_length),
    layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5, kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.7),
    layers.Dense(1, activation='sigmoid')
])
model.summary()

In [106]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

Training the model

In [85]:
model.fit(train_pad, y_train, batch_size = 8, epochs=10, verbose=1)

Epoch 1/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5115 - loss: 0.7774
Epoch 2/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7827 - loss: 0.5266
Epoch 3/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8618 - loss: 0.4015
Epoch 4/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9089 - loss: 0.3093
Epoch 5/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9344 - loss: 0.2406
Epoch 6/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9440 - loss: 0.2100
Epoch 7/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9524 - loss: 0.1968
Epoch 8/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9628 - loss: 0.1613
Epoch 9/10
[1m1125/1125

<keras.src.callbacks.history.History at 0x20c3a9f6490>

Running model on test set

In [98]:
test_loss, test_accuracy = model.evaluate(test_pad, y_test, verbose=1)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7188 - loss: 0.7984 
Test Loss: 0.6385711431503296
Test Accuracy: 0.7570000290870667


Defining a method to run the model on manually inputted tweets

In [110]:
def predict_tweet(tweet):
    cleaned = clean(tweet)
    tokenized_string = tokenizer.texts_to_sequences([cleaned])
    tokened = pad_sequences(tokenized_string, maxlen = max_length, padding = "post")
    pred = model.predict(tokened)
    out = 1 if pred[0][0] > 0.5 else 0
    print(f'"{tweet}"')
    if out == 1:
        print("The model predicts: Positive sentiment")  
    else:
        print("The model predicts: Negative sentiment")

In [111]:
tweet = "I hate this movie"
predict_tweet(tweet)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
"I hate this movie"
The model predicts: Positive sentiment


In [112]:
tweet = "Keep it up man! Good job!"
predict_tweet(tweet)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
"Keep it up man! Good job!"
The model predicts: Positive sentiment


Feel free to reach out to rohan11parekh@gmail.com for any questions.