## Import Dependencies

In [None]:
import re
from functools import lru_cache

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Data Processing

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#print the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
twitter_data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding= 'ISO-8859-1')

In [None]:
twitter_data.shape

(1599999, 6)

In [None]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [None]:
# The first read consumed the first tweet as a header row (shape was 1599999 rows),
# so reload the file with explicit column labels.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

In [None]:
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
twitter_data.shape

(1600000, 6)

In [None]:
# Check for missing values

twitter_data.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [None]:
# Check the distribution of target columns

twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [None]:
# Relabel every row whose target is 4 as 1, leaving a binary 0/1 target column.
rows_labelled_four = twitter_data['target'] == 4
twitter_data.loc[rows_labelled_four, 'target'] = 1

In [None]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

### Lemmatization

In [None]:
from nltk.corpus import wordnet

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
lemmatizer= WordNetLemmatizer()

In [None]:
lemmatizer= WordNetLemmatizer()

@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own (no sentence context) with ``nltk.pos_tag``;
    the first letter of the resulting tag is then mapped to a WordNet POS.
    Tags starting with anything other than J, N, V or R fall back to
    ``wordnet.NOUN``.

    Results are memoized: the outcome depends only on ``word``, and the same
    words recur constantly across the corpus, so each distinct word is tagged
    only once instead of once per occurrence.

    Args:
        word: A single token (string).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    # pos_tag returns [(token, tag)]; keep only the tag's first letter.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

# English stopword set, loaded on first use (see _english_stop_words).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def lemmatize_content(content):
    """Clean a raw text and return its lemmatized, stopword-free form.

    Steps: replace every non-letter character with a space, lowercase, split
    on whitespace, drop English stopwords, lemmatize each remaining word with
    the POS returned by ``get_wordnet_pos``, and join the result with single
    spaces.

    Args:
        content: The raw text (string).

    Returns:
        The processed text as a single space-separated string; an empty
        string when no word survives the filtering.
    """
    # Fetch the stopword set once per call rather than re-evaluating
    # stopwords.words('english') (and scanning a list) for every single word.
    stop_words = _english_stop_words()
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens
        if word not in stop_words
    ]
    return ' '.join(lemmas)

In [None]:
twitter_data['lemmatized_text'] = twitter_data['text'].apply(lemmatize_content)

In [None]:
twitter_data.to_csv('lemmatized_twitter_data.csv', index=False)

In [None]:
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text,lemmatized_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive many time ball manage save rest ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


### Load lemmatized data

In [None]:
twitter_data= pd.read_csv('/content/drive/MyDrive/Twitter sentiment analysis/deflaut_lemmatized_twitter_data.csv')

In [None]:
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,text,lemmatized_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive many time ball manage save rest ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [None]:
twitter_data.shape

(1600000, 7)

In [None]:
X = twitter_data['lemmatized_text']
Y = twitter_data['target']

In [None]:
Y.value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [None]:
print(twitter_data.dtypes)

target              int64
ids                 int64
date               object
flag               object
user               object
text               object
lemmatized_text    object
dtype: object


### Vectorize

In [None]:
sent_length = [len(sentence.split()) for sentence in X if len(sentence.split())>30]

AttributeError: 'float' object has no attribute 'split'

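It looks like `lemmatized_text` contains some float values. These are most likely NaN, which is how pandas reads back empty CSV fields (e.g. tweets that were left with no words after stopword removal).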

In [None]:
# Step 1: Replace missing texts with '' and make sure every entry is a string.
# The float values are presumably NaN read back from empty CSV fields; str(NaN)
# would turn each of them into the literal token 'nan', which the vectorizer
# would then learn as a real word. An empty string keeps those rows as
# "no tokens" instead.
twitter_data['lemmatized_text'] = twitter_data['lemmatized_text'].fillna('').astype(str)

In [None]:
X = twitter_data['lemmatized_text']
Y = twitter_data['target']

In [None]:
sent_length = [len(sentence.split()) for sentence in X if len(sentence.split())>20]

In [None]:
len(sent_length)

2197

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
MAX_FEATURES = 75000

In [None]:
# Integer-encode each text: vocabulary capped at MAX_FEATURES entries, and
# every output sequence has a fixed length of 20 token ids.
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=MAX_FEATURES,
    output_sequence_length=20,
)

In [None]:
vectorizer.adapt(X.values)

In [None]:
vectorizer.get_vocabulary()

In [None]:
len(vectorizer.get_vocabulary())

75000

In [None]:
vectorizer('Hello world, life is great')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([349, 204,  95,   1,  46,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [None]:
vectorizer('Hey buddy, life is awesome')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([ 86, 909,  95,   1,  90,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [None]:
vectorized_text = vectorizer(X.values)

In [None]:
vectorized_text.dtype

tf.int64

In [None]:
vectorized_text.shape

TensorShape([1600000, 20])

## Prepare Dataset

In [None]:
# Pipeline order (MCSHBAP): map, cache, shuffle, batch, prefetch.
# Common sources: from_tensor_slices, list_files.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()
# Shuffle once, over the whole dataset, with a fixed order:
# - reshuffle_each_iteration=False: by default the order is redrawn on every
#   pass, so a later take()/skip() split would pick different examples each
#   epoch and validation/test examples would leak into training.
# - buffer_size=len(Y): a buffer smaller than the dataset only mixes nearby
#   rows. The head of the frame is all target 0, so the rows appear to be
#   grouped by label, which would leave positional splits class-imbalanced.
# - seed: makes the split reproducible across kernel restarts.
# Trade-off: batch order is now the same every epoch; shuffle the training
# subset again after splitting if per-epoch reshuffling is wanted.
dataset = dataset.shuffle(buffer_size=len(Y), seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(4)

In [None]:
# Split the batched dataset by batch count: first 80% train, next 10% validation,
# last 10% test.
n_batches = len(dataset)
train_batches = int(n_batches*.8)
holdout_batches = int(n_batches*.1)
train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(holdout_batches)
test = dataset.skip(int(n_batches*.9)).take(holdout_batches)

## Create Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
# Sentiment classifier: embedding -> bidirectional LSTM -> dense head -> sigmoid.
model = Sequential([
    # Embedding table with MAX_FEATURES + 1 rows of 32-dimensional vectors
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional LSTM (50 units per direction) with input and recurrent dropout
    Bidirectional(LSTM(50, activation='tanh', dropout=0.2, recurrent_dropout=0.2)),
    # Fully connected layer followed by light dropout
    Dense(128, activation='relu'),
    Dropout(0.1),
    # Single sigmoid unit: one score in [0, 1] per input
    Dense(1, activation='sigmoid'),
])

In [None]:
model.build(input_shape=(None, None))

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import load_model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint file name template; the epoch number, validation loss and
# validation accuracy are substituted into the placeholders on each save.
filepath = "/kaggle/working/Twitter_epoch_{epoch:02d}_val_loss:{val_loss:.2f}_val_acc:{val_accuracy:.2f}.weights.h5"

# Single callback: write the weights only (not the full model) once per epoch.
callbacks = [
    ModelCheckpoint(
        save_freq="epoch",
        save_weights_only=True,
        filepath=filepath,
    )
]

In [None]:
model.compile(optimizer=Adam(learning_rate=0.001),loss=BinaryCrossentropy(), metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(train, epochs=5, validation_data=val, callbacks=callbacks)

Epoch 1/5
[1m72347/80000[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m5:30[0m 43ms/step - accuracy: 0.7937 - loss: 0.4444

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3552s[0m 44ms/step - accuracy: 0.8271 - loss: 0.3845 - val_accuracy: 0.8192 - val_loss: 0.3976
Epoch 4/5
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3547s[0m 44ms/step - accuracy: 0.8321 - loss: 0.3761 - val_accuracy: 0.8243 - val_loss: 0.3898
Epoch 5/5
[1m 2284/80000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m56:07[0m 43ms/step - accuracy: 0.8639 - loss: 0.3181

In [None]:
Y.shape

(1600000,)

## Make Prediction

In [None]:
from tensorflow.keras.models import load_model

In [None]:
model = load_model('/content/drive/MyDrive/twitter_sentiment_analysis/twitter_sentiment_analysis_epoch4.h5')

In [None]:
# The vectorizer was adapted on lemmatized, stopword-free text, so raw input
# must go through the same lemmatize_content() preprocessing first; otherwise
# stopwords and inflected forms show up as out-of-vocabulary tokens the model
# never saw in that position during training.
input_text = vectorizer(lemmatize_content("6 years later and nothing changes… And he blames the other driver each time."))

In [None]:
input_text = tf.expand_dims(input_text, axis=0)

In [None]:
input_text.shape

TensorShape([1, 20])

In [None]:
res = model.predict(input_text)



In [None]:
res.shape

(1, 1)

In [None]:
res

array([[0.60668284]], dtype=float32)

In [None]:
# Threshold the model output at 0.5: below is negative, otherwise positive.
print('Negative' if res<0.5 else 'Positive')

Positive


## Save the vectorizer

In [None]:

# Wrap the fitted vectorizer in a tiny string-input model so it can be saved
# and reloaded like any other Keras model.
# NOTE(review): this rebinds `model`, which until this cell referred to the
# sentiment classifier; reload the classifier before predicting again.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])

In [None]:
model.save('/content/drive/MyDrive/twitter_sentiment_analysis/vectorizer_model', save_format='tf')



In [None]:
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/twitter_sentiment_analysis/vectorizer_model')



In [None]:
loaded_vectorizer = loaded_model.layers[0]

In [None]:
vectorizer('Hello world, life is pretty awesome')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([349, 204,  95,   1, 151,  90,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [None]:
loaded_vectorizer('Hello world, life is pretty awesome')

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([349, 204,  95,   1, 151,  90,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>