In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from helper_functions import pre_process_tweet
from sklearn.utils import shuffle

In [2]:
from gensim.models import Word2Vec
# Load the saved Word2Vec model from disk. Later cells look tokens up in
# word2vec.wv and read word2vec.vector_size for the embedding width.
word2vec = Word2Vec.load("trained_models/word2vec.model")

In [3]:
# Read the raw CSV. header=None makes pandas treat the first row as data
# rather than column names; the names are assigned in a later cell.
data = pd.read_csv('data.csv',encoding='ISO-8859-1', header=None)

In [4]:
# Name the six columns of the header-less frame. Only 'target' and 'text'
# are read by the cells further down.
data.columns = ['target','ids','date','flag','user','text']

In [5]:
# Keep only the label column ('target') and the tweet text ('text');
# this is the frame the pre-processing cell iterates over.
data_to_used =data[['target','text']]


In [6]:
# Pre-process every tweet with the project helper.
# pre_process_tweet presumably returns a list of tokens (its result is iterated
# word by word when embedding) -- confirm against helper_functions.
# A comprehension replaces the manual append loop and drops the enumerate()
# counter that was never read.
preprocessed_tweet_data = [pre_process_tweet(text) for text in data_to_used['text']]

In [7]:
# Length of the longest pre-processed tweet.
maxlen = max(map(len, preprocessed_tweet_data))

In [8]:
# Convert each word of a tokenized sentence to its word2vec embedding.
def word_to_vec(sentence, model=None):
    """Map every token in ``sentence`` to an embedding vector.

    Args:
        sentence: Iterable of tokens (words).
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``. Defaults to the notebook-level
            ``word2vec`` model, so existing one-argument calls are unchanged.

    Returns:
        A list with one vector per token, in input order. Tokens that are not
        in the vocabulary are replaced by an all-zero vector of length
        ``model.vector_size``.
    """
    if model is None:
        # Resolved at call time so the function follows whatever model the
        # notebook currently has loaded.
        model = word2vec
    keyed_vectors = model.wv
    dim = model.vector_size
    # The placeholder is float32 because gensim stores word vectors as float32
    # (per the gensim docs); a float64 placeholder would upcast every stacked
    # batch to float64 and double its memory footprint.
    return [
        keyed_vectors[word] if word in keyed_vectors else np.zeros(dim, dtype=np.float32)
        for word in sentence
    ]

In [9]:
# Embed every pre-processed tweet: each one becomes a list of per-token vectors.
vectorized_tweets = list(map(word_to_vec, preprocessed_tweet_data))

In [10]:
# Longest tweet in the dataset, counted in embedded tokens. This becomes the
# common length every sequence is padded to; printed for reference.
maxlen = max(map(len, vectorized_tweets))
print(maxlen)

37


In [11]:

# Give every input the same length by zero-padding short sequences and
# clipping long ones.
def pad_sequence(seq, maxlen):
    """Return ``seq`` as an array with exactly ``maxlen`` rows.

    Args:
        seq: Sequence of equally sized vectors (one per token).
        maxlen: Number of rows the result must have.

    Returns:
        ``np.ndarray`` of shape ``(maxlen, dim)``. The first rows are the
        leading ``maxlen`` vectors of ``seq``; any remaining rows are zeros.
    """
    # Clip first: a sequence longer than maxlen would otherwise come back
    # over-long and break any fixed-shape consumer (e.g. a later reshape).
    clipped = list(seq[:maxlen])
    if not clipped:
        # Nothing to infer the vector width from, so fall back to the loaded
        # model; float32 matches the dtype gensim uses for its vectors.
        return np.zeros((maxlen, word2vec.vector_size), dtype=np.float32)
    vectors = np.asarray(clipped)
    # Padding takes the width and dtype of the data so it never upcasts it.
    padding = np.zeros((maxlen - len(clipped),) + vectors.shape[1:], dtype=vectors.dtype)
    return np.concatenate([vectors, padding], axis=0)

In [12]:
# Bring every embedded tweet to the common length `maxlen` so the tweets can be
# stacked into fixed-shape batches.
vectorized_tweets_padded = [
    pad_sequence(token_vectors, maxlen) for token_vectors in vectorized_tweets
]

In [13]:

# Binarize the sentiment label in a single pass over the 'target' column:
# a target of 0 stays 0, every other value becomes 1. The result is a plain
# Python list of ints.
labels = [int(label != 0) for label in data['target'].values]


In [14]:
# Hold out 20% of the padded tweets for validation. random_state pins the
# shuffle so the same split is produced on every run, which keeps validation
# metrics comparable between runs.
vectorized_tweets_train, vectorized_tweets_val, labels_train, labels_val = train_test_split(
    vectorized_tweets_padded,
    labels,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Sanity check: show which container type train_test_split returned for the labels.
type(labels_train)

list

In [16]:
# Endless mini-batch generator used to feed both training and validation.
def data_generator(vectorized_tweets, labels, batch_size=32):
    """Yield shuffled ``(x, y)`` mini-batches forever.

    One pass over the data yields ``ceil(len(vectorized_tweets) / batch_size)``
    batches; the last batch of a pass may be smaller than ``batch_size``. The
    sample order is re-shuffled (via the global NumPy random state) before
    every pass.

    Args:
        vectorized_tweets: Indexable collection of equally shaped samples.
        labels: Indexable collection of labels, aligned with
            ``vectorized_tweets``.
        batch_size: Maximum number of samples per batch.

    Yields:
        Tuple ``(x_array, y_array)`` of NumPy arrays for one batch.

    Raises:
        ValueError: If there are no samples, if the two inputs differ in
            length, or if ``batch_size`` is smaller than 1. Because this is a
            generator, the check runs when the first batch is requested.
    """
    data_size = len(vectorized_tweets)
    # Without these guards an empty dataset or a negative batch_size makes the
    # `while True` loop spin forever without ever yielding.
    if data_size == 0:
        raise ValueError("data_generator needs at least one sample")
    if len(labels) != data_size:
        raise ValueError(
            "vectorized_tweets and labels must have the same length "
            f"(got {data_size} and {len(labels)})"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 (got {batch_size})")

    indices = np.arange(data_size)
    np.random.shuffle(indices)
    while True:
        for start in range(0, data_size, batch_size):
            # Positions of the samples that make up this batch.
            batch_indices = indices[start:start + batch_size]

            # Gather samples and labels, then stack them into arrays for Keras.
            x_array = np.asarray([vectorized_tweets[j] for j in batch_indices])
            y_array = np.asarray([labels[j] for j in batch_indices])

            yield x_array, y_array

        # Re-shuffle indices for the next pass over the data.
        np.random.shuffle(indices)

In [17]:
# Build one endless batch generator per split; both use data_generator's
# default batch size.
train_generator = data_generator(
    vectorized_tweets=vectorized_tweets_train,
    labels=labels_train,
)
val_generator = data_generator(
    vectorized_tweets=vectorized_tweets_val,
    labels=labels_val,
)

In [18]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional

In [19]:
# Width of one word vector. Read from the loaded model instead of a literal so
# the network's input shape always matches the vectors the model produces.
embedding_dim = word2vec.vector_size

In [20]:


# Binary sentiment classifier: one LSTM over the padded embedding sequence,
# dropout for regularisation, and a single sigmoid output unit.
model = Sequential([
    LSTM(64, input_shape=(maxlen, embedding_dim)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Must equal the batch size the generators were created with
# (data_generator's default).
batch_size = 32

# Ceiling division: one epoch / one validation run consumes exactly one full
# pass of the generator, including its smaller final batch. Floor division
# dropped that batch, so every validation run scored a different, drifting
# subset, and a split with fewer than `batch_size` samples produced 0 steps.
steps_per_epoch = (len(vectorized_tweets_train) + batch_size - 1) // batch_size
validation_steps = (len(vectorized_tweets_val) + batch_size - 1) // batch_size

# Fit the model
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
def get_pre_processed_input(tweet, maxlen=37, embedding_dim=100):
    """Turn one raw tweet into a model-ready array.

    Args:
        tweet: Raw tweet text.
        maxlen: Number of time steps the model expects. The default of 37 is
            the longest training tweet printed earlier in this notebook.
        embedding_dim: Width of one word vector. The default of 100 is the
            embedding size the model in this notebook was built with.

    Returns:
        ``np.ndarray`` of shape ``(1, maxlen, embedding_dim)``: a batch holding
        the single padded tweet.
    """
    # Same pre-processing as the training data.
    tokens = pre_process_tweet(tweet)
    # Embed each token, then clip to maxlen so the reshape below cannot fail on
    # a tweet longer than the training maximum, regardless of how pad_sequence
    # treats over-long input.
    token_vectors = word_to_vec(tokens)[:maxlen]
    # Zero-pad short tweets up to the fixed length.
    padded = pad_sequence(token_vectors, maxlen=maxlen)
    # Add the leading batch axis expected by model.predict.
    return padded.reshape(1, maxlen, embedding_dim)


In [40]:
# Example input used to smoke-test the trained model.
new_tweet = "I hate this movie!"

In [41]:
# Pre-process, embed and pad the example tweet with the function's default
# maxlen / embedding_dim.
processed_input = get_pre_processed_input(new_tweet)

In [42]:
# Run the trained model on the single prepared tweet.
prediction = model.predict(processed_input)



In [43]:
# Show the sigmoid score. Labels were built as 0 for target == 0 and 1
# otherwise, so a value near 0 means the model leans towards the target-0 class.
prediction

array([[0.06353077]], dtype=float32)

In [44]:
# Persist the trained model to disk.
# NOTE(review): the .h5 suffix should select Keras' legacy HDF5 format -- confirm
# this is still the desired format for the installed Keras version.
model.save('trained_models/sentiment_model.h5')