## Project: Sentiment analysis on 1.6 million tweets

**Context**
This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the
Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and they
can be used to detect sentiment.

**Content**
It contains the following 6 fields:
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: the id of the tweet (2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: the query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)

Dataset link : https://www.kaggle.com/kazanova/sentiment140


In [2]:
import pandas as pd

In [5]:
!pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [6]:
import opendatasets as od

# Download the sentiment140 dataset from Kaggle into ./sentiment140.
# opendatasets asks for a Kaggle username and key interactively.
dataset_url = "https://www.kaggle.com/datasets/kazanova/sentiment140"
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: omharne
Your Kaggle Key: ··········
Downloading sentiment140.zip to ./sentiment140


100%|██████████| 80.9M/80.9M [00:00<00:00, 119MB/s]





In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

In [8]:
# Load the dataset
# The path is relative to the working directory, matching the ./sentiment140
# folder that od.download() writes to, so the cell does not depend on the
# notebook living under /content.
# header=None plus explicit names: the first row of the file is read as data,
# and the six columns are labelled here. The file is decoded as latin-1.
df = pd.read_csv(
    'sentiment140/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)

# Preview the first rows to check that the columns were parsed as expected.
df.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Definition of the parameters used by the tokenizer, the padding step and the model.

# Vocabulary and embedding
vocab_size = 20_000       # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 32        # size of each embedding vector
oov_tok = "<OOV>"         # placeholder token for words outside the vocabulary

# Sequence shaping: every tweet is padded / truncated to max_length tokens,
# with both operations applied at the end of the sequence.
max_length = 100
padding_type = 'post'
trunc_type = 'post'

# Not referenced by the cells shown in this notebook.
training_size = 20_000

In [10]:
# Keep only the label column and the tweet text; the remaining columns
# (id, date, flag, user) are not used by the following cells.
df = df.loc[:, ['target', 'text']]
df

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [11]:
# Split dataset

# Features are the raw tweet texts.
X = df["text"]

# Labels: the positive class is stored as 4 in the file; remap it to 1 so the
# target is 0/1, as expected by the sigmoid output + binary cross-entropy used later.
y = df["target"].replace(4, 1)

# 80 % of the rows go to training, the rest to the test set; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [12]:
# USE Tokenization
# NOTE(review): this rebinds the name `Embedding` that was already imported from
# tensorflow.keras.layers in the import cell; the model cell refers to
# tf.keras.layers.Embedding explicitly, so this import looks unnecessary — confirm.
from keras.layers import Embedding

# Build the word index from the training texts only, so that test-set words
# outside the vocabulary are mapped to the OOV token.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index

In [13]:
# Create the sequences of tokens

# 1) Turn each tweet into a list of integer token ids using the fitted tokenizer.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# 2) Pad / truncate every sequence to exactly max_length ids so the batches
#    have a fixed width.
X_train_padded = pad_sequences(
    X_train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [14]:
# Creation of model
# Architecture: embedding -> two stacked bidirectional LSTMs -> dense head
# with dropout -> single sigmoid unit for the binary label.
model = tf.keras.Sequential()

# Token ids -> dense vectors of size embedding_dim.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# The first recurrent layer returns the full sequence so that it can feed the second one.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))

# Classification head.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [15]:
# Training
# A single pass over the training set; the held-out split is passed as
# validation data so its loss/accuracy are reported at the end of the epoch.
num_epochs = 1
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=32,
    epochs=num_epochs,
    validation_data=(X_test_padded, y_test),
)



In [16]:
# Test
# evaluate() returns the loss followed by the metrics given to compile(),
# i.e. [loss, accuracy] for this model.
evaluation = model.evaluate(X_test_padded, y_test, verbose=2)
test_loss, test_acc = evaluation
print("Test Accuracy: ", test_acc)

10000/10000 - 527s - loss: 0.3828 - accuracy: 0.8264 - 527s/epoch - 53ms/step
Test Accuracy:  0.8263593912124634


In [18]:
# Test with input entry
# Each entry goes through the same pipeline as the training data
# (tokenize -> pad) before being scored by the model.
# Output is the sigmoid score:
#0 : bad
#1 : good
for entry in (
    ['I love Dr.Pepper with my burger ! :D'],
    ['I hate this music so bad ! I just want to sleep rn'],
):
    test_text = tokenizer.texts_to_sequences(entry)
    test_text_padded = pad_sequences(
        test_text,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

    prediction = model.predict(test_text_padded)
    print(prediction)

[[0.9553149]]
[[0.00748623]]


In [20]:
# Test with input entry
# Each entry goes through the same pipeline as the training data
# (tokenize -> pad) before being scored by the model.
# Output is the sigmoid score:
#0 : bad
#1 : good
for entry in (
    ['i do not like tamato'],
    ['I like the KFC burger'],
):
    test_text = tokenizer.texts_to_sequences(entry)
    test_text_padded = pad_sequences(
        test_text,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

    prediction = model.predict(test_text_padded)
    print(prediction)

[[0.09167097]]
[[0.951655]]
