In [31]:
!pip install transformers==2.8.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==2.8.0
  Downloading transformers-2.8.0-py3-none-any.whl (563 kB)
[K     |████████████████████████████████| 563 kB 37.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 61.9 MB/s 
Collecting boto3
  Downloading boto3-1.24.67-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 68.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 63.1 MB/s 
Collecting tokenizers==0.5.2
  Downloading tokenizers-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 51.3 MB/s 
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Down

In [1]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 32.3 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [28]:
import pandas as pd
import  glob 
import os 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential 
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten
from keras.utils.np_utils import to_categorical 
from keras.callbacks import EarlyStopping 
from sklearn.model_selection import train_test_split
from keras.layers import Dropout 
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D

In [3]:
#Loading and annotating the data
sent_analyzer = SentimentIntensityAnalyzer()
path = r'/content/drive/MyDrive/inputs'
raw_files = glob.glob(os.path.join(path,  "*.csv"))
dfs_list = []
for file in raw_files: 
    df = pd.read_csv(file)
    dfs_list.append(df)
all_tweets_df = pd.concat(dfs_list, axis=0, ignore_index=True)
#print(all_tweets_df.head(2))
all_tweets_df['compound_score'] = all_tweets_df['clean_tweets'].apply(lambda x: sent_analyzer.polarity_scores(str(x))['compound'])
all_tweets_df['label'] = all_tweets_df['compound_score'].apply(lambda x: "Positive" if x > 0.0 else "Negative" if x < 0.0 else "Neutral")
all_tweets_df = all_tweets_df.sample(frac=1)

In [4]:
#results
all_tweets_df.head(2)

Unnamed: 0,Date,User,Tweet,clean_tweets,compound_score,label
11067,2022-09-04 14:36:41+00:00,たんぽぽ組まなみん🍀,@UmaiyoUmesyu @ORAL_UVER_APPLE めっちゃ並んで食べたけど、これ...,めっちゃ並んで食べたけど、これは並んでも食べるね！って思うくらいには、美味しかった！！,0.0,Neutral
8033,2022-08-30 22:34:31+00:00,Nyakwar Dana,@KeEquityBank @MatiangDr @jumaf3 Check DM. I n...,Check DM. I need assistance ASAP.,0.0,Neutral


In [5]:
#declaring vocabulary parameters.
MAX_WORDS =  5000
max_length = 150 
embed_dim = 150 

In [13]:
#The textual features in the data will be converted to tokens using the tokenizer function in lstms as demonstrated belw:
tok = Tokenizer(num_words = MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True )
all_tweets_df['clean_tweets'] = all_tweets_df['clean_tweets'].astype(str)
tok.fit_on_texts(all_tweets_df['clean_tweets'].values)
word_index = tok.word_index
print("Number of unique tokens: " + str(len(word_index)))

Number of unique tokens: 25964


In [14]:
X = tok.texts_to_sequences(all_tweets_df['clean_tweets'].values)
X = pad_sequences(X, maxlen = max_length )# since tweets have a limit of 140 characters, we use this figure as our max. length as defined before
print("Input tensor shape: ", X.shape)

Input tensor shape:  (16118, 150)


In [17]:
y = pd.get_dummies(all_tweets_df['label']).values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
#initializing neural network
lstm_model = Sequential()
lstm_model.add(Embedding(MAX_WORDS, embed_dim, input_length = X.shape[1]))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(3, activation="softmax"))
lstm_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [22]:
#setting model runtime params and fitting model
epochs = 10
batch_size = 32
lstm_history = lstm_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor="val_loss", patience=5, min_delta=0.0001 )])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [24]:
metrcs = lstm_model.evaluate(X_test, y_test)
lstm_loss, lstm_acc = metrcs[0], metrcs[1]
print("LSTM Test Performance")
print("LSTM Loss: " +str(lstm_loss))
print("LSTM accuracy: " + str(lstm_acc))

LSTM Test Performance
LSTM Loss: 0.6239235401153564
LSTM accuracy: 0.8440860509872437


In [27]:
#Initializing Convolution neural network
cnn = Sequential()
cnn.add(Embedding(MAX_WORDS, embed_dim, input_length=X.shape[1]))
cnn.add(Conv1D(filters=128, kernel_size=4, padding="same", activation="relu"))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Conv1D(filters=64, kernel_size=4, padding="same", activation="relu"))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
cnn.add(Dense(256, activation='relu'))
cnn.add(Dense(3, activation="softmax"))
cnn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [29]:
#fitting the model
cnn_history = cnn.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
cnn_loss, cnn_acc = cnn.evaluate(X_test, y_test)
print("CNN Test Performance")
print("CNN Loss: " + str(cnn_loss))
print("CNN Acc: " + str(cnn_acc))

CNN Test Performance
CNN Loss: 1.1292004585266113
CNN Acc: 0.8457402586936951


In [None]:
#Transformer models: BERT
