In [1]:
import numpy as np 
import pandas as pd

import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers, Model,Input
from tensorflow.keras.layers import *
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,f1_score,classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Both CSV files sit in the same directory of the malinphy/datasets GitHub
# repository, so the common prefix is written once and the file names appended.
_DATA_ROOT = (
    'https://raw.githubusercontent.com/malinphy/datasets/main/'
    'tweet_sentiment_extraction/twitter_sentiment_analysis/'
)
train_path = _DATA_ROOT + 'twitter_training.csv'    # split the models are fitted on
test_path = _DATA_ROOT + 'twitter_validation.csv'   # split the models are scored on

In [3]:

# The CSV files have no header row, so the column names are supplied at read
# time instead of being patched in afterwards with rename().
_COLUMNS = ['tweet_id', 'entity', 'sentiment', 'content']

# Rows with missing values are dropped from BOTH splits: the text vectoriser
# and the label encoder downstream are fed these columns directly, and the
# original code only cleaned the training split.
train_df = pd.read_csv(train_path, header=None, names=_COLUMNS).dropna().reset_index(drop=True)
test_df = pd.read_csv(test_path, header=None, names=_COLUMNS).dropna().reset_index(drop=True)

In [4]:
# Hyper-parameters, unpacked in the order:
#   vocab_size - passed as max_tokens to the text vectoriser and as the
#                input dimension of the embedding layer
#   embed_dim  - output dimension of the embedding layer
#   input_len  - initial sequence length; re-derived from the tokenised
#                training data further down the notebook
vocab_size, embed_dim, input_len = 45000, 32, 170

In [5]:
# Converts raw tweet text into fixed-length sequences of integer token ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,             # upper bound on the learned vocabulary
    output_mode='int',                 # one integer id per token
    # Use the configured constant rather than repeating the literal 170, so
    # the sequence length is defined in exactly one place.
    output_sequence_length=input_len,
)

# The vocabulary is learned from the training text only; the test text is
# encoded with that same vocabulary.
vectorize_layer.adapt(train_df['content'])
train_tokens = vectorize_layer(train_df['content'])
test_tokens = vectorize_layer(test_df['content'])

In [6]:
# Sequence length taken from the tokenised training data as a plain Python
# int. The static `.shape` is used instead of `tf.shape(...)`, which returns a
# tensor; this value is later used to declare the model's input shape, where
# an ordinary integer is what Keras expects.
input_len = int(train_tokens.shape[1])

# Number of entries in the vocabulary the vectoriser actually learned.
corpus_size = len(vectorize_layer.get_vocabulary())

In [7]:
# Map sentiment strings to integer class ids. The mapping is learned from the
# training labels only and then applied unchanged to both splits.
LE = LabelEncoder()
LE.fit(train_df['sentiment'])
train_encoded_labels = LE.transform(train_df['sentiment'])
test_encoded_labels = LE.transform(test_df['sentiment'])

In [8]:
# Feed-forward classifier over embedded token sequences:
# embedding -> flatten -> three ReLU dense layers -> softmax over classes.

# Width of the output layer follows the classes the label encoder saw, rather
# than a hard-coded 4, so the model stays in step with the data.
num_classes = len(LE.classes_)

# int(...) makes the shape a plain integer even if input_len is still the
# scalar tensor produced by tf.shape in an earlier cell.
input_layer = Input(shape=(int(input_len),), name='input_layer')
emb_layer = Embedding(vocab_size, embed_dim, name='embedding_layer')(input_layer)
flat_layer = Flatten(name='Flatten_layer')(emb_layer)
d1_layer = Dense(128, activation='relu', name='d1_layer')(flat_layer)
d2_layer = Dense(64, activation='relu', name='d2_layer')(d1_layer)
d3_layer = Dense(32, activation='relu', name='d3_layer')(d2_layer)
final_layer = Dense(num_classes, activation='softmax', name='final_layer')(d3_layer)
model = Model(inputs=input_layer, outputs=final_layer)

# Labels are integer class ids (not one-hot), hence the sparse variant of
# categorical cross-entropy. Adam is used with its default settings.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
adam = tf.keras.optimizers.Adam()
model.compile(optimizer=adam, loss=loss_fn, metrics=['accuracy'])

# Train on the full tokenised training split for 8 passes over the data.
model.fit(x=train_tokens, y=train_encoded_labels, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7fd6fde83a50>

In [9]:
# Predicted class = index of the largest softmax output in each row.
# argmax returns a flat 1-D NumPy array, which is the shape scikit-learn's
# metric functions work with; top_k(k=1) produced an (n_samples, 1) tensor.
preds = np.argmax(model.predict(test_tokens), axis=1)

In [10]:
# Confusion matrix for the neural network on the test split
# (scikit-learn convention: rows are true classes, columns are predictions).
confusion_matrix(y_true=test_encoded_labels, y_pred=preds)

array([[170,   1,   0,   1],
       [  0, 265,   0,   1],
       [  1,   4, 275,   5],
       [  1,   1,   3, 272]])

In [11]:
# Macro-averaged F1 (unweighted mean over classes), followed by the
# per-class precision / recall / F1 table for the neural network.
macro_f1 = f1_score(y_true=test_encoded_labels, y_pred=preds, average='macro')
print('f1 score', macro_f1)

nn_report = classification_report(
    y_true=test_encoded_labels,
    y_pred=preds,
    labels=[0, 1, 2, 3],
)
print(nn_report)

f1 score 0.9826658478285479
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       172
           1       0.98      1.00      0.99       266
           2       0.99      0.96      0.98       285
           3       0.97      0.98      0.98       277

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000



In [16]:
# Random-forest baseline fitted on the same integer token sequences as the
# neural network. n_jobs=-1 builds the 1000 trees on all available cores;
# with random_state fixed the fitted forest is the same as a serial run.
RF_classifier = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
RF_classifier.fit(train_tokens, train_encoded_labels)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [17]:
# Score the random forest on the test split and show the per-class report.
RF_pred = RF_classifier.predict(test_tokens)
rf_report = classification_report(
    y_true=test_encoded_labels,
    y_pred=RF_pred,
    labels=[0, 1, 2, 3],
)
print(rf_report)

              precision    recall  f1-score   support

           0       0.97      0.88      0.92       172
           1       0.85      0.97      0.90       266
           2       0.91      0.87      0.89       285
           3       0.94      0.90      0.92       277

    accuracy                           0.91      1000
   macro avg       0.91      0.90      0.91      1000
weighted avg       0.91      0.91      0.91      1000



In [14]:
# Linear SVM baseline on the integer token sequences. random_state is fixed
# (matching the random-forest cell) so repeated runs give the same model.
# NOTE(review): the features here are raw vocabulary indices, which have no
# numeric ordering a linear model can exploit; that presumably explains the
# weak report below. Bag-of-words / TF-IDF features would likely be a fairer
# input for this classifier; confirm before drawing conclusions.
SVC_classifier = LinearSVC(random_state=0)
SVC_classifier.fit(train_tokens, train_encoded_labels)



LinearSVC()

In [15]:
# Score the linear SVM on the test split and show the per-class report.
SVC_pred = SVC_classifier.predict(test_tokens)
svc_report = classification_report(
    y_true=test_encoded_labels,
    y_pred=SVC_pred,
    labels=[0, 1, 2, 3],
)
print(svc_report)

              precision    recall  f1-score   support

           0       0.22      0.37      0.27       172
           1       0.25      0.09      0.13       266
           2       0.16      0.04      0.06       285
           3       0.30      0.60      0.40       277

    accuracy                           0.26      1000
   macro avg       0.23      0.27      0.21      1000
weighted avg       0.23      0.26      0.21      1000

