## **0. Download dataset**
**Note:** If `gdown` fails because the file has reached its download limit, download the dataset manually, upload it to your Google Drive, and then copy it from Drive into Colab:
```python
from google.colab import drive

drive.mount('/content/drive')
!cp /path/to/dataset/on/your/drive .
```

In [None]:
# https://drive.google.com/file/d/1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_/view?usp=drive_link
!gdown --id 1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_

In [None]:
# Extract the downloaded archive into the current working directory.
!unzip twitter_sentiment_analysis_3cls_dataset.zip

## **1. Import libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## **2. Read dataset**

In [None]:
# Load the tweets CSV into a DataFrame; the trailing expression renders it
# as the cell output.
dataset_path = 'Twitter_Data.csv'
df = pd.read_csv(dataset_path)
df

In [None]:
# Print column dtypes and non-null counts for the freshly loaded data.
df.info()

In [None]:
# Show summary statistics of the DataFrame.
df.describe()

## **3. Drop missing values**

In [None]:
# Boolean mask of rows with at least one missing field, followed by a
# display of exactly those rows.
null_rows = df.isna().any(axis='columns')
df[null_rows]

In [None]:
# Discard every row that contains a missing value.
df = df.dropna()

In [None]:
# Re-check the non-null counts after dropping incomplete rows.
df.info()

## **4. Preprocessing data**



In [None]:
# Lazily-built NLTK resources shared by every text_normalize call. Loading the
# stop-word list and constructing the stemmer once avoids repeating that work
# for each row when the function is applied over a whole column.
_TEXT_NORMALIZE_RESOURCES = {}


def text_normalize(text):
    """Normalize a piece of text before vectorization.

    Steps, in order: lowercase, strip a leading retweet marker ("rt"),
    remove URLs, remove punctuation, drop English stop words, and stem
    each remaining word with the Snowball stemmer.

    Args:
        text: The raw text string.

    Returns:
        The cleaned words joined by single spaces.
    """
    if not _TEXT_NORMALIZE_RESOURCES:
        _TEXT_NORMALIZE_RESOURCES['stop_words'] = frozenset(
            stopwords.words('english')
        )
        _TEXT_NORMALIZE_RESOURCES['stemmer'] = SnowballStemmer('english')
    stop_words = _TEXT_NORMALIZE_RESOURCES['stop_words']
    stemmer = _TEXT_NORMALIZE_RESOURCES['stemmer']

    text = text.lower()
    text = re.sub(r'^rt[\s]+', '', text)
    # Match only the URL token itself (`\S+`); a greedy `.*` would also
    # delete every word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    words = [
        stemmer.stem(word)
        for word in text.split()
        if word not in stop_words
    ]

    return ' '.join(words)

In [None]:
# Clean every tweet by running it through text_normalize.
df['clean_text'] = df['clean_text'].apply(text_normalize)

In [None]:
# Display the DataFrame to inspect the normalized text.
df

## **5. Vectorize text and add bias term**

In [None]:
# Turn each tweet into a TF-IDF feature vector with a vocabulary capped at
# 2000 terms, then convert the sparse result to a dense NumPy array so it can
# be concatenated and multiplied with plain NumPy code.
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['clean_text']).toarray()

In [None]:
# Prepend a column of ones to the feature matrix so the first row of the
# weight matrix plays the role of the bias.
intercept = np.ones((X.shape[0], 1))
X_b = np.hstack((intercept, X))

## **6. One-hot encoding label**

In [None]:
n_classes = df['category'].nunique()
n_samples = df['category'].size

# Shift the labels by one and cast to an unsigned integer type so they can be
# used as column indices (presumably the raw categories are -1/0/1, making the
# shifted range 0..2 -- verify against the dataset).
y = df['category'].to_numpy() + 1
y = y.astype(np.uint8)

# One row per sample, with a single 1 in the column of that sample's class.
y_encoded = np.zeros((n_samples, n_classes))
y_encoded[np.arange(n_samples), y] = 1

## **7. Create train, val, test set**

In [None]:
val_size = 0.2
test_size = 0.125
random_state = 2
is_shuffle = True

# Both splits use the same seed and shuffle setting.
split_options = {'random_state': random_state, 'shuffle': is_shuffle}

# First hold out `val_size` (20%) of all samples for validation ...
X_train, X_val, y_train, y_val = train_test_split(
    X_b, y_encoded, test_size=val_size, **split_options
)

# ... then carve `test_size` (12.5%) of the remaining 80% out as the test
# set, i.e. 10% of the full data, leaving 70% for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, **split_options
)

In [None]:
# Report how many samples ended up in each split.
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of val samples: {X_val.shape[0]}')
print(f'Number of test samples: {X_test.shape[0]}')

## **7. Define essential functions**

### **7.1. Softmax function**

In [None]:
def softmax(z):
    """Apply a row-wise softmax to a 2-D array of scores.

    Args:
        z: Array of shape (n_samples, n_classes) holding the raw scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    y_hat = exp_z / exp_z.sum(axis=1, keepdims=True)

    return y_hat

### **7.2. Cross-entropy loss function**

In [None]:
def compute_loss(y_hat, y):
    """Return the cross-entropy loss between predictions and one-hot targets.

    The element-wise product `y * log(y_hat)` is summed into a single
    scalar. The sum is divided by `y.size` (the total number of entries in
    `y`), the same normalization used by `compute_gradient`.

    Args:
        y_hat: Predicted probabilities, same shape as `y`.
        y: One-hot encoded targets.

    Returns:
        The loss as a scalar.
    """
    # Clip away exact zeros so log() never produces -inf / NaN.
    eps = 1e-12
    log_probs = np.log(np.clip(y_hat, eps, 1.0))
    loss = -np.sum(y * log_probs) / y.size

    return loss

### **7.3. Hypothesis function**

In [None]:
def predict(X, theta):
    """Compute class probabilities for the given samples.

    Args:
        X: Feature matrix.
        theta: Weight matrix.

    Returns:
        The softmax of the linear scores `X . theta`.
    """
    return softmax(np.dot(X, theta))

### **7.4. Gradient function**

In [None]:
def compute_gradient(X, y, y_hat):
    """Compute the gradient of the loss with respect to the weights.

    Args:
        X: Feature matrix.
        y: One-hot encoded targets.
        y_hat: Predicted probabilities, same shape as `y`.

    Returns:
        `X.T . (y_hat - y)` divided by `y.size` (the total number of
        entries in `y`).
    """
    residual = y_hat - y
    return X.T.dot(residual) / y.size

### **7.5. Update weights function**

In [None]:
def update_theta(theta, gradient, lr):
    """Take one gradient-descent step.

    Args:
        theta: Current weights.
        gradient: Gradient of the loss with respect to `theta`.
        lr: Learning rate scaling the step.

    Returns:
        The updated weights `theta - lr * gradient`.
    """
    step = lr * gradient
    return theta - step

### **7.6. Accuracy function**

In [None]:
def compute_accuracy(X, y, theta):
    """Return the fraction of samples whose predicted class is correct.

    Args:
        X: Feature matrix.
        y: One-hot encoded targets.
        theta: Weight matrix.

    Returns:
        Mean agreement between the arg-max of the predictions and the
        arg-max of `y`, taken along axis 1.
    """
    predicted_labels = np.argmax(predict(X, theta), axis=1)
    true_labels = np.argmax(y, axis=1)

    return (predicted_labels == true_labels).mean()

## **8. Training**

In [None]:
lr = 0.1
epochs = 200
# Using the whole training set as one batch makes every epoch a single
# full-batch gradient step.
batch_size, n_features = X_train.shape

# Seed NumPy so the uniform weight initialization is reproducible.
np.random.seed(random_state)
theta = np.random.uniform(size=(n_features, n_classes))

In [None]:
# Per-epoch history of the metrics, averaged over the batches of each epoch.
train_accs = []
train_losses = []
val_accs = []
val_losses = []

for epoch in range(epochs):
    train_batch_losses = []
    train_batch_accs = []
    val_batch_losses = []
    val_batch_accs = []

    for i in range(0, X_train.shape[0], batch_size):

        X_i = X_train[i: i + batch_size]
        y_i = y_train[i: i + batch_size]

        # Forward pass, loss, and one gradient step on the current batch.
        y_hat = predict(X_i, theta)
        train_loss = compute_loss(y_hat, y_i)
        dtheta = compute_gradient(X_i, y_i, y_hat)
        theta = update_theta(theta, dtheta, lr)
        train_batch_losses.append(train_loss)
        # compute_accuracy takes (X, y, theta): the labels come before the
        # weights.
        train_acc = compute_accuracy(X_train, y_train, theta)
        train_batch_accs.append(train_acc)

        # Evaluate the updated weights on the validation split.
        y_val_hat = predict(X_val, theta)
        val_loss = compute_loss(y_val_hat, y_val)
        val_batch_losses.append(val_loss)
        val_acc = compute_accuracy(X_val, y_val, theta)
        val_batch_accs.append(val_acc)

    train_batch_loss = sum(train_batch_losses) / len(train_batch_losses)
    val_batch_loss = sum(val_batch_losses) / len(val_batch_losses)
    train_batch_acc = sum(train_batch_accs) / len(train_batch_accs)
    val_batch_acc = sum(val_batch_accs) / len(val_batch_accs)

    train_losses.append(train_batch_loss)
    train_accs.append(train_batch_acc)
    val_losses.append(val_batch_loss)
    val_accs.append(val_batch_acc)

    print(f'\nEPOCH {epoch + 1}:\tTraining Loss: {train_batch_loss:.3f}\tValidation Loss: {val_batch_loss:.3f}')

In [None]:
# 2x2 grid: losses on the top row, accuracies on the bottom row;
# training curves on the left, validation curves on the right.
fig, ax = plt.subplots(2, 2, figsize=(12, 10))

panels = [
    (ax[0, 0], train_losses, 'green', 'Loss', 'Training Loss'),
    (ax[0, 1], val_losses, 'orange', 'Loss', 'Validation Loss'),
    (ax[1, 0], train_accs, 'green', 'Accuracy', 'Training Accuracy'),
    (ax[1, 1], val_accs, 'orange', 'Accuracy', 'Validation Accuracy'),
]

for axis, history, color, ylabel, title in panels:
    axis.plot(history, color=color)
    axis.set(xlabel='Epoch', ylabel=ylabel)
    axis.set_title(title)

plt.show()

## **9. Evaluation**

In [None]:
# Validation set: accuracy of the trained weights on the validation split.
val_set_acc = compute_accuracy(X_val, y_val, theta)
print('Evaluation on validation set:')
print(f'Accuracy: {val_set_acc}')

In [None]:
# Test set: accuracy of the trained weights on the test split.
test_set_acc = compute_accuracy(X_test, y_test, theta)
print('Evaluation on test set:')
print(f'Accuracy: {test_set_acc}')