## **0. Download dataset**
**Note:** If `gdown` fails because the file's download quota has been exceeded, download the dataset manually, upload it to your Google Drive, and then copy it from Drive into the Colab workspace:
```python
from google.colab import drive

drive.mount('/content/drive')
!cp /path/to/dataset/on/your/drive .
```

In [None]:
# Download the dataset archive from Google Drive by its file ID.
# Source: https://drive.google.com/file/d/1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_/view?usp=drive_link
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag and
# accept a bare file ID — confirm against the gdown version installed here.
!gdown --id 1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_

In [None]:
# Unpack the dataset archive into the current working directory.
!unzip twitter_sentiment_analysis_3cls_dataset.zip

## **1. Import libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## **2. Read dataset**

In [None]:
# Load the tweet CSV into a DataFrame; the trailing bare `df` renders it as
# the cell output.
dataset_path = 'Twitter_Data.csv'
df = pd.read_csv(dataset_path)
df

In [None]:
# Print column dtypes and non-null counts (a quick way to spot missing values).
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

## **3. Drop missing values**

In [None]:
# Boolean mask that is True for every row containing at least one missing
# value; indexing with it displays exactly those rows.
null_rows = df.isna().any(axis='columns')
df.loc[null_rows]

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Print the DataFrame summary again; the non-null counts should now all match
# the row count.
df.info()

## **4. Preprocessing data**



In [None]:
def text_normalize(text):
    """Return a cleaned, stemmed version of ``text``.

    Steps, in order: lowercase; strip a leading ``rt`` marker followed by
    whitespace; remove everything from an ``http(s)://`` URL to the end of
    its line; drop every character that is neither a word character nor
    whitespace; remove English stop words; stem the remaining tokens with
    the English Snowball stemmer; re-join the tokens with single spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string (may be empty).
    """
    cleaned = text.lower()

    # Regex clean-up passes, applied in order. Note the URL pattern is
    # greedy (`.*`), so it also removes any text following the URL on the
    # same line.
    substitutions = (
        (r'^rt[\s]+', ''),
        (r'https?:\/\/.*[\r\n]*', ''),
        (r'[^\w\s]', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    english_stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')

    # Filter stop words first, then stem what is left.
    stems = [
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in english_stop_words
    ]

    return ' '.join(stems)

In [None]:
# Normalize every tweet in place; the function is passed directly since it
# already takes a single string argument.
df['clean_text'] = df['clean_text'].apply(text_normalize)

In [None]:
# Display the DataFrame to inspect the normalized `clean_text` column.
df

## **5. One-hot encode labels**

In [None]:
# Build a one-hot label matrix of shape (n_samples, n_classes).
n_classes = df['category'].nunique()
n_samples = df['category'].size

# Shift the labels by +1 so they can be used as column indices.
# NOTE(review): this presumes the raw labels are -1, 0 and 1 — confirm
# against the dataset.
y = df['category'].to_numpy() + 1
# Cast to an integer dtype: NumPy fancy indexing rejects float indices.
# (The original line read `y,astype(...)`, which raised a NameError.)
y = y.astype(np.uint8)

# Allocate the whole matrix at once, then set one entry per row.
y_encoded = np.zeros((n_samples, n_classes))
y_encoded[np.arange(n_samples), y] = 1

## **6. Create train, val, test sets**

In [None]:
# Turn each normalized tweet into a TF-IDF feature vector. `X` was never
# defined before this cell, so the conversion below could not run.
# The vocabulary is capped because the matrix is densified with `.toarray()`;
# an uncapped vocabulary would make the dense array very large.
# NOTE(review): 2000 is a chosen hyperparameter, not derived from the data —
# tune as needed. The vectorizer is fitted on the full dataset before the
# split, so IDF statistics see validation/test text; fit on the training
# portion only if a strict hold-out is required.
max_features = 2000
vectorizer = TfidfVectorizer(max_features=max_features)
X = vectorizer.fit_transform(df['clean_text']).toarray()

# Convert features and one-hot labels to float32 tensors for PyTorch.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y_encoded, dtype=torch.float32)

In [None]:
# Split fractions: 20% of all samples go to validation, then 12.5% of the
# remaining 80% (i.e. 10% of all samples) go to test, leaving 70% for training.
val_size = 0.2
test_size = 0.125
random_state = 2
is_shuffle = True

# Options shared by both splits.
split_options = dict(random_state=random_state, shuffle=is_shuffle)

# First split: carve the validation set out of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=val_size, **split_options
)

# Second split: carve the test set out of what remains.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=test_size, **split_options
)

In [None]:
# Report how many samples ended up in each split.
n_train, n_val, n_test = X_train.shape[0], X_val.shape[0], X_test.shape[0]
print(f'Number of training samples: {n_train}')
print(f'Number of val samples: {n_val}')
print(f'Number of test samples: {n_test}')

## **7. Define Softmax Regression model**

In [None]:
class SoftmaxRegression(nn.Module):
    """Single linear layer mapping input features to one score per class.

    ``forward`` returns the raw linear outputs (logits); no softmax is
    applied inside the module.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layer.

        Args:
            input_dim: Number of input features per sample.
            output_dim: Number of output scores (classes) per sample.
        """
        super().__init__()
        linear_layer = nn.Linear(input_dim, output_dim)
        # Kept inside a Sequential so parameter names stay `model.0.*`.
        self.model = nn.Sequential(linear_layer)

    def forward(self, x):
        """Return the logits for the batch ``x``."""
        logits = self.model(x)
        return logits

In [None]:
def compute_accuracy(y_hat, y_true):
    """Return the fraction of samples whose predicted class is correct.

    The previous body ignored both arguments (it called an undefined
    ``predict(X, theta)`` and compared against the global ``y``) and took
    ``.mean()`` of a boolean tensor, which PyTorch does not support.

    Args:
        y_hat: Tensor of per-class scores, one row per sample. The predicted
            class is the argmax along dimension 1.
        y_true: Ground-truth labels, either one-hot/probability rows with the
            same layout as ``y_hat`` or a 1-D tensor of class indices.

    Returns:
        The accuracy as a Python float in [0, 1] (``nan`` for an empty batch).
    """
    predicted = torch.argmax(y_hat, dim=1)
    # Accept both one-hot rows and plain class indices as targets.
    target = torch.argmax(y_true, dim=1) if y_true.dim() > 1 else y_true

    # Cast to float before averaging: mean() is not defined for bool tensors.
    acc = (predicted == target).float().mean().item()

    return acc

## **8. Training**

In [None]:
# Optimization hyperparameters.
lr = 0.1
epochs = 500

# Seed the RNGs before the model is built so weight initialization is
# reproducible.
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_state)

# Layer sizes come straight from the data: features per sample and the
# width of the one-hot label rows.
input_dim, output_dim = X_train.shape[1], y_train.shape[1]

model = SoftmaxRegression(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Per-epoch history, used for the plots after training.
train_accs = []
train_losses = []
val_accs = []
val_losses = []

# `epochs` is an int, so it must be wrapped in range() to be iterated.
for epoch in range(epochs):
    # --- Training step: one gradient update on the whole training set ---
    model.train()

    optimizer.zero_grad()

    y_hat = model(X_train)

    train_loss = criterion(y_hat, y_train)
    train_acc = compute_accuracy(y_hat, y_train)
    train_losses.append(train_loss.item())
    train_accs.append(train_acc)

    train_loss.backward()
    optimizer.step()

    # --- Validation: no gradients needed ---
    model.eval()

    with torch.no_grad():
        y_val_hat = model(X_val)
        val_loss = criterion(y_val_hat, y_val)
        val_losses.append(val_loss.item())
        val_acc = compute_accuracy(y_val_hat, y_val)
        val_accs.append(val_acc)

    # Report the losses just recorded (the original referenced the undefined
    # names `train_batch_loss` / `val_batch_loss`).
    print(f'\nEPOCH {epoch + 1}:\tTraining Loss: {train_losses[-1]:.3f}\tValidation Loss: {val_losses[-1]:.3f}')

In [None]:
# 2x2 grid: losses on the top row, accuracies on the bottom row;
# training curves on the left (green), validation on the right (orange).
fig, ax = plt.subplots(2, 2, figsize=(12, 10))

panels = [
    (ax[0, 0], train_losses, 'green', 'Loss', 'Training Loss'),
    (ax[0, 1], val_losses, 'orange', 'Loss', 'Validation Loss'),
    (ax[1, 0], train_accs, 'green', 'Accuracy', 'Training Accuracy'),
    (ax[1, 1], val_accs, 'orange', 'Accuracy', 'Validation Accuracy'),
]

for axis, series, color, y_label, title in panels:
    axis.plot(series, color=color)
    axis.set(xlabel='Epoch', ylabel=y_label)
    axis.set_title(title)

plt.show()

## **9. Evaluation**

In [None]:
# Val set: score the trained model without tracking gradients.
model.eval()
with torch.no_grad():
    y_hat = model(X_val)
    val_set_acc = compute_accuracy(y_hat, y_val)

print('Evaluation on validation set:')
print(f'Accuracy: {val_set_acc}')

In [None]:
# Test set: score the trained model without tracking gradients.
model.eval()
with torch.no_grad():
    y_hat = model(X_test)
    test_set_acc = compute_accuracy(y_hat, y_test)

print('Evaluation on test set:')
print(f'Accuracy: {test_set_acc}')