## **0. Download dataset**

```

```


In [None]:
# https://drive.google.com/file/d/1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_/view?usp=drive_link
!gdown --id 1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_


Downloading...
From: https://drive.google.com/uc?id=1e1uIwcJ1-MviSn9yk_ldPGffDWVp6yK_
To: d:\NgocDai\AIO24\AIO2024_AIO-056\Module5\Week2_SoftmaxRegression\twitter_sentiment_analysis_3cls_dataset.zip

  0%|          | 0.00/7.97M [00:00<?, ?B/s]
  7%|▋         | 524k/7.97M [00:00<00:02, 2.80MB/s]
 33%|███▎      | 2.62M/7.97M [00:00<00:00, 9.71MB/s]
100%|██████████| 7.97M/7.97M [00:00<00:00, 19.5MB/s]


In [1]:
import zipfile

with zipfile.ZipFile('twitter_sentiment_analysis_3cls_dataset.zip', 'r') as zip_ref:
    zip_ref.extractall()


## **1. Import libraries**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **2. Read dataset**


In [None]:
dataset_path = 'Twitter_Data.csv'
df = pd.read_csv(
    dataset_path
)
df


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [None]:
df.describe()


Unnamed: 0,category
count,162973.0
mean,0.225436
std,0.781279
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## **3. Drop missing value**


In [None]:
null_rows = df.isnull().any(axis=1)
df[null_rows]


Unnamed: 0,clean_text,category
148,,0.0
130448,the foundation stone northeast gas grid inaugu...,
155642,dear terrorists you can run but you cant hide ...,
155698,offense the best defence with mission shakti m...,
155770,have always heard politicians backing out thei...,
158693,modi government plans felicitate the faceless ...,
158694,,-1.0
159442,chidambaram gives praises modinomics,
159443,,0.0
160559,the reason why modi contested from seats 2014 ...,


In [None]:
df = df.dropna()


In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162969 non-null  object 
 1   category    162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


## **4. Preprocessing data**


In [None]:
def text_normalize(text):
    text = text.lower()
    # Retweet old acronym "RT" removal
    text = re.sub(r'^rt[\s]+', '', text)
    # Hyperlinks removal
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    # Punctuation removal
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    # Stemming
    stemmer = SnowballStemmer('english')
    words = text.split()
    words = [stemmer.stem(word) for word in words]
    text = ' '.join(words)
    return text


In [None]:
df['clean_text'] = df['clean_text'].apply(lambda x: text_normalize(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['clean_text'].apply(lambda x: text_normalize(x))


In [None]:
df.head()


Unnamed: 0,clean_text,category
0,modi promis minimum govern maximum govern expe...,-1.0
1,talk nonsens continu drama vote modi,0.0
2,say vote modi welcom bjp told rahul main campa...,1.0
3,ask support prefix chowkidar name modi great s...,1.0
4,answer among power world leader today trump pu...,1.0


In [None]:
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['clean_text']).toarray()


In [None]:
X


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## **5. label**


In [None]:
n_classes = df['category'].nunique()
n_samples = df['category'].size
y = np.array(df['category'].to_numpy() + 1, dtype=int)


## **6. Create train, val, test set**


In [None]:
import torch
val_size = 0.2
test_size = 0.125
random_state = 2
is_shuffle = True
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=val_size,
    random_state=random_state,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=random_state,
    shuffle=is_shuffle
)


In [None]:
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of val samples: {X_val.shape[0]}')
print(f'Number of test samples: {X_test.shape[0]}')


Number of training samples: 114078
Number of val samples: 32594
Number of test samples: 16297


## **7. Define Softmax Regression model**


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SoftmaxRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SoftmaxRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        logits = self.linear(x)
        probabilities = F.softmax(logits, dim=1)
        return probabilities


### **7.6. Accuracy function**


In [18]:
def compute_accuracy(y_hat, y):
    acc = (torch.argmax(y_hat, axis=1) == torch.argmax(y, axis=1)).mean()
    return acc


## **8. Training**


In [None]:
lr = 0.1
epochs = 50
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_state)

input_dim = X_train.shape[1]
output_dim = n_classes

model = SoftmaxRegression(
    input_dim, output_dim
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(), lr=lr
)


In [24]:
def train_model(model, criterion, optimizer, X_train, y_train, epochs):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        # Convert input and output data to tensors if needed
        inputs = torch.tensor(X_train, dtype=torch.float32)
        
        # Ensure labels are in integer class index format
        if y_train.ndim > 1:  # If y_train is one-hot encoded
            labels = torch.tensor(y_train, dtype=torch.long).argmax(dim=1)
        else:
            labels = torch.tensor(y_train, dtype=torch.long)

        # Reset gradients to zero before backpropagation
        optimizer.zero_grad()
        
        # Compute model predictions
        outputs = model(inputs)
        
        # Calculate loss based on model predictions and true labels
        loss = criterion(outputs, labels)
        
        # Perform backpropagation to calculate gradients
        loss.backward()
        
        # Update model parameters
        optimizer.step()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)  # Get the index of the max log-probability
        accuracy = (predicted == labels).sum().item() / labels.size(0)
        
        # Print loss and accuracy every 100 epochs

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%')

# Example of using the train function
train_model(model, criterion, optimizer, X_train, y_train, epochs)


  inputs = torch.tensor(X_train, dtype=torch.float32)
  labels = torch.tensor(y_train, dtype=torch.long)


Epoch [1/50], Loss: 1.0975, Accuracy: 37.44%
Epoch [2/50], Loss: 1.0972, Accuracy: 38.33%
Epoch [3/50], Loss: 1.0970, Accuracy: 39.14%
Epoch [4/50], Loss: 1.0967, Accuracy: 39.91%
Epoch [5/50], Loss: 1.0964, Accuracy: 40.62%
Epoch [6/50], Loss: 1.0962, Accuracy: 41.34%
Epoch [7/50], Loss: 1.0959, Accuracy: 42.04%
Epoch [8/50], Loss: 1.0956, Accuracy: 42.68%
Epoch [9/50], Loss: 1.0954, Accuracy: 43.13%
Epoch [10/50], Loss: 1.0951, Accuracy: 43.46%
Epoch [11/50], Loss: 1.0948, Accuracy: 43.70%
Epoch [12/50], Loss: 1.0946, Accuracy: 43.82%
Epoch [13/50], Loss: 1.0943, Accuracy: 43.99%
Epoch [14/50], Loss: 1.0941, Accuracy: 44.05%
Epoch [15/50], Loss: 1.0938, Accuracy: 44.11%
Epoch [16/50], Loss: 1.0936, Accuracy: 44.18%
Epoch [17/50], Loss: 1.0933, Accuracy: 44.19%
Epoch [18/50], Loss: 1.0931, Accuracy: 44.20%
Epoch [19/50], Loss: 1.0929, Accuracy: 44.21%
Epoch [20/50], Loss: 1.0926, Accuracy: 44.21%
Epoch [21/50], Loss: 1.0924, Accuracy: 44.22%
Epoch [22/50], Loss: 1.0921, Accuracy: 44.2