In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/loiphong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
dataset_path = './datasets/Twitter_Data.csv'
df = pd.read_csv(dataset_path)

df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162969 non-null  object 
 1   category    162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [47]:
def text_normalization(text):
    # lowercasing
    text = text.lower()

    # Retweet old acronym "RT" removal
    text = re.sub(r'rt[\s]+', '', text)

    # hyperlinks removal
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    # punctuation removal
    text = re.sub(r'[^\w\s]', '', text)

    # stopwords removal
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)

    # stemming
    stemmer = SnowballStemmer('english')
    words = text.split()
    words = [stemmer.stem(word) for word in words]
    text = ' '.join(words)

    # remove special characters and emojis
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # remove Unicode
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # remove encoding
    text = re.sub(r'&amp;', '&', text)

    # remove icon
    text = re.sub(r'pic\.twitter\.com\/\S+', '', text)
    return text

In [54]:
df['clean_text'] = df['clean_text'].apply(text_normalization)
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['clean_text']).toarray()

0         modi promi minimum govern maximum govern expec...
1                       talk nonsen continu drama vote modi
2         say vote modi welcom bjp told rahul main campa...
3         ask suppoprefix chowkidar name modi great serv...
4         answer among power world leader today trump pu...
                                ...                        
162975    456 crore paid neerav modi recov congress lead...
162976    dear rss terrorist payal gawar modi kill 1000 ...
162977                            cover interact forum left
162978    big project came india modi dream project happ...
162979    ever listen like gurukul disciplin maintain ev...
Name: clean_text, Length: 162969, dtype: object


In [49]:
intercept = np.ones((X.shape[0], 1))
X_b = np.concatenate((intercept, X), axis=1)

In [50]:
num_unique_class = np.unique(df['category'].to_numpy())

# one-hot encoding
y = df['category'].to_numpy() + 1
y = y.astype(np.uint8)
y_encoded = np.eye(num_unique_class.shape[0])[y]

In [51]:
val_size = 0.2
test_size = 0.125
random_state = 42
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(X_b, y_encoded, test_size=val_size, random_state=random_state, shuffle=is_shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_size, random_state=random_state, shuffle=is_shuffle)


In [52]:
def softmax(z):
    exp_z = np.exp(z)

    return exp_z / exp_z.sum(axis=1)[:, None]

def predict(X, theta):
    z = np.dot(X, theta)
    y_hat = softmax(z)
    
    return y_hat

def compute_loss(y_hat, y):
    n = y.size
    
    return (-1 / n) * np.sum(y * np.log(y_hat))

def compute_gradient(X, y, y_hat):
    n = y.size
    
    return np.dot(X.T, (y_hat - y)) / n

def update_theta(theta, gradient, lr):
    return theta - lr * gradient

def compute_accuracy(X, y, theta):
    y_hat = predict(X, theta)
    acc = (np.argmax(y_hat, axis=1) == np.argmax(y, axis=1)).mean()

    return acc

In [65]:
lr = 0.9
epochs = 200
batch_size = X_train.shape[0]
n_features = X_train.shape[1]
n_classes = df['category'].nunique()

np.random.seed(random_state)
theta = np.random.uniform(size=(n_features, n_classes))


train_accs = []
train_losses = []
val_accs = []
val_losses = []

for epoch in range(epochs):
    train_batch_losses = []
    train_batch_accs = []
    val_batch_losses = []
    val_batch_accs = []

    for i in range(0, X_train.shape[0], batch_size):
        X_i = X_train[i:i+batch_size]
        y_i = y_train[i:i+batch_size]

        y_hat = predict(X_i, theta)
        train_loss = compute_loss(y_hat, y_i)
        gradient = compute_gradient(X_i, y_i, y_hat)
        theta = update_theta(theta, gradient, lr)

        train_batch_losses.append(train_loss)
        train_acc = compute_accuracy(X_train, y_train, theta)
        train_batch_accs.append(train_acc)
        
        y_val_hat = predict(X_val, theta)
        val_loss = compute_loss(y_val_hat, y_val)
        val_batch_losses.append(val_loss)

        val_acc = compute_accuracy(X_val, y_val, theta)
        val_batch_accs.append(val_acc)

    train_batch_loss = sum(train_batch_losses) / len(train_batch_losses)
    val_batch_loss = sum(val_batch_losses) / len(val_batch_losses)
    train_batch_acc = sum(train_batch_accs) / len(train_batch_accs)
    val_batch_acc = sum(val_batch_accs) / len(val_batch_accs) 

    train_accs.append(train_batch_acc)
    train_losses.append(train_batch_loss)
    val_losses.append(val_batch_loss)
    val_accs.append(val_batch_acc)

    print(f'Epoch {epoch + 1}/{epochs} - Train loss: {train_batch_loss:.3f} - Train acc: {train_batch_acc:.3f} - Val loss: {val_batch_loss:.3f} - Val acc: {val_batch_acc:.3f}')



Epoch 1/200 - Train loss: 0.369 - Train acc: 0.367 - Val loss: 0.370 - Val acc: 0.363
Epoch 2/200 - Train loss: 0.368 - Train acc: 0.370 - Val loss: 0.369 - Val acc: 0.365
Epoch 3/200 - Train loss: 0.368 - Train acc: 0.373 - Val loss: 0.368 - Val acc: 0.369
Epoch 4/200 - Train loss: 0.367 - Train acc: 0.376 - Val loss: 0.368 - Val acc: 0.371
Epoch 5/200 - Train loss: 0.366 - Train acc: 0.378 - Val loss: 0.367 - Val acc: 0.374
Epoch 6/200 - Train loss: 0.366 - Train acc: 0.380 - Val loss: 0.366 - Val acc: 0.376
Epoch 7/200 - Train loss: 0.365 - Train acc: 0.383 - Val loss: 0.366 - Val acc: 0.378
Epoch 8/200 - Train loss: 0.365 - Train acc: 0.385 - Val loss: 0.366 - Val acc: 0.380
Epoch 9/200 - Train loss: 0.364 - Train acc: 0.387 - Val loss: 0.365 - Val acc: 0.382
Epoch 10/200 - Train loss: 0.364 - Train acc: 0.387 - Val loss: 0.365 - Val acc: 0.383
Epoch 11/200 - Train loss: 0.363 - Train acc: 0.389 - Val loss: 0.365 - Val acc: 0.384
Epoch 12/200 - Train loss: 0.363 - Train acc: 0.391 

In [58]:
val_set_acc = compute_accuracy (X_val , y_val , theta )
test_set_acc = compute_accuracy ( X_test , y_test , theta )

print(f'Validation set accuracy: {val_set_acc:.3f}')
print(f'Test set accuracy: {test_set_acc:.3f}')

Validation set accuracy: 0.410
Test set accuracy: 0.414
