In [1]:
!curl -s -O http://saifmohammad.com/WebDocs/AIT-2018/AIT2018-DATA/SemEval2018-Task1-all-data.zip
!unzip -q -o SemEval2018-Task1-all-data.zip -x "__MACOSX*"

In [2]:
from tqdm import tqdm
import spacy
from scipy.sparse import csr_matrix
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Shared spaCy pipeline; preprocess_with_spacy() below reads this module-level
# name to tokenise each tweet.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Read at most 2000 rows of the tab-separated English E-c training split and
# discard the ID column in the same expression.
# NOTE(review): on_bad_lines='skip' drops any row pandas cannot parse without
# reporting it — confirm that no tweets are being lost this way.
df = pd.read_csv(
    './SemEval2018-Task1-all-data/English/E-c/2018-E-c-En-train.txt',
    sep='\t',
    on_bad_lines='skip',
    nrows=2000,
).drop(columns=['ID'])

In [5]:
# Show the raw frame (Tweet text plus the 0/1 emotion columns) before any
# preprocessing is applied.
df

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,"What a great training course, lots of photos, ...",0,1,0,0,1,1,1,0,0,0,0
1996,@TedLeitner @jesseagler You two are TEN TIMES ...,1,0,0,0,1,1,0,0,0,0,1
1997,#GBBO is such a homely pure piece of tv gold. ...,1,0,1,0,0,0,0,0,0,0,0
1998,gifs on iOS10 messaging app are hilarious.,0,0,0,0,1,0,0,0,0,1,0


In [6]:
def preprocess_with_spacy(text, nlp_pipeline=None):
    """Lower-case ``text`` and keep only its word-like tokens.

    The text is lower-cased, tokenised, and every token flagged as punctuation
    or whitespace is discarded; the remaining token texts are joined with
    single spaces.

    Args:
        text: The raw string to normalise.
        nlp_pipeline: Optional callable mapping a string to an iterable of
            tokens that expose ``text``, ``is_punct`` and ``is_space``.
            When omitted, the notebook-level ``nlp`` spaCy pipeline is used.

    Returns:
        The kept tokens as one space-separated string ('' if nothing is kept).
    """
    tokenizer = nlp if nlp_pipeline is None else nlp_pipeline
    # Lower-case once, before tokenising, so every token text is already
    # lower-case; a second lower() afterwards would have no effect.
    doc = tokenizer(text.lower())
    # A lone '.' token is punctuation and is removed by this filter, so no
    # separate period-replacement pass is needed.
    return ' '.join(
        token.text for token in doc if not (token.is_punct or token.is_space)
    )

In [7]:
# Normalise every tweet in place; the Tweet column now holds the cleaned text.
df['Tweet'] = df['Tweet'].map(preprocess_with_spacy)

In [8]:
# Show the frame again to inspect the effect of preprocess_with_spacy on the
# Tweet column.
df

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,worry is a down payment on a problem you may n...,0,1,0,0,0,0,1,0,0,0,1
1,whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,@max_kellerman it also helps that the majority...,1,0,1,0,1,0,1,0,0,0,0
3,accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,my roommate it 's okay that we ca n't spell be...,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,what a great training course lots of photos fu...,0,1,0,0,1,1,1,0,0,0,0
1996,@tedleitner @jesseagler you two are ten times ...,1,0,0,0,1,1,0,0,0,0,1
1997,gbbo is such a homely pure piece of tv gold ch...,1,0,1,0,0,0,0,0,0,0,0
1998,gifs on ios10 messaging app are hilarious,0,0,0,0,1,0,0,0,0,1,0


In [9]:
# The preprocessed tweet texts; this Series is the input to the vectorizer.
tweets = df['Tweet']

In [10]:
# Word-level bag of n-grams restricted to n = 2, i.e. only two-word sequences
# become features (single words are not counted).
count_vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='word',
)

In [11]:
# Labels: every column except the tweet text.
Y = df.drop(columns='Tweet')
# Features: bigram counts, learning the vocabulary and transforming in one step.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split in the next cell, so test-set bigrams influence the feature space —
# confirm whether fitting on the training rows only is wanted.
X = count_vectorizer.fit_transform(tweets)

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [13]:
# Convert the feature matrices to dense NumPy arrays (rebinding X_train/X_test)
# and then to float32 tensors for the network.
# NOTE(review): a dense copy of a bag-of-bigrams matrix can be large; confirm
# memory use is acceptable before raising the row count.
X_train = csr_matrix(X_train).toarray()
X_test = csr_matrix(X_test).toarray()

X_train_tensor, X_test_tensor = (
    torch.tensor(dense, dtype=torch.float32) for dense in (X_train, X_test)
)

In [14]:
# Fit the encoder on the training labels only, then encode both splits with it
# and wrap the results as float32 tensors.
# NOTE(review): y_train appears to hold 11 independent 0/1 emotion columns
# (several can be 1 on the same row). OneHotEncoder encodes each input column
# separately, so this presumably yields two indicator columns per emotion
# rather than one class per row — confirm the encoded shape and whether the
# raw 0/1 matrix should be used as a multi-label target instead.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(y_train)

y_train_encoded = onehot_encoder.transform(y_train)
y_test_encoded = onehot_encoder.transform(y_test)

y_train_tensor, y_test_tensor = (
    torch.tensor(encoded, dtype=torch.float32)
    for encoded in (y_train_encoded, y_test_encoded)
)

In [15]:
class MultiClassModel(nn.Module):
    """Feed-forward network with a single 128-unit hidden layer.

    ``forward`` applies Linear -> ReLU -> Linear and returns the final linear
    layer's output unchanged (no softmax or sigmoid is applied here).

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Return one score per class for every row of ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

In [25]:
class ImprovedMultiClassModel(nn.Module):
    """Feed-forward network with two hidden layers (256 and 128 units).

    Each hidden layer is followed by ReLU and then dropout with p=0.5; the
    output layer's result is returned unchanged (no softmax or sigmoid).

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=num_classes)
        # One Dropout module is shared by both hidden layers.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return one score per class for every row of ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = self.dropout(torch.relu(hidden_layer(x)))
        return self.fc3(x)

In [26]:
# Layer widths come from the data: one input per feature column and one output
# per column of the encoded label tensor.
input_size = X_train_tensor.size(1)
num_classes = y_train_tensor.size(1)

# Build the deeper network with dropout; this is the model that gets trained.
model = ImprovedMultiClassModel(input_size=input_size, num_classes=num_classes)

In [27]:
# Loss and optimiser used by the training loop.
# NOTE(review): CrossEntropyLoss models exactly one class per sample, while the
# label columns shown above can have several emotions set to 1 on the same row.
# A multi-label loss such as nn.BCEWithLogitsLoss over the raw 0/1 label matrix
# may be what is intended — confirm the task definition.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [28]:
num_epochs = 200
batch_size = 32

# Dropout is only active in training mode. Set the mode explicitly so that
# re-running this cell after the evaluation cell (which calls model.eval())
# still trains with dropout enabled.
model.train()

# Class-index targets for CrossEntropyLoss, computed once for the whole
# training set rather than once per batch per epoch; the row-wise arg-max does
# not depend on how the rows are batched.
# NOTE(review): torch.max returns the first maximal index on ties. If
# y_train_tensor is a per-column one-hot encoding of several binary labels,
# every row has a 1 in column 0 or column 1, so these targets would only ever
# encode the first label column — confirm this is the intended target.
train_targets = torch.max(y_train_tensor, 1)[1]

for epoch in tqdm(range(num_epochs), desc='Training Progress'):
    # Mini-batches are taken in the same fixed row order every epoch.
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i + batch_size]
        batch_targets = train_targets[i:i + batch_size]

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

print('Training complete')


Training Progress: 100%|██████████████████████| 200/200 [02:35<00:00,  1.28it/s]

Training complete





In [29]:
# Evaluate on the held-out rows with dropout disabled and gradients off.
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    predicted = torch.max(y_pred, 1).indices

# Accuracy = share of test rows whose arg-max prediction equals the arg-max of
# the encoded label row.
# NOTE(review): this compares a single index per row; if the labels are
# multi-label, the figure does not measure per-emotion performance — confirm.
expected = torch.max(y_test_tensor, 1).indices
num_correct = torch.sum(predicted == expected).item()
accuracy = num_correct / len(X_test_tensor)
print(f'Accuracy on the test set: {accuracy * 100:.2f}%')

Accuracy on the test set: 64.75%
