In [None]:
import nltk
# Print the installed NLTK version so the environment is recorded with the run.
print(nltk.__version__)
# Download the 'punkt' tokenizer data (network side effect on first run).
# NOTE(review): presumably required by the word_tokenize import used later in
# this notebook; newer NLTK releases may expect a different resource name
# ('punkt_tab') - confirm against the version printed above.
nltk.download('punkt')

In [None]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import re
from collections import defaultdict

import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from naive_bayes import *
from bert1 import *

First, we load the training and test data for each of the five targets. (No separate validation split is loaded in this notebook.)

In [None]:
import pandas as pd

dataset_directory = "stance"

# The five stance targets, in the order used by every per-target list below.
_TARGETS = ["hillary", "feminist", "climate", "atheism", "abortion"]


def _load_split(split):
    """Read the per-target CSV files of one split (``"train"`` or ``"test"``).

    Each file is read exactly once.

    Returns:
        (paths, frames): the list of CSV paths in ``_TARGETS`` order, and a
        dict mapping each target name to the DataFrame read from its file.
    """
    paths = [f"{dataset_directory}/stance_{target}_{split}.csv" for target in _TARGETS]
    frames = {target: pd.read_csv(path) for target, path in zip(_TARGETS, paths)}
    return paths, frames


def _tag_and_concat(frames):
    """Add a ``target`` column to a copy of each frame and stack the copies.

    ``DataFrame.assign`` returns a new frame, so the per-target frames passed
    in are left without the extra column.

    Returns:
        (tagged, combined): the list of tagged frames in ``_TARGETS`` order and
        their row-wise concatenation with a fresh integer index.
    """
    tagged = [frames[target].assign(target=target) for target in _TARGETS]
    return tagged, pd.concat(tagged, ignore_index=True)


# --- training split ---
dataset_train, _train_frames = _load_split("train")
train_hillary = _train_frames["hillary"]
train_feminist = _train_frames["feminist"]
train_climate = _train_frames["climate"]
train_atheism = _train_frames["atheism"]
train_abortion = _train_frames["abortion"]

# --- test split ---
dataset_test, _test_frames = _load_split("test")
test_hillary = _test_frames["hillary"]
test_feminist = _test_frames["feminist"]
test_climate = _test_frames["climate"]
test_atheism = _test_frames["atheism"]
test_abortion = _test_frames["abortion"]

# Combined frames: one row per tweet, with a `target` column naming its topic.
df_train, train = _tag_and_concat(_train_frames)
df_test, test = _tag_and_concat(_test_frames)

# 0 Data Preprocessing
Our dataset comprises tweets, which, unlike other data forms such as news releases, often display unconventional expressions. This irregularity poses challenges in tokenization and feature extraction. To mitigate these issues, it's crucial to undertake data preprocessing that's specifically designed for the unique attributes of tweets. We have segmented this preprocessing into several steps:

## 0.1 Eliminate "@user"
It's important to note the frequent presence of "@user" in Twitter texts. These mentions often don't contribute meaningful information to the analysis. As such, we choose to disregard these specific terms in our dataset.

In [None]:
# Remove the literal "@user" token from the text column of both splits.
# Only train and test are processed; no validation frame exists in this notebook.
for split_df in (train, test):
    split_df['text'] = split_df['text'].str.replace('@user', '', regex=False)

# Spot-check one training tweet after the replacement.
train['text'][2618]

In [None]:
# Drop the "#SemST" tag first, then strip every remaining "#" character so the
# hashtag words themselves stay in the text. The order matters: removing "#"
# first would leave a bare "SemST" behind.
for split_df in (train, test):
    for marker in ('#SemST', '#'):
        split_df['text'] = split_df['text'].str.replace(marker, '', regex=False)

# Spot-check the same training tweet as in the previous step.
train['text'][2618]

In [None]:
def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Lower-case the text column of both splits (no validation frame is loaded).
for split_df in (train, test):
    split_df['text'] = split_df['text'].apply(lowercase_text)

In [None]:
# Lookup table for replace_abbreviations, built once instead of on every call.
# Keys and values are all lower case: the text is lower-cased before this step
# runs, and capitalised replacements ("We are", "I am", ...) would reintroduce
# mixed case and split one word into two distinct tokens downstream.
_ABBREVIATIONS = {
    "u": "you",
    "r": "are",
    "b4": "before",
    "b/w": "between",
    "what's": "what is",
    "l8r": "later",
    "gr8": "great",
    "thx": "thanks",
    "tx": "thanks",
    "she's": "she is",
    "won't": "will not",
    "we're": "we are",
    "that's": "that is",
    "haven't": "have not",
    "we'll": "we will",
    "they're": "they are",
    "btw": "by the way",
    "idk": "i don't know",
    "imo": "in my opinion",
    "isn't": "is not",
    "here's": "here is",
    "should've": "should have",
    "aren't": "are not",
    "you've": "you have",
    "i'm": "i am",
    "you're": "you are",
    "would've": "would have",
    "you'll": "you will",
    "you'd": "you would",
    "it's": "it is",
    "couldn't": "could not",
    "they'd": "they would",
    "i'll": "i will",
    "gov't": "government",
    "didn't": "did not",
    "who'd": "who would",
    "i've": "i have",
    "let's": "let us",
    "who's": "who is",
    "youve": "you have",
    "he'll": "he will",
    "it'll": "it will",
    "shouldn't": "should not",
    "weren't": "were not",
    "can't": "cannot",
    "ain't": "am not",
    "ur's": "yours",
    "ca't": "cannot",
    "here`s": "here is",
    "we've": "we have",
    "doesn't": "does not",
    "he's": "he is",
    "hadn't": "had not",
    "tnx": "thanks",
    "ty": "thank you",
    "asap": "as soon as possible",
    "w/o": "without",
}


def replace_abbreviations(text):
    """Expand common tweet abbreviations and contractions in ``text``.

    The text is split on whitespace and every token that exactly matches a key
    of ``_ABBREVIATIONS`` is replaced by its expansion; all other tokens are
    kept as they are. Matching is exact, so tokens with attached punctuation
    (e.g. ``"can't,"``) or upper-case letters are not expanded.

    Args:
        text: Input string, expected to be lower-cased already.

    Returns:
        The tokens re-joined with single spaces (leading, trailing and
        repeated whitespace is therefore collapsed).
    """
    return " ".join(_ABBREVIATIONS.get(word, word) for word in text.split())

# Expand abbreviations in the text column of both splits (no validation frame is loaded).
for split_df in (train, test):
    split_df['text'] = split_df['text'].apply(replace_abbreviations)

In [None]:
def clean_text(text):
    """Trim surrounding whitespace and collapse each inner whitespace run to one space."""
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Normalise whitespace in the text column of both splits (no validation frame is loaded).
for split_df in (train, test):
    split_df['text'] = split_df['text'].apply(clean_text)

# 1 Naive Bayes

In [None]:
# Summarise the combined, preprocessed training frame.
# NOTE(review): get_basic_stats is not defined in this notebook; presumably it
# comes from one of the star imports (naive_bayes / bert1) - confirm.
get_basic_stats(train)

In [None]:
# Fit a Naive Bayes model on the combined training frame.
# NOTE(review): the following cell repeats exactly this construct-and-fit
# before printing diagnostics, so this cell looks redundant - confirm and
# consider removing it.
naive_bayes = NaiveBayes()
naive_bayes.fit(train)

In [None]:
# Fit on the combined training frame, then print sanity checks of the fitted state.
naive_bayes = NaiveBayes()
naive_bayes.fit(train)
# Per-category values stored on the fitted model.
print(f"Probability for each category: {naive_bayes.category_prob}")
# ngram_count is indexed per category; the next lines inspect entry 0 only.
print(f"Length of self.ngram_count: {len(naive_bayes.ngram_count)}")
print(f"Shape of the counts for 1st category: {naive_bayes.ngram_count[0].shape}")
print(f"Number of non-zero terms for 1st category: {(naive_bayes.ngram_count[0] > 0).sum()}")
print(f"Maximum count of the 1st category: {naive_bayes.ngram_count[0].max()}")
print(f"Minimum count of the 1st category: {naive_bayes.ngram_count[0].min()}")
# NOTE(review): presumably this sum should agree with the matching entry of
# total_count printed on the last line - confirm against NaiveBayes.fit.
print(f"Sum of ngram count for 1st category: {naive_bayes.ngram_count[0].sum()}")
print(f"Total count for each category: {naive_bayes.total_count}")

In [None]:
# Re-read the raw per-target CSV files. These frames never go through the
# text preprocessing applied to the combined `train` / `test` frames above.
_targets = ("hillary", "feminist", "climate", "atheism", "abortion")

# One read per target, unpacked in the same order as `_targets`.
train_hillary, train_feminist, train_climate, train_atheism, train_abortion = [
    pd.read_csv(f"{dataset_directory}/stance_{name}_train.csv") for name in _targets
]

dataset_test = [f"{dataset_directory}/stance_{name}_test.csv" for name in _targets]

test_hillary, test_feminist, test_climate, test_atheism, test_abortion = [
    pd.read_csv(path) for path in dataset_test
]

In [None]:
# Train a Naive Bayes model on the climate target only and score it on the
# climate test set.
labels = test_climate['label']

naive_bayes = NaiveBayes()
naive_bayes.fit(train_climate)
preds = naive_bayes.predict(test_climate['text'])

accuracy, mac_f1, mic_f1 = evaluate(preds, labels)
print(f"Accuracy: {accuracy}", f"Macro f1: {mac_f1}", f"Micro f1: {mic_f1}", sep="\n")

In [None]:
# Predict on the combined test frame with the current `naive_bayes` model.
# NOTE(review): in notebook order the most recent fit was on train_climate
# only, while `test` covers all five targets - confirm this is intended.
labels = test['label']
preds = naive_bayes.predict(test['text'])
print(f"Prediction: {preds[0:10]}")

In [None]:
# Score the predictions from the previous cell and print the three metrics.
accuracy, mac_f1, mic_f1 = evaluate(preds, labels)
print(f"Accuracy: {accuracy}", f"Macro f1: {mac_f1}", f"Micro f1: {mic_f1}", sep="\n")

In [None]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence

class BiLSTM(nn.Module):
    """Bidirectional LSTM classifier that outputs three logits per sequence.

    Each input sequence is a padded series of 1024-dimensional vectors. The
    final forward and backward hidden states of the LSTM are concatenated,
    passed through a ReLU-activated linear layer with dropout, and projected
    to three output scores.

    Args:
        linear_size: Width of the hidden linear layer.
        lstm_hidden_size: Hidden size of the LSTM, per direction.
        net_dropout: Dropout probability applied after the hidden linear layer.
        lstm_dropout: Value forwarded to ``nn.LSTM(dropout=...)``.
    """

    def __init__(self, linear_size, lstm_hidden_size, net_dropout, lstm_dropout):

        super(BiLSTM, self).__init__()

        self.model_name = 'BiLSTM'

        self.dropout = nn.Dropout(net_dropout)

        self.hidden_size = lstm_hidden_size
        # NOTE(review): nn.LSTM applies `dropout` only between stacked layers;
        # with the default single layer a non-zero value triggers a PyTorch
        # warning and has no effect. Kept to preserve the constructor contract.
        self.lstm = nn.LSTM(1024, self.hidden_size, dropout=lstm_dropout, bidirectional=True)
        # Forward and backward final states are concatenated -> 2 * hidden_size.
        self.linear = nn.Linear(self.hidden_size*2, linear_size)
        self.out = nn.Linear(linear_size, 3)
        self.relu = nn.ReLU()

    def forward(self, x, x_len, epoch, target_word, _):
        """Compute class logits for a batch of padded sequences.

        Args:
            x: Padded inputs; dimension 1 is squeezed away, after which the
                tensor is indexed as (batch, time, 1024).
            x_len: 1-D tensor with the true length of every sequence in ``x``.
            epoch, target_word, _: Accepted for interface compatibility; unused.

        Returns:
            Tensor of shape (batch, 3) with unnormalised class scores, in the
            same batch order as ``x``.
        """
        x = x.squeeze(1)

        # pack_padded_sequence (with its default enforce_sorted=True) needs the
        # batch sorted by decreasing length, so sort here and undo it below.
        seq_lengths, perm_idx = x_len.sort(0, descending=True)
        seq_tensor = x[perm_idx, :, :]
        # The lengths tensor must live on the CPU even when x is on an accelerator.
        packed_input = pack_padded_sequence(seq_tensor, seq_lengths.cpu(), batch_first=True)
        packed_output, (ht, ct) = self.lstm(packed_input)

        # Restore the caller's batch order on the final hidden states.
        unperm_idx = perm_idx.sort(0)[1]
        h_t = ht[:, unperm_idx, :]
        # h_t[0] is the forward direction's final state, h_t[1] the backward one.
        h_t = torch.cat((h_t[0, :, :self.hidden_size], h_t[1, :, :self.hidden_size]), 1)

        linear = self.relu(self.linear(h_t))
        linear = self.dropout(linear)
        out = self.out(linear)
        # The original fell off the end here and returned None.
        return out

In [None]:
from torch.utils.data import Dataset, DataLoader
class DataFrameDataset(Dataset):
    """Torch ``Dataset`` view over a DataFrame of (text, label) rows.

    Args:
        dataframe: Source frame. The text is read from the first column and the
            label from the second column, by position.
            NOTE(review): confirm the frame passed in really has text and an
            integer label in positions 0 and 1.
        tokenizer: Callable mapping a text string to a sequence of tokens.
        max_length: Maximum number of tokens kept per item.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tokenised text and label of row ``idx``.

        Returns:
            dict with ``'input'``, a list of at most ``max_length`` tokens, and
            ``'labels'``, a scalar ``torch.long`` tensor.
        """
        text = self.dataframe.iloc[idx, 0]
        label = self.dataframe.iloc[idx, 1]

        # Tokens are strings and cannot be stored in a tensor (there is no
        # torch.str dtype), so they are returned as a plain list. Batching
        # variable-length lists needs a custom collate_fn on the DataLoader.
        tokens = list(self.tokenizer(text))[:self.max_length]

        return {
            'input': tokens,
            'labels': torch.tensor(label, dtype=torch.long)
        }

def collate_token_batch(batch):
    """Collate DataFrameDataset items into one batch.

    The per-item ``'input'`` values differ in length from tweet to tweet, so
    they are gathered into a list rather than stacked; the scalar ``'labels'``
    tensors are stacked into a single 1-D tensor.
    """
    return {
        'input': [item['input'] for item in batch],
        'labels': torch.stack([item['labels'] for item in batch]),
    }


tokenizer = word_tokenize
train_dataset = DataFrameDataset(train, tokenizer)
# DataLoader's default collation cannot stack variable-length inputs, so an
# explicit collate_fn is supplied.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          collate_fn=collate_token_batch)

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

# Dropout rates: one for the classifier head, one forwarded to the LSTM.
lstm_dropout = 0.2
net_dropout = 0.2

# The hidden linear layer is as wide as the concatenated bidirectional state.
lstm_hidden_size = 128
linear_size = 2 * lstm_hidden_size

# Model, optimiser and loss for the BiLSTM classifier.
model = BiLSTM(linear_size, lstm_hidden_size, net_dropout, lstm_dropout)
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
# Rebinds `model`, replacing the BiLSTM instance created in the previous cell.
# NOTE(review): get_bert_model is presumably provided by `from bert1 import *` - confirm.
model = get_bert_model()

In [None]:
# NOTE(review): identical to the previous cell; this second call looks
# redundant - confirm and consider removing it.
model = get_bert_model()

In [None]:
# Grid of learning rates and batch sizes tried on the Hillary training set.
lrs = [2e-5, 1e-5, 8e-6]
batch_sizes = [32, 16]

# Visit every (learning rate, batch size) pair, learning rate outermost.
# NOTE(review): the labels are re-encoded on every iteration; if get_stance is
# pure this could be done once before the loop - confirm.
for lr, bs in [(rate, size) for rate in lrs for size in batch_sizes]:
    y = get_stance(train_hillary['Stance'], le)
    train_model(train_hillary['Tweet'], y, bs, lr)

In [None]:
# Grid of learning rates and batch sizes tried on the feminist training set.
lrs = [2e-5, 1e-5, 8e-6]
batch_sizes = [32, 16]

# Visit every (learning rate, batch size) pair, learning rate outermost.
# NOTE(review): the labels are re-encoded on every iteration; if get_stance is
# pure this could be done once before the loop - confirm.
for lr, bs in [(rate, size) for rate in lrs for size in batch_sizes]:
    y = get_stance(train_feminist['Stance'], le)
    train_model(train_feminist['Tweet'], y, bs, lr)

In [None]:
# Grid of learning rates and batch sizes tried on the climate training set.
lrs = [2e-5, 1e-5, 8e-6]
batch_sizes = [32, 16]

# Visit every (learning rate, batch size) pair, learning rate outermost.
# NOTE(review): the labels are re-encoded on every iteration; if get_stance is
# pure this could be done once before the loop - confirm.
for lr, bs in [(rate, size) for rate in lrs for size in batch_sizes]:
    y = get_stance(train_climate['Stance'], le)
    train_model(train_climate['Tweet'], y, bs, lr)

In [None]:
# Grid of learning rates and batch sizes tried on the atheism training set.
lrs = [2e-5, 1e-5, 8e-6]
batch_sizes = [32, 16]

# Visit every (learning rate, batch size) pair, learning rate outermost.
# NOTE(review): the labels are re-encoded on every iteration; if get_stance is
# pure this could be done once before the loop - confirm.
for lr, bs in [(rate, size) for rate in lrs for size in batch_sizes]:
    y = get_stance(train_atheism['Stance'], le)
    train_model(train_atheism['Tweet'], y, bs, lr)

In [None]:
# Grid of learning rates and batch sizes tried on the abortion training set.
lrs = [2e-5, 1e-5, 8e-6]
batch_sizes = [32, 16]

# Visit every (learning rate, batch size) pair, learning rate outermost.
# NOTE(review): the labels are re-encoded on every iteration; if get_stance is
# pure this could be done once before the loop - confirm.
for lr, bs in [(rate, size) for rate in lrs for size in batch_sizes]:
    y = get_stance(train_abortion['Stance'], le)
    train_model(train_abortion['Tweet'], y, bs, lr)

In [None]:
# Create the output directory; exist_ok=True keeps this cell re-runnable
# instead of raising FileExistsError on the second run.
os.makedirs("trained_models", exist_ok=True)
# Encode the labels from the same frame that supplies the tweets, so features
# and labels line up row for row. The original read them from `aa_df`, which
# is not defined anywhere in this notebook.
y = encode_labels(train_hillary['Stance'], le)
# NOTE(review): positional arguments 16, 8e-6, 8 are presumably batch size,
# learning rate and epochs - confirm against train_whole_model.
model, _ = train_whole_model(train_hillary['Tweet'], y, 16, 8e-6, 8)
model.save('stance_hillary_train.h5')

In [None]:
# Encode this target's own labels: reusing the `y` left over from the Hillary
# cell would pair feminist tweets with labels from a different dataset.
y = encode_labels(train_feminist['Stance'], le)
model, _ = train_whole_model(train_feminist['Tweet'], y, 16, 8e-6, 8)
model.save('stance_feminist_train.h5')

In [None]:
# Train on the climate TRAINING split with its own labels. The original fed
# test_climate tweets (paired with a stale `y` from another target) into
# training, so the later evaluation on test_climate would score a model on
# data it had been trained on.
y = encode_labels(train_climate['Stance'], le)
model, _ = train_whole_model(train_climate['Tweet'], y, 16, 8e-6, 8)
model.save('stance_climate_train.h5')

In [None]:
# Train on the atheism TRAINING split with its own labels. The original fed
# test_atheism tweets (paired with a stale `y` from another target) into
# training, so the later evaluation on test_atheism would score a model on
# data it had been trained on.
y = encode_labels(train_atheism['Stance'], le)
model, _ = train_whole_model(train_atheism['Tweet'], y, 16, 8e-6, 8)
model.save('stance_atheism_train.h5')

In [None]:
# Encode this target's own labels: reusing a `y` left over from an earlier
# cell would pair abortion tweets with labels from a different dataset.
y = encode_labels(train_abortion['Stance'], le)
model, _ = train_whole_model(train_abortion['Tweet'], y, 16, 8e-6, 8)
model.save('stance_abortion_train.h5')

In [None]:
# Load the saved Hillary model and evaluate it on the Hillary test set.
# The filename must be a string literal; unquoted it is parsed as attribute
# access on an undefined name and raises NameError.
test_result(get_model('stance_hillary_train.h5'), test_hillary)

In [None]:
# Load the saved feminist model and evaluate it on the feminist test set.
# Fixes the unquoted filename and the stray ':' that made this a SyntaxError.
test_result(get_model('stance_feminist_train.h5'), test_feminist)

In [None]:
# Load the saved climate model and evaluate it on the climate test set.
# Fixes the unquoted filename and the stray ':' that made this a SyntaxError.
test_result(get_model('stance_climate_train.h5'), test_climate)

In [None]:
# Load the saved atheism model and evaluate it on the atheism test set.
# Fixes the unquoted filename and the stray ':' that made this a SyntaxError.
test_result(get_model('stance_atheism_train.h5'), test_atheism)

In [None]:
# Load the saved abortion model and evaluate it on the abortion test set.
# Fixes the unquoted filename and the stray ':' that made this a SyntaxError.
test_result(get_model('stance_abortion_train.h5'), test_abortion)