In [10]:
#Preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy
import numpy as np
from sklearn.utils import shuffle

def my_clean(text, stops, stemming):
      text = str(text)
      text = re.sub(r" US ", " american ", text)
      text = text.lower().split()
      text = " ".join(text)
      text = re.sub(r"what's", "what is ", text)
      text = re.sub(r"don't", "do not ", text)
      text = re.sub(r"aren't", "are not ", text)
      text = re.sub(r"isn't", "is not ", text)
      text = re.sub(r"%", " percent ", text)
      text = re.sub(r"that's", "that is ", text)
      text = re.sub(r"doesn't", "does not ", text)
      text = re.sub(r"he's", "he is ", text)
      text = re.sub(r"she's", "she is ", text)
      text = re.sub(r"it's", "it is ", text)
      text = re.sub(r"\'s", " ", text)
      text = re.sub(r"\'ve", " have ", text)
      text = re.sub(r"n't", " not ", text)
      text = re.sub(r"i'm", "i am ", text)
      text = re.sub(r"\'re", " are ", text)
      text = re.sub(r"\'d", " would ", text)
      text = re.sub(r"\'ll", " will ", text)
      text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
      text = re.sub(r"<url", " ", text)
      text = re.sub(r",", " ", text)
      text = re.sub(r"\.", " ", text)
      text = re.sub(r"!", " ! ", text)
      text = re.sub(r"\/", " ", text)
      text = re.sub(r"\^", " ^ ", text)
      text = re.sub(r"\+", " + ", text)
      text = re.sub(r"\-", " - ", text)
      text = re.sub(r"\=", " = ", text)
      text = re.sub(r"'", " ", text)
      text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
      text = re.sub(r":", " : ", text)
      text = re.sub(r" u s ", " american ", text)
      text = re.sub(r"\0s", "0", text)
      text = re.sub(r" 9 11 ", "911", text)
      text = re.sub(r"e - mail", "email", text)
      text = re.sub(r"j k", "jk", text)
      text = re.sub(r"\s{2,}", " ", text)
      text = text.lower().split()
      text = [w for w in text if len(w) >= 2]
      if stemming and stops:
          text = [word for word in text if word not in stopwords.words('english')]
          wordnet_lemmatizer = WordNetLemmatizer()
          englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
          text = [englishStemmer.stem(word) for word in text]
          text = [wordnet_lemmatizer.lemmatize(word) for word in text]
          text = [ word for word in text if word not in stopwords.words('english')]
      elif stops:
          text = [ word for word in text if word not in stopwords.words('english')]
      elif stemming:
          wordnet_lemmatizer = WordNetLemmatizer()
          englishStemmer = SnowballStemmer("english", ignore_stopwords=True)
          text = [englishStemmer.stem(word) for word in text]
          text = [wordnet_lemmatizer.lemmatize(word) for word in text]
      text = " ".join(text)
      return text

class Preproccesor:

    def __init__(self):
        """Init function
        """
    def load_data(preprocessed=True, stemming_a=True):
        DIRECTORY = "/content/"

        # Note: it seems that the data is already to lower case, so no need to apply lower() to the text
        pos_data = pd.read_fwf(DIRECTORY+"train_neg.txt", header=None, names=["text"]).drop_duplicates().apply(lambda x: x.str.lower())
        pos_data = pos_data[:1000]
        pos_data["labels"] = 1
        neg_data = pd.read_fwf(DIRECTORY+'train_pos.txt', header=None, names=["text"]).drop_duplicates().apply(lambda x: x.str.lower())
        neg_data = neg_data[:1000]
        neg_data["labels"] = 0
        data = pd.concat([pos_data, neg_data], ignore_index=True)

        np.random.seed(500)
        data = data.iloc[np.random.permutation(len(data))]
        XT = data['text'].values
        X = []
        y = data['labels'].values
        for x in XT:
            if preprocessed:
                X.append(my_clean(text=str(x), stops=True, stemming=stemming_a))
            else:
                X.append(x)
        return numpy.array(X), numpy.array(y)



In [2]:
#import data
# in some cases without stemming and without removing stop words the model can perform better since learn well the meaning of words
X, y = Preproccesor.load_data(preprocessed=True, stemming_a=True)

class_names = ['Negative', 'Positive']

In [3]:
print("Total tweets:",len(y))
print("Positive tweets:",sum(y))
print("Negative tweets:",len(y)-sum(y))

print("Sample")
print(X[3])
print(y[3])

Total tweets: 2000
Positive tweets: 1000
Negative tweets: 1000
Sample
necklac silver smokeyquartz jewer 18 inch gnklc 322 jewelri ancient time india pow
1


In [4]:
#CROSS VALIDATION
from sklearn.model_selection import train_test_split

#split train data (20% testing , 10% validation and 70% training)
indices = np.arange(len(y))
train_texts, test_texts, train_labels, test_labels, _, test_indexes = train_test_split(list(X), y, indices, stratify=y, test_size=0.2, random_state=42)

size = (0.1 * len(y)) / len(train_labels)
train_texts, validation_texts, train_labels, validation_labels = train_test_split(list(train_texts), train_labels, stratify=train_labels, test_size=size, random_state=42)

print("Training samples: ",len(train_labels))
print("Validation samples: ",len(validation_labels))
print("Test samples: ",len(test_labels))

Training samples:  1400
Validation samples:  200
Test samples:  400


In [5]:
import torch

from transformers import DistilBertTokenizerFast 
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

#Set training arguments
training_args = TrainingArguments(
    evaluation_strategy='epoch',     # evaluation frequency
    save_strategy='epoch',           # model checkpoint frequency
    logging_strategy='epoch',        # logging frequency
    log_level='warning',             # logging level
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs # checked for different epoches
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=256,   # batch size for evaluation, change according to GPU memory capabilities #checked for different eval batch
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,              # strength of weight decay
    logging_dir='./logs'             # directory for storing logs
)

In [6]:
#
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
validation_encodings = tokenizer(list(validation_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
    
train_dataset = TwitterDataset(train_encodings, train_labels)
validation_dataset = TwitterDataset(validation_encodings, validation_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)

In [None]:
import transformers

#Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert_model/", output_attentions=False)

#Train model
trainer = Trainer(
            model=model,                         # the instantiated Transformers model to be trained
            args=training_args,                  # training arguments
            train_dataset=train_dataset,         # training dataset
            eval_dataset=validation_dataset      # evaluation dataset
)
trainer.train()

In [None]:
from scipy.special import softmax

#return prediction  probabilities
def model_predict(dataset):
    logits = trainer.predict(dataset).predictions
    probabilities = softmax(logits, axis = 1)
    return probabilities

In [None]:
probabilities = model_predict(test_dataset)# the test dataset is from the original training data not the final testing dataset

#Get predicted labels
y_preds = []
for i in probabilities:
    y_preds.append(np.argmax(i))

In [None]:
#Compute performance metrics

f1_score = f1_score(test_labels, y_preds, average='macro')
precision_score= precision_score(test_labels, y_preds, average='macro')
recall_score = recall_score(test_labels, y_preds, average='macro')
accuracy_score = accuracy_score(test_labels, y_preds)