In [None]:
!pip install transformers

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Choose the compute device: CUDA when torch can see a GPU, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the training file into a frame with id / label / tweet columns.
data = pd.read_fwf(
    'train.txt',
    sep=" ",
    header=None,
    names=["id", "label", "tweet"],
    encoding='UTF-8',
)
# data=data[900:1100]

# Keep only the tweet column, coerced to str, and gather it into a plain list.
data_text = data[['tweet']].astype('str')
tweets = data_text['tweet'].tolist()

print('Size of dataset:', len(tweets))

In [None]:
# Show the (rows, columns) dimensions of the loaded training frame.
data.shape

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# List the distinct values that occur in the label column.
data['label'].unique()

In [None]:
# Summary statistics for the training frame.
data.describe()

In [None]:
from matplotlib import pyplot as plt


In [None]:
# Working frame for modelling: only the label and the tweet text are needed.
new_df = data.loc[:, ['label', 'tweet']]

In [None]:
# Preview the label/tweet frame used for training.
new_df.head()

In [None]:
# Hyper-parameters and the tokenizer shared by the dataset, loaders and optimizer below.
MAX_LEN = 256  # passed to the tokenizer as max_length for every tweet
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4  # used by test_params, i.e. by every evaluation loader
# EPOCHS = 1  (the live value is assigned in the cell that runs the training loop)
LEARNING_RATE = 1e-05
# NOTE(review): `truncation` and `do_lower_case` are passed at load time here;
# it is not evident that RobertaTokenizer consumes either of them — confirm
# against the transformers docs before relying on lower-casing or truncation.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [None]:
class SentimentData(Dataset):
    """Torch dataset that tokenizes one tweet per item, on demand.

    Args:
        dataframe: frame with a ``tweet`` column (text) and a ``label`` column
            (numeric class id). Any index is accepted; rows are addressed by
            position.
        tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that
            returns a mapping with 'input_ids', 'attention_mask' and
            'token_type_ids'.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.tweet
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the tweet at position ``index``.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a float
            tensor 'targets' holding the row's label.
        """
        # Positional lookup: label-based `series[index]` raises KeyError (or
        # returns the wrong row) when the frame's index is not exactly 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse all runs of whitespace to single spaces.
        text = " ".join(text.split())

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and truncation is requested explicitly instead of being implied (with
        # a warning) by `max_length`. Calling the tokenizer directly is the
        # documented replacement for the deprecated `encode_plus`.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_size = 0.85
train_data = new_df.sample(frac=train_size, random_state=200)
# Rows not drawn for training form the test split. This has to run before
# train_data's index is renumbered on the next line.
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both splits so tweets are tokenized lazily, one item at a time.
training_set, testing_set = (
    SentimentData(split, tokenizer, MAX_LEN) for split in (train_data, test_data)
)

In [None]:
# Loader settings; test_params is reused later for the loader built from test.txt.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder followed by a small feed-forward classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 5), so
    the module emits 5 logits per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return class logits for a batch of encoded sequences.

        The first element of the encoder output is taken, and from it the
        vector at sequence position 0, which is then fed through the head.
        """
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        first_token = encoder_out[0][:, 0]
        features = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(features))

In [None]:
# Build the classifier and move its parameters to the selected device.
model = RobertaClass()
model.to(device)

In [None]:
# Adam over every model parameter at LEARNING_RATE, with cross-entropy on the class logits.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

In [None]:
def calcuate_accuracy(preds, targets):
    """Count how many predictions equal their target.

    Despite the name this returns a raw count of matches, not a ratio; the
    caller divides by the number of examples.
    """
    matches = preds == targets
    return matches.sum().item()

In [None]:
# Training function: one pass of fine-tuning the RoBERTa classifier over the training split.

def train(epoch):
    """Run one training epoch over ``training_loader``.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``. Prints running loss/accuracy every 5000 steps
    (including step 0) and the epoch totals at the end.

    Args:
        epoch: epoch number, used only in the printed summary.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself rather than the enumerate() iterator: tqdm can then
    # read the loader's length and show a total / ETA instead of a bare counter.
    for step, batch in enumerate(tqdm(training_loader)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        # CrossEntropyLoss expects integer class indices.
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit; detached so the
        # accuracy bookkeeping stays out of the autograd graph.
        predicted = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(predicted, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of full passes over the training loader.
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer # for stemming
from nltk.stem import WordNetLemmatizer # for lemmatization
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split , cross_val_score,KFold
from collections import Counter
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
import warnings
!pip install imblearn
import imblearn
warnings.filterwarnings("ignore")

In [None]:
from sklearn.metrics import precision_score,recall_score,accuracy_score, f1_score
def show_confusion(label,pred_label,nm):
  """Plot a confusion matrix of ``label`` against ``pred_label``.

  scikit-learn lays its first argument out on the rows (y axis) and its second
  on the columns (x axis). This function titles the y axis 'Predicted' and the
  x axis 'Actual', so the plot reads correctly when predictions are passed as
  ``label`` and ground truth as ``pred_label`` — which is how valid() calls it.

  Args:
    label: class ids placed on the rows / y axis.
    pred_label: class ids placed on the columns / x axis.
    nm: name appended to the plot title.
  """
  class_names=['No-Addiction','Addiction']
  cm=confusion_matrix(label,pred_label)
  # The two readable names only fit a 2x2 matrix. With any other number of
  # observed classes, passing them makes the plot raise on a tick/label count
  # mismatch, so fall back to the default 0..n-1 tick labels.
  display_labels = class_names if cm.shape[0] == len(class_names) else None
  disp =ConfusionMatrixDisplay(cm,display_labels=display_labels)
  disp.plot()
  plt.title(f'Confusion Matrix {nm}')
  plt.xlabel('Actual')
  plt.ylabel('Predicted')
  plt.show()

def mymetrics(Y,P , alg = None):
    """Print accuracy, recall, precision and F-measure as percentages.

    Each score is computed with scikit-learn's default settings from the true
    labels ``Y`` and the predictions ``P``. ``alg`` is accepted but not used.
    """
    rec = recall_score(Y, P)
    acc = accuracy_score(Y, P)
    prec = precision_score(Y, P)
    f_score = f1_score(Y, P)
    report_lines = [
        f"accuracy = {acc*100}",
        f'recall= {rec*100}',
        f'precision= {prec*100}',
        f'Fmeasure= {f_score*100}',
    ]
    print("\n".join(report_lines))

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report


def valid(model, testing_loader):
  """Evaluate ``model`` on ``testing_loader`` and report the results.

  Runs the model in eval mode without gradients, takes the arg-max class of
  each row of logits as the prediction, then shows a confusion matrix and
  prints a scikit-learn classification report (4 decimal digits).

  Args:
    model: module called as ``model(ids, mask, token_type_ids)`` that returns
      per-class logits.
    testing_loader: iterable of batches, each a mapping with 'ids', 'mask',
      'token_type_ids' and 'targets' tensors.

  Returns:
    None. Results are plotted and printed.
  """
  # Eval mode disables dropout so predictions are deterministic.
  model.eval()

  all_preds = []
  all_labels = []
  # Hand tqdm the loader itself so it can read its length and show a total.
  for batch in tqdm(testing_loader):
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
    targets = batch['targets'].to(device, dtype=torch.long)

    with torch.no_grad():
      logits = model(ids, mask, token_type_ids)

    # Collect predicted class ids and true labels on the CPU.
    all_preds.extend(logits.argmax(dim=1).cpu().numpy())
    all_labels.extend(targets.cpu().numpy())

  # The separate accuracy/F1/precision/recall calls were dropped: their results
  # were never used, they passed (pred, true) in the wrong order, and their
  # default binary averaging raises on multiclass predictions. The report
  # below already contains all of those figures.

  # show_confusion puts its first argument on the y axis, which it titles
  # 'Predicted', so predictions are passed first here.
  show_confusion(all_preds, all_labels, 'Roberta')

  print(classification_report(all_labels, all_preds, digits=4))



In [None]:
# Evaluate on the held-out split that was carved out of train.txt.
print("*****validation acuracy*******")
valid(model, testing_loader)

In [None]:
# Load the separate test file with the same id / label / tweet column names as the training file.
test_data2 = pd.read_fwf('test.txt', sep=" ", header=None, names=["id", "label","tweet"] , encoding='UTF-8')

In [None]:
# Keep only the two columns SentimentData reads (label and tweet).
test_data2 = test_data2[['label', 'tweet']]

In [None]:
# Preview the first rows of the test frame.
test_data2.head()

In [None]:
# Wrap the test frame in the same dataset class and reuse the evaluation loader settings.
testing_set2 = SentimentData(test_data2, tokenizer, MAX_LEN)
testing_loader2 = DataLoader(testing_set2, **test_params)

In [None]:
# Evaluate the trained model on the loader built from test.txt.
print("*****test acuracy*******")
valid(model, testing_loader2)