In [None]:
# (ONE TIME ONLY) Install NVIDIA apex from source.
# Ref: https://github.com/NVIDIA/apex/issues/116
# NOTE: `%cd` changes the working directory of the notebook itself, so after this
# cell has run the working directory is the cloned `apex` folder.
%cd ../../
!git clone https://www.github.com/nvidia/apex
%cd apex
!python setup.py install

# To remove a previously installed apex, uncomment the following line:
# !pip uninstall apex

In [None]:
# Reference: https://zerowithdot.com/colab-github-workflow/
from google.colab import drive
from os.path import join

ROOT = '/content/drive'     # directory at which Google Drive is mounted (default for the drive)
PROJ = 'My Drive/pessimism_research'    # path to your project, relative to the mount point

# full path of the project folder: ROOT joined with PROJ
PROJECT_PATH = join(ROOT, PROJ)

In [None]:
# Run this cell only after your Colab session has been terminated and restarted,
# i.e. when Google Drive has to be mounted again.
drive.mount(ROOT)           # mount the drive at /content/drive

In [None]:
# Reference: https://zerowithdot.com/colab-github-workflow/
# Clone the project's GitHub repository into the project folder on Google Drive.
GIT_USERNAME = "namrathaurs"
GIT_REPOSITORY = "test-repo"

# HTTPS clone URL built from the user name and repository name above
GIT_PATH = "https://github.com/{}/{}.git".format(GIT_USERNAME, GIT_REPOSITORY)
# {GIT_PATH} and {PROJECT_PATH} are filled in from the Python variables; the quotes
# keep the space in "My Drive" from splitting the destination path into two arguments.
!git clone "{GIT_PATH}" "{PROJECT_PATH}"
# Optional: copy the project files (except data/) into the current working directory
# !rsync -aP --exclude=data/ "{PROJECT_PATH}"/*  ./

In [None]:
# !git remote -v                  # view the remote repository details (aka origin)
# !git pull origin master         # pull all updates from the 'origin'
# !rm -rf /content/drive/My\ Drive/pessimism_research         # in case you want to remove the existing cloned directory in your GDrive

In [None]:
# (ONE TIME ONLY)
# Install the libraries/packages required for running the notebook.
# NOTE(review): versions are not pinned, so a later run may pick up newer releases
# of these packages; consider pinning them for reproducibility.
!pip install transformers
!pip install simpletransformers

In [None]:
import os
# Restrict CUDA to the GPU with index 0. The variable is set here, before torch and
# simpletransformers are imported below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from simpletransformers.classification import ClassificationModel
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
import numpy as np
import torch


# Check whether CUDA support is enabled: select the first CUDA device when one is
# available, otherwise fall back to the CPU, and show the selected device.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
def main():
    """Fine-tune BERT on the setting-A training split and evaluate it on the validation split.

    Steps:
      1. Read ``trn_A.csv`` and ``val_A.csv`` from ``<PROJECT_PATH>/data/data_base``.
      2. Cast the ``labels_for_settingA`` column to float64.
      3. Fine-tune a pre-trained ``bert-base-uncased`` ``ClassificationModel`` on the
         training tweets.
      4. Predict labels for the validation tweets and print accuracy, F1 score and
         the confusion matrix.

    The function relies on notebook-level names defined in earlier cells
    (``PROJECT_PATH``, ``ClassificationModel``, ``metrics``, ``pd``, ``np``, ``os``,
    ``torch``). It takes no arguments and returns nothing; all results are printed.
    """
    text_col = "Tweet"
    label_col = "labels_for_settingA"

    # abspath resolves redundant separators and up-level references
    data_dir = os.path.abspath(os.path.join(PROJECT_PATH, "data/data_base"))

    # Read the data from files
    train_df = pd.read_csv(os.path.join(data_dir, "trn_A.csv"))
    valdn_df = pd.read_csv(os.path.join(data_dir, "val_A.csv"))
    # test_df = pd.read_csv(os.path.join(data_dir, "tst_A.csv"))
    print(train_df.shape, valdn_df.shape)

    # change datatype of labels column from int to float to avoid runtime warning
    # with respect to double_scalars (see Issue #2)
    train_df = train_df.astype({label_col: np.float64})
    valdn_df = valdn_df.astype({label_col: np.float64})

    # initialize the pre-trained model (pretrained weights).
    # Only request CUDA when it is actually available, so the notebook also runs
    # on a CPU-only runtime instead of failing at model construction.
    model = ClassificationModel(
        'bert', 'bert-base-uncased', use_cuda=torch.cuda.is_available()
    )

    # train the model (fine-tune the pre-trained model on the training data).
    # Select the text and label columns by name and hand them over as
    # ("text", "labels") in that order, so training does not depend on the order
    # (or number) of columns in the CSV file -- the validation step below selects
    # the same two columns by name.
    train_input = train_df[[text_col, label_col]].rename(
        columns={text_col: "text", label_col: "labels"}
    )
    model.train_model(train_input)

    # class balance of the validation set (label 1.0 = optimistic, 0.0 = pessimistic)
    optimistic_df = valdn_df[valdn_df[label_col] == 1.0]
    print("Optimistic tweets in validation set: ", optimistic_df.shape)
    pessimistic_df = valdn_df[valdn_df[label_col] == 0.0]
    print("Pessimistic tweets in validation set: ", pessimistic_df.shape)

    # use the predict method to make predictions and then use these predictions for evaluation
    # predict method requires examples to be formatted as a list
    valdn_samples = valdn_df[text_col].to_list()
    valdn_true_lbls = valdn_df[label_col].to_list()

    # test the trained model on the validation data
    pred_labels, model_outputs = model.predict(valdn_samples)

    # report the evaluation metrics
    acc = metrics.accuracy_score(valdn_true_lbls, pred_labels)
    f1 = metrics.f1_score(valdn_true_lbls, pred_labels)
    cm = metrics.confusion_matrix(valdn_true_lbls, pred_labels)

    print("Accuracy: %.3f" % acc)
    print("F1 score: %.3f" % f1)
    print("Confusion matrix:")
    print(cm)

In [None]:
# Entry point: executing this cell runs the whole train/evaluate pipeline in main().
# (Inside a notebook kernel __name__ is "__main__", so the guard passes.)
if __name__ == "__main__":
    main()

In [None]:
# Three folders (cache_dir/, outputs/ and runs/) are created when a pre-trained model
# from the simpletransformers package is used to train and predict. If they still
# exist when the notebook is re-run, you get an error message saying that the
# outputs directory already exists.
# Either delete these folders (as done below), or edit the model's configuration
# settings so that the directory contents are overwritten.
!rm -rf cache_dir/ outputs/ runs/