## IMDB Dataset

In [None]:
# Setup
!pip install fairseq
!pip install tensorboardX

In [None]:
# Downloading the IMDB dataset
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar zxvf aclImdb_v1.tar.gz

In [None]:
# Format data
import os
import random
from glob import glob

def prepare_data(datadir):
    """Flatten the per-review text files under ``datadir`` into line-aligned files.

    For each of the ``train`` and ``test`` splits, reads the first line of
    every ``pos/*.txt`` and ``neg/*.txt`` file, shuffles the samples with a
    fixed seed, and writes them to ``<datadir>/<name>.input0`` (one review per
    line) and ``<datadir>/<name>.label`` (``1`` for ``pos``, ``0`` for
    ``neg``). Line *i* of both files describes the same review. The ``test``
    split is written under the name ``dev``.

    Args:
        datadir: Directory containing ``train/`` and ``test/`` folders, each
            with ``pos/`` and ``neg/`` sub-folders of ``.txt`` files.

    Side effects:
        Re-seeds the global ``random`` generator with 0 and (over)writes four
        files directly inside ``datadir``.
    """
    random.seed(0)
    for split in ['train', 'test']:
        samples = []
        for class_label in ['pos', 'neg']:
            label = 1 if class_label == 'pos' else 0
            # glob() returns paths in arbitrary, filesystem-dependent order;
            # sorting makes the seeded shuffle below reproducible.
            fnames = sorted(glob(os.path.join(datadir, split, class_label, '*.txt')))
            for fname in fnames:
                # Explicit UTF-8 so decoding does not depend on the locale.
                with open(fname, 'r', encoding='utf-8') as fin:
                    samples.append((fin.readline().strip(), label))
        random.shuffle(samples)
        out_fname = 'train' if split == 'train' else 'dev'
        with open(os.path.join(datadir, out_fname + '.input0'), 'w', encoding='utf-8') as f1, \
             open(os.path.join(datadir, out_fname + '.label'), 'w', encoding='utf-8') as f2:
            for text, target in samples:
                f1.write(text + '\n')
                f2.write(str(target) + '\n')

# Write aclImdb/{train,dev}.input0 and aclImdb/{train,dev}.label from the
# 'aclImdb' directory.
prepare_data('aclImdb')

In [None]:
# Clone the fairseq repository and make it the working directory: later cells
# run `python -m examples.roberta...` and use paths relative to this checkout.
# NOTE(review): `%cd fairseq` is a relative change of directory, so re-running
# this cell in the same session would resolve it from inside the checkout —
# run it once per session.
!git clone https://github.com/pytorch/fairseq
%cd fairseq

In [None]:
# List the current working directory (a quick check after the `%cd` above).
!ls

In [None]:
# Downloading the BPE encoder and vocabulary
# (wget -N: timestamping, so an up-to-date local copy is not downloaded again)
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'

# BPE encoding of the data
# Reads the train/dev .input0 files written by prepare_data() and writes a
# .input0.bpe file next to each of them.
# NOTE(review): both files go through a single encoder run; this presumably
# relies on train and dev having the same number of lines — confirm, or encode
# each file in its own run as the Amazon section does.
!python -m examples.roberta.multiprocessing_bpe_encoder \
    --encoder-json encoder.json \
    --vocab-bpe vocab.bpe \
    --inputs "../aclImdb/train.input0" "../aclImdb/dev.input0" \
    --outputs "../aclImdb/train.input0.bpe" "../aclImdb/dev.input0.bpe" \
    --workers 60 \
    --keep-empty

In [None]:
# Download the dictionary for fairseq
# (dict.txt is saved in the current directory and reused by the Twitter and
# Amazon preprocessing cells further down via `--srcdict dict.txt`)
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

# Preprocess the data for fairseq
# First run: the BPE-encoded review text, using the downloaded dict.txt;
# output goes to ../IMDB-bin/input0.
!fairseq-preprocess \
    --only-source \
    --trainpref "../aclImdb/train.input0.bpe" \
    --validpref "../aclImdb/dev.input0.bpe" \
    --destdir "../IMDB-bin/input0" \
    --srcdict dict.txt \
    --workers 60

# Second run: the label files; output goes to ../IMDB-bin/label.
# No --srcdict is passed here — presumably the label dictionary is built from
# the label files themselves (TODO confirm against the fairseq-preprocess docs).
!fairseq-preprocess \
    --only-source \
    --trainpref "../aclImdb/train.label" \
    --validpref "../aclImdb/dev.label" \
    --destdir "../IMDB-bin/label" \
    --workers 60


In [None]:
!pip install tensorboardX

!fairseq-train "/content/IMDB-bin/" \
    --user-dir /content/fairseq/examples/linformer/linformer_src \
    --max-positions 512 \
    --batch-size 16 \
    --max-tokens 4400 \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch linformer_roberta_base \
    --criterion sentence_prediction \
    --classification-head-name 'imdb_head' \
    --num-classes 2 \
    --dropout 0.1 --attention-dropout 0.1 \
    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 1e-05 --total-num-update 7812 --warmup-updates 469 \
    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
    --max-epoch 1 \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --shorten-method "truncate" \
    --find-unused-parameters \
    --update-freq 4

## Twitter Dataset

In [None]:
# Importing the dataset from Kaggle

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call by the download loop below.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries;
# the download loop splits on ',' and ':' and unquotes the URL.
# NOTE(review): the URL embedded here is a signed link carrying
# X-Goog-Date=20240503T155156Z and X-Goog-Expires=259200 (seconds), so it has
# presumably expired and must be regenerated before this cell can download.
DATA_SOURCE_MAPPING = 'sentiment140:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2477%2F4140%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240503%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240503T155156Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D287bc94575b702a6892fe02b7bd28c326a770f21bf9b3800966a2d7b086fdff55f5a34ec7f9085607ee9b47f7377889ebb681c276883014dae0098dc44c979cef2a1da717476acc16ba0c738273585d2cca0721c12a45af861cd7deb08724952a3f6ca8ba45ed6b55aa0c22fd0054cde92f216e6cf6388c84700e4d22af8a4955a7cd524e6a1e9956d8c8b981be1773b2f187f7b969c87de1c0fca659e5e68570477c3eb52066d453f36e6229e50514d6d21c1d460acdf4d22a2763469dfa07801404cfad4383ff87b396cedf172160af365557cebe9b40ec9a1838af60e49b5a8eb8ada61146332bb53e6e0db21087781df57d2613fa4a816bf965739a546dd'

# Directories created below; downloads are extracted under KAGGLE_INPUT_PATH.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and extract it (zip or tar) to
# KAGGLE_INPUT_PATH/<directory>. HTTP and OS errors are reported and the loop
# moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is percent-encoded.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent; the progress bar is drawn only when
            # the total size is known and non-zero.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            while True:
                chunk = fileres.read(CHUNK_SIZE)
                if not chunk:
                    break
                dl += len(chunk)
                tfile.write(chunk)
                if total_bytes:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; `as tarfile` would rebind
                # the `tarfile` module for the rest of the session.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Caught before OSError because HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')

In [None]:
# Load Sentiment140 and build a shuffled 10,000-row (tweet, sentiment) frame.
# pandas is imported here because the notebook's other `import pandas` lines
# sit in later cells, so a fresh kernel would otherwise raise NameError.
import pandas as pd

data = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',encoding = 'latin',header=None)
# Keep column 5 (renamed 'tweet') and column 0 (renamed 'sentiment').
data = data[[5, 0]].copy()
data.columns=['tweet', 'sentiment']
print(data.head())
# Recode label 4 as 1 so that the classes are 0 and 1.
data['sentiment'] = data['sentiment'].replace(4,1)
# Draw the 10,000 rows at random instead of taking head(10000) before
# shuffling: the leading rows of the file all carry label 0 (see the printed
# preview), so a head() subset would leave only one class for the classifier.
# A fixed random_state makes the subset reproducible.
data = data.sample(n=min(10000, len(data)), random_state=42)
print(data.head())

                                               tweet  sentiment
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...          0
1  is upset that he can't update his Facebook by ...          0
2  @Kenichan I dived many times for the ball. Man...          0
3    my whole body feels itchy and like its on fire           0
4  @nationwideclass no, it's not behaving at all....          0
                                                  tweet  sentiment
3023                                   whitout friends           0
9932  Okay, Chaotic is a repeat today.  Damn, I got ...          0
3237  @RellyAB will likely be unable to attend - cry...          0
341   @hyperbets i hope this doesn't last too long. ...          0
1566  @CandiceNicolePR I haven't heard from you in w...          0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Hold out 10% of the rows as the dev split (fixed seed).
train_df, dev_df = train_test_split(data, test_size=0.1, random_state=42)

# The downstream tools read these files as one raw sample per line.
# Series.to_csv() would wrap any tweet containing a comma or a quote in
# double quotes and could spread a tweet with an embedded newline over several
# lines, so the lines are written directly instead. Line breaks inside a value
# are replaced by spaces to keep text and label files aligned.
for values, path in [
    (train_df['tweet'], 'train.input0'),
    (train_df['sentiment'], 'train.label'),
    (dev_df['tweet'], 'dev.input0'),
    (dev_df['sentiment'], 'dev.label'),
]:
    with open(path, 'w', encoding='utf-8') as fout:
        for value in values:
            text = '' if pd.isna(value) else str(value)
            fout.write(text.replace('\r', ' ').replace('\n', ' ') + '\n')

# Report the line count of every file; text and label counts must match.
for caption, path in [
    ("Training texts:", 'train.input0'),
    ("Training labels:", 'train.label'),
    ("Validation texts:", 'dev.input0'),
    ("Validation labels:", 'dev.label'),
]:
    with open(path, encoding='utf-8') as fin:
        print(caption, sum(1 for _ in fin))


Training texts: 144000
Training labels: 144000
Validation texts: 16000
Validation labels: 16000


In [None]:
# # BPE encoding of the data
!python -m examples.roberta.multiprocessing_bpe_encoder \
    --encoder-json encoder.json \
    --vocab-bpe vocab.bpe \
    --inputs "train.input0" "dev.input0" \
    --outputs "train.input0.bpe" "dev.input0.bpe" \
    --workers 60 \
    --keep-empty

In [None]:
# Preprocess the data for fairseq
# First run: the BPE-encoded tweets, using the dict.txt downloaded in the IMDB
# section; output goes to Tweet-bin/input0.
!fairseq-preprocess \
    --only-source \
    --trainpref "train.input0.bpe" \
    --validpref "dev.input0.bpe" \
    --destdir "Tweet-bin/input0" \
    --srcdict dict.txt \
    --workers 60

# Second run: the label files (no --srcdict); output goes to Tweet-bin/label.
!fairseq-preprocess \
    --only-source \
    --trainpref "train.label" \
    --validpref "dev.label" \
    --destdir "Tweet-bin/label" \
    --workers 60

In [None]:
# Re-doing this process with a subset of the dataset
!rm -rf Tweet-bin/input0 Tweet-bin/label
!head -n 16000 train.label > train.small.label

!fairseq-preprocess \
    --only-source \
    --trainpref "train.input0.bpe" \
    --validpref "dev.input0.bpe" \
    --destdir "Tweet-bin/input0" \
    --srcdict dict.txt \
    --workers 60

!fairseq-preprocess \
    --only-source \
    --trainpref "train.small.label" \
    --validpref "dev.label" \
    --destdir "Tweet-bin/label" \
    --workers 60

In [None]:
# Training with Linformer using the Twitter data
!fairseq-train "Tweet-bin/" \
    --user-dir /content/fairseq/examples/linformer/linformer_src \
    --max-positions 512 \
    --batch-size 16 \
    --max-tokens 4400 \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch linformer_roberta_base \
    --criterion sentence_prediction \
    --classification-head-name 'sentiment_head' \
    --num-classes 2 \
    --dropout 0.1 --attention-dropout 0.1 \
    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 1e-05 --total-num-update 7812 --warmup-updates 469 \
    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
    --max-epoch 10 \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --shorten-method "truncate" \
    --find-unused-parameters \
    --update-freq 4

2024-05-03 16:51:46.859664: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 16:51:46.859720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 16:51:46.861580: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-03 16:51:49 | INFO | numexpr.utils | Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-05-03 16:51:49 | INFO | numexpr.utils | NumExpr defaulting to 8 threads.
2024-05-03 16:51:52 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_fo

## Amazon Dataset


In [None]:
!pip install datasets

from datasets import load_dataset

dataset = load_dataset("amazon_polarity")
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
# Print the summary of the whole dataset, then of the train split alone.
for summary in (dataset, train_data):
    print(summary)

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})
Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 3600000
})


In [None]:
# Preview five reviews taken from a fixed-seed shuffle of the train split.
preview_rows = train_data.shuffle(seed=42).select(range(5))
for example in preview_rows:
    print(f"Label: {example['label']}, Review Title: {example['title']}, Review Content: {example['content']}")

Label: 0, Review Title: Anyone who likes this better than the Pekinpah is a moron., Review Content: All the pretty people in this film. Even the Rudy character played by Michael Madsen. This is adapted from a Jim Thompson novel for cryin' out loud! These are supposed to be marginal characters, not fashion models. Though McQueen and McGraw were attractive (but check out McQueen's crummy prison haircut) they were believable in the role. Baldwin and Bassinger seem like movie stars trying to act like hard cases. Action wise, the robbery scene in the Pekinpah version was about 100 times more exciting and suspenseful than anything in this re-make.
Label: 0, Review Title: Author seems mentally unstable, Review Content: I know that Tom Robbins has a loyal following and I started the book with high expectations. However, I did not enjoy this book as it was too much work to follow his confused logic. I think that he was under the influence during most of time that he wrote.
Label: 1, Review Titl

In [None]:
import pandas as pd

# Convert to pandas DataFrame
# Dataset.to_pandas() is the datasets library's own conversion; passing the
# Dataset to pd.DataFrame() instead builds the frame by iterating over every
# row, which is slow for the 3.6M-row train split.
df_train = train_data.to_pandas()
df_test = test_data.to_pandas()

In [None]:
print(df_test.head())

# Recode label 4 as 1 in both frames.
# NOTE(review): this mirrors the Twitter section; the labels printed for this
# dataset already look like 0/1, so the mapping may be a no-op — confirm.
for frame in (df_train, df_test):
    frame['label'] = frame['label'].replace({4: 1})

print(df_train.head())

# Shuffle each frame with a fixed seed and keep its first 25000 rows.
df_train = df_train.sample(frac=1, random_state=42).head(25000)
df_test = df_test.sample(frac=1, random_state=42).head(25000)

print(df_test.head())

   label                                              title  \
0      1                                           Great CD   
1      1  One of the best game music soundtracks - for a...   
2      0                   Batteries died within a year ...   
3      1              works fine, but Maha Energy is better   
4      1                       Great for the non-audiophile   

                                             content  
0  My lovely Pat has one of the GREAT voices of h...  
1  Despite the fact that I have only played a sma...  
2  I bought this charger in Jul 2003 and it worke...  
3  Check out Maha Energy's website. Their Powerex...  
4  Reviewed quite a bit of the combo players and ...  
   label                                              title  \
0      1                     Stuning even for the non-gamer   
1      1              The best soundtrack ever to anything.   
2      1                                           Amazing!   
3      1                               

In [None]:
# Write review titles and labels as line-aligned text files (the test frame
# becomes the dev split). The downstream tools read one raw sample per line.
# Series.to_csv() would wrap any title containing a comma or a quote in
# double quotes and could spread a title with an embedded newline over several
# lines, so the lines are written directly instead. Line breaks inside a value
# are replaced by spaces and missing values become empty lines.
for values, path in [
    (df_train['title'], 'train.input0'),
    (df_train['label'], 'train.label'),
    (df_test['title'], 'dev.input0'),
    (df_test['label'], 'dev.label'),
]:
    with open(path, 'w', encoding='utf-8') as fout:
        for value in values:
            text = '' if pd.isna(value) else str(value)
            fout.write(text.replace('\r', ' ').replace('\n', ' ') + '\n')

# Report the line count of every file; text and label counts must match.
for caption, path in [
    ("Training texts:", 'train.input0'),
    ("Training labels:", 'train.label'),
    ("Validation texts:", 'dev.input0'),
    ("Validation labels:", 'dev.label'),
]:
    with open(path, encoding='utf-8') as fin:
        print(caption, sum(1 for _ in fin))

Training texts: 25000
Training labels: 25000
Validation texts: 25000
Validation labels: 25000


In [None]:
# BPE encoding of the data
# Remove binarised output from an earlier run of this cell first.
# Note: train.input0 / dev.input0 / *.label are the same file names the
# Twitter section writes, so this section overwrites those files.
!rm -rf Amazon-bin/input0 Amazon-bin/label

# Train and dev are encoded in separate runs, one input file per run.
!python -m examples.roberta.multiprocessing_bpe_encoder \
    --encoder-json encoder.json \
    --vocab-bpe vocab.bpe \
    --inputs "train.input0" \
    --outputs "train.input0.bpe" \
    --workers 60 \
    --keep-empty

!python -m examples.roberta.multiprocessing_bpe_encoder \
    --encoder-json encoder.json \
    --vocab-bpe vocab.bpe \
    --inputs "dev.input0" \
    --outputs "dev.input0.bpe" \
    --workers 60 \
    --keep-empty

# Preprocess the data for fairseq
# First run: the BPE-encoded titles with dict.txt; output in Amazon-bin/input0.
!fairseq-preprocess \
    --only-source \
    --trainpref "train.input0.bpe" \
    --validpref "dev.input0.bpe" \
    --destdir "Amazon-bin/input0" \
    --srcdict dict.txt \
    --workers 60

# Second run: the label files (no --srcdict); output in Amazon-bin/label.
!fairseq-preprocess \
    --only-source \
    --trainpref "train.label" \
    --validpref "dev.label" \
    --destdir "Amazon-bin/label" \
    --workers 60

In [None]:
# Fetch dict.txt (wget -N skips the download when the local copy is current).
# NOTE(review): the preprocessing cell above already passes `--srcdict dict.txt`,
# so on a fresh run this download has to happen before that cell.
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

--2024-05-04 17:15:43--  https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.15, 13.226.210.111, 13.226.210.25, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.15|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘dict.txt’ not modified on server. Omitting download.



In [None]:
!fairseq-train "Amazon-bin/" \
    --user-dir /content/fairseq/examples/linformer/linformer_src \
    --max-positions 512 \
    --batch-size 16 \
    --max-tokens 4400 \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch linformer_roberta_base \
    --criterion sentence_prediction \
    --classification-head-name 'Amazon_head' \
    --num-classes 2 \
    --dropout 0.1 --attention-dropout 0.1 \
    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 1e-05 --total-num-update 7812 --warmup-updates 469 \
    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
    --max-epoch 10 \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --shorten-method "truncate" \
    --find-unused-parameters \
    --update-freq 4 \
    --compressed 4

2024-05-04 17:31:33.977801: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-04 17:31:34.028522: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 17:31:34.028575: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 17:31:34.029851: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-04 17:31:34.037090: I tensorflow/core/platform/cpu_feature_guar

In [None]:
import torch
import pickle

# Load the best checkpoint from the .pt file (presumably the one written by
# the fairseq-train cell above — confirm the path).
# map_location='cpu' maps every stored tensor to the CPU, so the cell also
# runs on a runtime without the GPU the checkpoint was saved from.
loaded_model = torch.load('checkpoints/checkpoint_best.pt', map_location='cpu')

# Re-serialise the loaded object with pickle.
with open('checkpoint_best.pkl', 'wb') as f:
    pickle.dump(loaded_model, f)

# SECURITY: pickle.load() can execute arbitrary code contained in the file.
# Only load pickle files that were produced locally, as this one is.
with open('checkpoint_best.pkl', 'rb') as f:
    loaded_model_from_pickle = pickle.load(f)