In [None]:
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a CSS rule that makes <pre> output blocks wrap long lines.

  Registered right below as a ``pre_run_cell`` callback, so the style is
  emitted again before every cell execution.

  Args:
    *_event_args: Ignored. IPython documents ``pre_run_cell`` callbacks as
      receiving an ``info`` argument; accepting (and discarding) it keeps
      the callback valid whether or not the running IPython passes one.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data file read later lives under that mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the working directory before and after switching to the assignment folder on Drive.
!pwd
import os
# Relative paths used later (e.g. "./MyIMDBModel_2" and "./logs") resolve against this folder.
os.chdir('/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3')
!pwd

/content
/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3


In [None]:
import json

# Load the whole corpus into memory. The `with` block closes the handle even if
# json.load() raises, and the explicit encoding removes the dependency on the
# runtime's locale default.
with open('/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3/all_data_IMDB.json', encoding='utf-8') as f:
    all_data = json.load(f)

In [None]:
import random
# Shuffle in place with a dedicated, seeded generator so the order is the same
# on every Restart & Run All (and the global `random` state is left untouched).
random.Random(42).shuffle(all_data)

In [None]:
# Partition the corpus by label: entry[0] is the review text, entry[1] the label.
# Label 1 goes to the positive bucket; every other label goes to the negative one.
positive_samples = [entry[0] for entry in all_data if entry[1] == 1]
negative_samples = [entry[0] for entry in all_data if entry[1] != 1]

# Class sizes, reported as a quick balance check.
count1 = len(positive_samples)
count2 = len(negative_samples)

print("Positive reviews:", count1)
print("Negative reviews:", count2)

Positive reviews: 25000
Negative reviews: 25000


In [None]:
from torch import cuda
# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

In [None]:
!pip install transformers
!pip install torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model_path = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path,num_labels=2)

In [None]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Run mode and output location (relative to the current working directory).
    output_dir="./MyIMDBModel_2",
    do_train=True,
    do_eval=True,
    # Optimisation schedule.
    num_train_epochs=5,
    warmup_steps=100,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Mixed precision only when a GPU is present.
    fp16=cuda.is_available(),
    # Training-loss logging.
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=200,
    # Evaluation and checkpointing share one 1250-step cadence, so every
    # evaluated step also has a saved checkpoint to reload at the end.
    evaluation_strategy='steps',
    eval_steps=1250,
    save_strategy='steps',
    save_steps=1250,
    load_best_model_at_end=True)

In [None]:
import numpy as np
import pandas as pd

# Positive class: review texts paired with a constant label of 1.
pos_texts = np.array(positive_samples)
pos_labels = np.array([1] * len(positive_samples))
pos_dataset = pd.DataFrame({'review': pos_texts, 'label': pos_labels})

# Negative class: review texts paired with a constant label of 0.
neg_texts = np.array(negative_samples)
neg_labels = np.array([0] * len(negative_samples))
neg_dataset = pd.DataFrame({'review': neg_texts, 'label': neg_labels})

In [None]:
# Per-class split, step 1: draw 80% of each class for training and set the
# remaining 20% aside. The fixed random_state makes the split reproducible
# across kernel restarts.
pos_train = pos_dataset.sample(frac=0.8, random_state=42)
neg_train = neg_dataset.sample(frac=0.8, random_state=42)
pos_part_20 = pos_dataset.drop(pos_train.index)
neg_part_20 = neg_dataset.drop(neg_train.index)

In [None]:
# Per-class split, step 2: halve each held-out 20% into test and validation.
# The fixed random_state keeps test/validation membership stable between runs.
pos_test = pos_part_20.sample(frac=0.5, random_state=42)
neg_test = neg_part_20.sample(frac=0.5, random_state=42)
pos_val = pos_part_20.drop(pos_test.index)
neg_val = neg_part_20.drop(neg_test.index)

In [None]:
# Stack the per-class pieces row-wise into one frame per split (positives first).
train_set = pd.concat([pos_train, neg_train])
test_set = pd.concat([pos_test, neg_test])
val_set = pd.concat([pos_val, neg_val])

# All splits together, used for corpus-wide length statistics.
dataset = pd.concat([train_set, test_set, val_set])

In [None]:
# Renumber rows 0..n-1 after concatenation. drop=True discards the old index
# instead of inserting it as an extra column, which also makes this cell safe
# to re-run.
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)
val_set = val_set.reset_index(drop=True)
dataset = dataset.reset_index(drop=True)

In [None]:
# Review lengths in characters (len of each string), measured before normalization.
raw_lengths = [len(review) for review in dataset['review']]

print('Raw data: ')
print('max length =', np.max(raw_lengths))
print('mean length =', np.mean(raw_lengths))

Raw data: 
max length = 13704
mean length = 1309.43102


In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used by the normalization step, in a fixed order.
for resource in ('stopwords', 'punkt', 'omw-1.4', 'wordnet'):
    nltk.download(resource)

# Shared by normalize_text: English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def normalize_text(text):
    """Lower-case a review, strip markup and punctuation, drop stop words, lemmatize.

    Args:
        text: Raw review string.

    Returns:
        A single space-joined string of lemmatized, purely alphabetic tokens
        that are not in ``stop_words``.
    """
    text = text.lower()
    # Remove HTML line breaks before the punctuation pass; otherwise "<br />"
    # degrades to a stray "br" token that survives every later filter.
    # NOTE(review): assumes the raw reviews can contain <br> markup, as the
    # public IMDB dump does - confirm against all_data_IMDB.json.
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Every character that is not an ASCII letter or digit becomes a separator.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    words = word_tokenize(text)
    # isalpha() discards tokens that contain digits; stop words are dropped too.
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    )

# Replace the review column of every split with its normalized text.
for split_frame in (train_set, val_set, test_set):
    split_frame['review'] = split_frame['review'].apply(normalize_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Re-assemble the normalized splits and measure review lengths in characters again.
dataset_v2 = pd.concat([train_set, test_set, val_set])
normalized_lengths = [len(review) for review in dataset_v2['review']]

print('After normalization: ')
print('max length =', np.max(normalized_lengths))
print('mean length =', np.mean(normalized_lengths))

After normalization: 
max length = 9164
mean length = 812.165


In [None]:
!pip install --upgrade datasets



In [None]:
!pip install 'pyarrow>=3.0.0,<10.0dev'

Collecting pyarrow<10.0dev,>=3.0.0
  Downloading pyarrow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 10.0.1
    Uninstalling pyarrow-10.0.1:
      Successfully uninstalled pyarrow-10.0.1
Successfully installed pyarrow-9.0.0


In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

In [None]:
# Arrow schema shared by all splits: review text plus an integer label.
schema = pa.schema([
    pa.field('review', pa.string()),
    pa.field('label', pa.int64())
])


def _frame_to_dataset(frame):
    """Convert a pandas frame to an Arrow table with `schema` and wrap it in a Dataset."""
    arrow_table = pa.Table.from_pandas(frame, schema=schema)
    return Dataset(arrow_table)


# One Dataset per split, all built the same way.
train_table = _frame_to_dataset(train_set)
test_table = _frame_to_dataset(test_set)
val_table = _frame_to_dataset(val_set)

In [None]:
def tokenize_batch(batch):
    """Tokenize a batch of reviews, truncating or padding each one to 500 tokens."""
    return tokenizer(batch['review'], max_length=500, padding='max_length', truncation=True)


# Encode every split with the same tokenizer settings, 1000 rows per call.
enc_train = train_table.map(tokenize_batch, batched=True, batch_size=1000)
enc_test = test_table.map(tokenize_batch, batched=True, batch_size=1000)
enc_val = val_table.map(tokenize_batch, batched=True, batch_size=1000)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# Preview a few encoded rows. Slicing first avoids converting every row
# (each carrying 500-element token lists) into Python objects just for display.
pd.DataFrame(enc_train[:5])

Unnamed: 0,review,label,input_ids,attention_mask
0,truly enjoyed film rare find star pull physica...,1,"[101, 5621, 5632, 2143, 4678, 2424, 2732, 4139...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,starting name joe like scene alcoholic anonymo...,1,"[101, 3225, 2171, 3533, 2066, 3496, 14813, 108...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,truly classic movie story acting film presenta...,1,"[101, 5621, 4438, 3185, 2466, 3772, 2143, 8312...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,jennifer connelly fan since phenomenon heard s...,1,"[101, 7673, 17199, 2100, 5470, 2144, 9575, 265...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,absolutely loved movie since want give much aw...,1,"[101, 7078, 3866, 3185, 2144, 2215, 2507, 2172...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...
39995,flying war movie buff rank bottom list histori...,0,"[101, 3909, 2162, 3185, 23176, 4635, 3953, 286...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
39996,literally fell asleep time watching movie gran...,0,"[101, 6719, 3062, 6680, 2051, 3666, 3185, 4379...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
39997,yes admire independent spirit like road trip b...,0,"[101, 2748, 19837, 2981, 4382, 2066, 2346, 444...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
39998,candidate single disappointing movie experienc...,0,"[101, 4018, 2309, 15640, 3185, 3325, 6480, 465...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# Token counts per encoded training row (every row is padded/truncated to the same size).
token_lengths = [len(ids) for ids in enc_train['input_ids']]

print('max length =', np.max(token_lengths))
print('mean length =', np.mean(token_lengths))

max length = 500
mean length = 500.0


In [None]:
!pip install torch



In [None]:
!pip show accelerate

Name: accelerate
Version: 0.26.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'Accuracy'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'Accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Wire the model, its training configuration and the encoded splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,  # reports 'Accuracy' at each evaluation
    train_dataset=enc_train,
    eval_dataset=enc_val,  # validation split; the test split is only scored after training
)

In [None]:
import os
# Presumably meant to turn off Weights & Biases logging for the training run.
# NOTE(review): this is set after TrainingArguments and Trainer were already
# constructed; the variable may need to be set before they are created to have
# any effect - confirm.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Run fine-tuning; logging, evaluation and checkpoint cadence come from training_args.
results = trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1250,0.2528,0.268849,0.8832
2500,0.1656,0.257901,0.9132
3750,0.0739,0.316482,0.9116
5000,0.0252,0.448903,0.911
6250,0.0135,0.51581,0.9106


In [None]:
# Score the trained model on each split and tabulate the first five reported metrics.
eval_splits = {"train": enc_train, "val": enc_val, "test": enc_test}
q = [trainer.evaluate(eval_dataset=split) for split in eval_splits.values()]
pd.DataFrame(q, index=list(eval_splits)).iloc[:, :5]

Unnamed: 0,eval_loss,eval_Accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second
train,0.058488,0.983675,254.5143,157.162,4.911
val,0.257901,0.9132,31.8606,156.934,4.928
test,0.274953,0.906,31.7,157.729,4.953
