In [None]:
#!pip install transformers

In [None]:
import pandas as pd
from transformers import pipeline

In [None]:
# load the data
df = pd.read_csv('Hotel_Reviews.csv')

# keep only the reviews that are greater than or equal to 8.5 
# or less than or equal to 6.5
df = df.loc[(df.Reviewer_Score>=8.5) | (df.Reviewer_Score<=6.5)].copy()

# create a function for the label

def review_label(x):
    if x>=8.5:
        return 1
    else:
        return 0
    

# create the label column    
df['label'] = df['Reviewer_Score'].apply(lambda x: review_label(x))


# replace the 'No Positive' and the 'No Negative' 
# from the corresponding columns with an empty string

df['Positive_Review'].replace('^No Positive$', '', inplace=True, regex=True)
df['Negative_Review'].replace('^No Negative$', '', inplace=True, regex=True)

# concatenate the Positive and Negative Review columns into 'Reviews'
df['reviews'] = df['Negative_Review'] + df['Positive_Review']

# remove the empty reviews
df = df.loc[df['reviews'].apply(lambda x:len(x.split()))>0]
df = df[['reviews', 'label']].copy()
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,reviews,label
0,I am so angry that i made this post available...,0
1,My room was dirty and I was afraid to walk ba...,0
2,Cleaner did not change our sheet and duvet ev...,0
3,Apart from the price for the brekfast Everyth...,1
4,Even though the pictures show very clean room...,0
5,Nothing all great Rooms were stunningly deco...,1
6,6 30 AM started big noise workers loading woo...,0
7,The floor in my room was filfy dirty Very bas...,0
8,This hotel is being renovated with great care...,1
9,The staff in the restaurant could of been mor...,1


In [None]:
df.shape

(361844, 2)

In [None]:
# undersample the data

df_grouped_by = df.groupby(['label'])
 
df_balanced = df_grouped_by.apply(lambda x: x.sample(df_grouped_by.size().min()).reset_index(drop=True))
 
df_balanced = df_balanced.droplevel(['label'])

# shuffle the data frame
df = df_balanced.sample(frac=1, random_state=1).reset_index(drop=True)
df.head(10) 

Unnamed: 0,reviews,label
0,Going to London Central took us an hour every...,1
1,Nothing specific Location,0
2,The room cleanliness we had to ask everyday f...,0
3,Room was to hot for babies It s all on a sens...,0
4,It s old and not a 5 star,0
5,So far everything is okay Really a great loca...,1
6,Nice place to stay,1
7,location,1
8,Noise Staff were so helpful,1
9,No free wifi even its 5 star hotel Location o...,0


In [None]:
# create the train and test datasets
train = df.sample(frac=0.75, random_state=1)
test = df.drop(train.index)

In [None]:
# import the "sentiment-analysis" classifier
from transformers import pipeline
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
classifier("I hate you")[0]['label']

'NEGATIVE'

In [None]:
predictions = []
for r in test['reviews']:
  try:
    pred = classifier(r)[0]['label']
    if pred == 'NEGATIVE':
      predictions.append(0)
    else:
      predictions.append(1)


  except:
    predictions.append(0)
    print(r)


 We really didn t like that we couldn t get picked up from the train station and that the shuttle was not 24 hours The neighbourhood was a little bit sketchy and the hotel is well hidden in between buildings and back alleys so it would have been nice to get a ride We eventually figured out the bus system but it was a pain the first couple of days We went to the hotel restaurant one of the nights and it was awful They had one waiter on which would have been fine considering there were only 3 or 4 tables to serve But he spent 15 minutes talking to the table next to us and then it took another 10 for him to come over to us to talk to us for the first time since we had been seated I ordered off of the kids menu a margarita pizza and fries which turned out to be half of a frozen store bought pizza which I figured out when I asked for less sauce on the pizza and he told me they were pre made My husband got a burger and the meat was the cheapest you could buy It was very fatty and very grey T

In [None]:
from sklearn.metrics import classification_report

print(classification_report(test['label'], predictions))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80     17107
           1       0.82      0.74      0.78     16893

    accuracy                           0.79     34000
   macro avg       0.79      0.79      0.79     34000
weighted avg       0.79      0.79      0.79     34000



#Fine Tune

In [None]:
#!pip install datasets

In [None]:
# save the train and test dataset as csv in order 
# to load them as datasets
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
import datasets
from datasets import load_dataset, load_from_disk
 
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
 
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4dee3ddc2d0998be/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4dee3ddc2d0998be/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['reviews', 'label'],
        num_rows: 101998
    })
    test: Dataset({
        features: ['reviews', 'label'],
        num_rows: 34000
    })
})

In [None]:
from transformers import AutoTokenizer
 
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
 
def tokenize_function(examples):
    return tokenizer(examples["reviews"], padding="max_length", truncation=True)
 
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/34 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForSequenceClassification
checkpoint = "distilbert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.b

In [None]:
import numpy as np
from datasets import load_metric
 
metric = load_metric("accuracy")
 
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer
 
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=1)
 
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
 
trainer.train()

***** Running training *****
  Num examples = 101998
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12750
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: reviews. If reviews are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2404,0.242356,0.917912


Saving model checkpoint to test_trainer/checkpoint-12500
Configuration saved in test_trainer/checkpoint-12500/config.json
Model weights saved in test_trainer/checkpoint-12500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 34000
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: reviews. If reviews are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=12750, training_loss=0.28154548645019534, metrics={'train_runtime': 5590.6795, 'train_samples_per_second': 18.244, 'train_steps_per_second': 2.281, 'total_flos': 1.3511409728114688e+16, 'train_loss': 0.28154548645019534, 'epoch': 1.0})

In [None]:
!zip -r /content/fine_tuned_model.zip /content/test_trainer/checkpoint-12500/

  adding: content/test_trainer/checkpoint-12500/ (stored 0%)
  adding: content/test_trainer/checkpoint-12500/pytorch_model.bin (deflated 8%)
  adding: content/test_trainer/checkpoint-12500/rng_state.pth (deflated 27%)
  adding: content/test_trainer/checkpoint-12500/training_args.bin (deflated 48%)
  adding: content/test_trainer/checkpoint-12500/scheduler.pt (deflated 49%)
  adding: content/test_trainer/checkpoint-12500/optimizer.pt (deflated 23%)
  adding: content/test_trainer/checkpoint-12500/trainer_state.json (deflated 80%)
  adding: content/test_trainer/checkpoint-12500/config.json (deflated 46%)


In [None]:
# do predictions

from transformers import pipeline
# load from previously saved model
pipe = pipeline("text-classification",
                model="/content/test_trainer/checkpoint-12500",
                tokenizer="distilbert-base-uncased")

loading configuration file /content/test_trainer/checkpoint-12500/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/test_trainer/checkpoint-12500",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "vocab_size": 28996
}

loading configuration file /content/test_trainer/checkpoint-12500/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/test_trainer/checkpoint-12500",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequ

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
creating metadata file for /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
https://huggingface.co/distilbert-base-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpqcuzj57o


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
creating metadata file for /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.0

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpa_hon8sl


Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading