# Model Evaluation:


## Installation:

In [None]:
!pip install datasets 
!pip install transformers
!pip install evaluate
!pip install bert_score
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Setup

In [None]:
# Mount Google Drive at /content/gdrive so the project files stored there
# can be read by the cells below (Colab-only step).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Make the project folder on the mounted drive the working directory;
# later cells capture it with os.getcwd() and open files relative to it.
%cd /content/gdrive/My Drive/nlp

/content/gdrive/My Drive/nlp


In [None]:
# Sanity check: list the contents of the current working directory.
!ls

ai-story_gen  dls.pkl  story_distilgpt2_finetune  story_gpt2_finetune
data	      models   story_gpt2		  tf_dataset.pkl


In [None]:
import os

# Absolute path of the current working directory; used below as the root
# for the data files and saved model checkpoints.
DIR_PATH = os.getcwd()

In [None]:
import pickle
from fastai.text.all import *
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2LMHeadModel
from pprint import pprint
from evaluate import load

### Data

In [None]:
# ---------------------------------------------------------------------------
# Notebook parameters
# ---------------------------------------------------------------------------

# Sample sizes
TRAIN_ROWS = 50000
TEST_ROWS = 5000

# Locations of the CSV files, rooted at DIR_PATH
TRAIN_PATH = DIR_PATH + "/data/train_df.csv"
VALID_PATH = DIR_PATH + "/data/valid_df.csv"
TEST_PATH = DIR_PATH + "/data/test_df.csv"

# Data processing
CONTEXT_LEN = 256

# Hyperparameters
TRAIN_BS = 64
TEST_BS = 64
EPOCHS = 5

In [None]:
# One-off step, kept for reference only: serialise the `dls` object to
# dls.pkl so that a later session can reload it with pickle.
# with open('dls.pkl', 'wb') as f:
#     pickle.dump(dls, f)

In [None]:
# Restore the pickled `dls` object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a dls.pkl that was produced by this project.
with open('dls.pkl', mode='rb') as dls_file:
    dls = pickle.load(dls_file)

### Model loading

In [None]:
# loading a pretrained tokenizer for the gpt2 model 
# Pretrained tokenizer for the "gpt2" checkpoint; passed to the pipelines
# built on model4 and model5.
tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Pretrained tokenizer for the "distilgpt2" checkpoint; passed to the
# pipeline built on model3.
tokenizer_dgpt = AutoTokenizer.from_pretrained("distilgpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def _load_awd_lstm(dls, weights_path, drop_mult=0.3):
    """Build an AWD-LSTM language-model learner and load saved weights into it.

    Args:
        dls: data loaders object handed to `language_model_learner`.
        weights_path: checkpoint path passed to `Learner.load`
            (given without a file extension, as in the calls below).
        drop_mult: dropout multiplier forwarded to `language_model_learner`.

    Returns:
        The fp16 learner, tracking accuracy, perplexity and corpus BLEU,
        with the checkpoint loaded.
    """
    learner = language_model_learner(
        dls, AWD_LSTM, drop_mult=drop_mult,
        metrics=[accuracy, Perplexity(), CorpusBLEUMetric()],
    ).to_fp16()
    learner.load(weights_path)
    return learner


# model 1
model1 = _load_awd_lstm(dls, DIR_PATH+'/models/story_awd_lstm')

# model 2
model2 = _load_awd_lstm(dls, DIR_PATH+'/models/story_awd_lstm_finetune')


<fastai.text.learner.LMLearner at 0x7fb547697850>

In [None]:
# Load the three causal language models from their checkpoint folders under
# DIR_PATH, in order:
#   model3 <- story_distilgpt2_finetune
#   model4 <- story_gpt2
#   model5 <- story_gpt2_finetune
model3, model4, model5 = (
    AutoModelForCausalLM.from_pretrained(DIR_PATH + checkpoint_dir)
    for checkpoint_dir in (
        "/story_distilgpt2_finetune",
        "/story_gpt2",
        "/story_gpt2_finetune",
    )
)

Creating pipelines for story generation for each of the transformer models.

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Text-generation pipeline for model3, paired with the distilgpt2 tokenizer.
pipe_ken = pipeline(
    "text-generation",
    model=model3,
    tokenizer=tokenizer_dgpt,
    device=device,
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Text-generation pipeline for model4, paired with the gpt2 tokenizer.
pipe_ruby = pipeline(
    "text-generation",
    model=model4,
    tokenizer=tokenizer_gpt,
    device=device,
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Text-generation pipeline for model5, paired with the gpt2 tokenizer.
pipe_ed = pipeline(
    "text-generation",
    model=model5,
    tokenizer=tokenizer_gpt,
    device=device,
)

## Automated metrics:

In [None]:
import datasets
import evaluate

# `datasets.load_metric` is deprecated and no longer exists in recent
# `datasets` releases; the `evaluate` package (installed above) is its
# replacement and exposes the same `.compute(predictions=..., references=...)`
# interface on the returned object.
metric = evaluate.load('sacrebleu')

__Model 1: AWD LSTM from scratch__

BLEU score

In [None]:
# validate() returns a list of values; [-1] picks the last one, which lines up
# with the last metric registered on the learner (CorpusBLEUMetric).
model1.validate()[-1]

0.20163284271028137

__Model 2: AWD LSTM finetuned__

BLEU score

In [None]:
# validate() returns a list of values; [-1] picks the last one, which lines up
# with the last metric registered on the learner (CorpusBLEUMetric).
model2.validate()[-1]

(#4) [3.247490882873535,0.3903941512107849,25.725709915161133,0.20200711890218223]

## Human evaluation:

In [None]:
# Enter the prompt between the triple quotes.
# .strip() drops the newline that the triple-quoted layout adds before and
# after the text, so every model receives exactly the sentence itself.
txt = """
If all's not well in the end, then it's not the end my friend.
""".strip()

### Story generation

Model 1: AWD-LSTM from scratch

In [None]:
# Length requested from predict() and number of samples to draw.
N_WORDS = 100
N_SENTENCES = 1

# One model1 continuation of the prompt per requested sample.
mike = [
    model1.predict(txt, N_WORDS, no_unk=True, temperature=0.75)
    for _sample in range(N_SENTENCES)
]

Model 2: AWD-LSTM fine tuned

In [None]:
# Length requested from predict() and number of samples to draw.
N_WORDS = 100
N_SENTENCES = 1

# One model2 continuation of the prompt per requested sample.
charlie = [
    model2.predict(txt, N_WORDS, no_unk=True, temperature=0.75)
    for _sample in range(N_SENTENCES)
]

Model 3: DistilGPT-2 fine tuned

In [None]:
# Generate one continuation of the prompt with the model3 pipeline.
# do_sample=True is passed explicitly so that decoding matches the settings
# used for `ruby` and `ed`, instead of depending on whatever default the
# saved model configuration happens to carry.
ken = pipe_ken(
    txt,
    num_return_sequences=1,
    do_sample=True,
    max_new_tokens=100,
    # The tokenizer's EOS id is reused as the padding id.
    pad_token_id=tokenizer_dgpt.eos_token_id,
)[0]["generated_text"]

Model 4: GPT-2 from scratch

In [None]:
# Sample one continuation of the prompt with the model4 pipeline
# (beam search via num_beams=3 was tried and is left disabled).
ruby = pipe_ruby(
    txt,
    num_return_sequences=1,
    do_sample=True,
    max_new_tokens=100,
    pad_token_id=tokenizer_gpt.eos_token_id,
)[0]["generated_text"]

Model 5: GPT-2 fine tuned 

In [None]:
# Sample one continuation of the prompt with the model5 pipeline
# (beam search via num_beams=3 was tried and is left disabled).
ed = pipe_ed(
    txt,
    num_return_sequences=1,
    do_sample=True,
    max_new_tokens=100,
    pad_token_id=tokenizer_gpt.eos_token_id,
)[0]["generated_text"]

### Model Mike: 

In [None]:
# `mike` holds one generated string per sample; join them with newlines and
# pretty-print the result.
pprint('\n'.join(mike))

("If all 's not well in the end , then it 's not the end my friend . You think "
 "i 'll end up with this shit , if you want to believe it , that could place "
 "you in a reality where you 're worthless and fucking waste . < newline > < "
 "newline > i do n't even bother to ask you , the man who 's left me in the "
 "post office was a useful little guy , and i 'll probably never be able to do "
 "anything about that . But i ' m not sure how long it may be , and i ' ve "
 'been doing it for months now')


### Model Charlie:

In [None]:
# `charlie` holds one generated string per sample; join them with newlines and
# pretty-print the result.
pprint('\n'.join(charlie))

("If all 's not well in the end , then it 's not the end my friend . i ' m a "
 "little depressed , but i ' m not a good writer . i ' ve been through this , "
 "really . i ' m a writer , and i ' m a writer . Well , my writing has really "
 "been so good . The reader is n't sure i do n't want to go to the library to "
 'find the writing . < newline > < newline > The main character is sitting in '
 'a corner with a book , a book , a book and a mind . The books')


### Model Ken:

In [None]:
# Pretty-print the text generated by the model3 (story_distilgpt2_finetune) pipeline.
pprint(ken)

('\n'
 "If all's not well in the end, then it's not the end my friend.\n"
 " <newline> <newline> It's alright, I wish I had just listened for the first "
 'words of the day. <newline> The day before Christmas. <newline> A few hours '
 'ago I was sitting in the kitchen talking to my old friend, my old friend. '
 '<newline> “ Hello? ” I call him, who just turned out of the room, as I was '
 'going to be his head when I saw him. <newline> �')


### Model Ruby:

In [None]:
# Pretty-print the text generated by the model4 (story_gpt2) pipeline.
pprint(ruby)

('\n'
 "If all's not well in the end, then it's not the end my friend.\n"
 '.com <newline> <newline> It was a Monday night in America, and I ’ d rather '
 'not stand any further than what was outside the neighborhood. A few weeks '
 'ago, the last family arrived to keep us in line for the most part. The '
 'family was in the state for this occasion ; they were waiting for us. The '
 'group were already sitting there, waiting, and waiting and waiting for the '
 'day. I would say goodbye for them to enter the school and see if')


### Model Ed:

In [None]:
# Pretty-print the text generated by the model5 (story_gpt2_finetune) pipeline.
pprint(ed)

('\n'
 "If all's not well in the end, then it's not the end my friend.\n"
 ' <newline> <newline> I was only a small child, when I met the child. A sweet '
 "little boy. I couldn't have been any older than that. He was my first clue "
 'to what to do. He was my first step forward. I knew where to look for him. '
 'When I pulled him towards me and hugged him I was greeted with a warm smile '
 'and a smile from the boy. I was greeted with the warm smile from the boy. '
 '<newline>')
