In [None]:
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as sw
from datasets import load_dataset
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models import FastText
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Fetch the NLTK resources used by `tokenize` below: the 'punkt' models back
# `word_tokenize`, and the 'stopwords' corpus is read through `sw.words()`.
nltk.download('punkt')
nltk.download('stopwords')

  "class": algorithms.Blowfish,
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phykawing\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phykawing\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Define tokenizer

In [None]:
def tokenize(text):
    """Turn raw text into lowercase word tokens without punctuation or stopwords.

    Processing order:
      1. Remove every character that is neither a word character nor whitespace.
      2. Split the cleaned text with NLTK's ``word_tokenize``.
      3. Lowercase every token.
      4. Drop tokens that appear in NLTK's stopword corpus.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Lowercased tokens in their original order (duplicates are kept).
    """
    # Strip punctuation so e.g. "rating," and "rating" become the same token.
    cleaned = re.sub(r'[^\w\s]', '', text)

    # Lowercase BEFORE the stopword filter: filtering first let capitalised
    # stopwords such as "The" or "Is" slip through, because the comparison
    # against the stopword list is case-sensitive.
    tokens = [tok.lower() for tok in word_tokenize(cleaned)]

    # Build the stopword set once and reuse it across calls; this function is
    # invoked once per dataset row, and a set gives O(1) membership tests.
    # `sw.words()` is called without a language, exactly as before, so the set
    # is not restricted to English.
    stop_words = getattr(tokenize, "_stop_words", None)
    if stop_words is None:
        stop_words = tokenize._stop_words = frozenset(sw.words())

    return [tok for tok in tokens if tok not in stop_words]


In [None]:
# Hand-written prompt / response pair used below to sanity-check the
# similarity score on a single example.
question = "Generate an approximately fifteen word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House price Range moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One"

answer = "Midsummer House is a moderately priced Chinese restaurant with a 3/5 customer rating, located near All Bar One."

In [None]:
# Tokenize both texts and keep only the distinct tokens of each, so repeated
# words are compared once.
tokenized_q = set(tokenize(question))

tokenized_ans = set(tokenize(answer))

### Use FastText to handle out-of-vocabulary words and misspellings

For information on the pretrained FastText models, see:
https://fasttext.cc/docs/en/crawl-vectors.html

Download the pretrained model and extract it into the same folder as this notebook:
https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

In [None]:
# `FastText.load_fasttext_format` is deprecated (it emits a DeprecationWarning
# in gensim 3.8 and is gone in gensim 4.x). `load_facebook_model` is the
# supported loader for Facebook's native .bin files and returns a FastText
# model whose `.wv` is used by the similarity code below.
from gensim.models.fasttext import load_facebook_model

model = load_facebook_model('cc.en.300.bin')

  model = FastText.load_fasttext_format('cc.en.300.bin')


In [None]:
def similarity_score(tokenized_q, tokenized_ans, wv=None):
    """Score how closely the answer tokens match the question tokens.

    Every (question token, answer token) pair is scored with
    ``wv.similarity``. Pairs scoring below ``max - 3 * std`` of all pair
    scores are discarded, and the mean of the remaining scores is returned.

    Parameters
    ----------
    tokenized_q : iterable of str
        Question tokens.
    tokenized_ans : iterable of str
        Answer tokens.
    wv : object with a ``similarity(a, b)`` method, optional
        Word vectors used for scoring. Defaults to the notebook-level
        ``model.wv``.

    Returns
    -------
    float
        Mean of the retained pair similarities, or ``0.0`` when either token
        collection is empty (there is nothing to compare).
    """
    if wv is None:
        wv = model.wv

    pair_scores = [
        wv.similarity(q, ans)
        for ans in tokenized_ans
        for q in tokenized_q
    ]

    # No pairs at all: np.max would raise on an empty sequence.
    if not pair_scores:
        return 0.0

    max_val = np.max(pair_scores)
    std_dev_val = np.std(pair_scores)

    # Keep scores within three standard deviations of the best score.
    # The comparison is inclusive: with a single pair (or identical scores)
    # the std is 0, and a strict ">" would drop every score and yield NaN.
    threshold = max_val - 3 * std_dev_val
    retained = [s for s in pair_scores if s >= threshold]

    return np.mean(retained)

In [None]:
similarity_score(tokenized_q, tokenized_ans)

0.7064993

### Example: the OpenOrca dataset

In [None]:
# Load the "train" split of the Open-Orca/OpenOrca dataset; each row exposes
# the 'question' and 'response' fields read by the cells below.
train_dataset = load_dataset("Open-Orca/OpenOrca", split="train")

In [None]:
len(train_dataset)

4233923

In [None]:
train_dataset[2888]['question']

'For this chain-of-thought reasoning and answer, what was the question?\nTo play on a jungle gym does not imply it is with friends.\n A: it is not possible to tell'

In [None]:
train_dataset[2888]['response']

"Q: Can people only have fun on a jungle gym when they're with friends?"

In [None]:
# Score the dataset's own (question, response) pairs.
#
# The train split holds millions of rows, so an unbounded loop never finishes
# in an interactive session. Cap the number of scored examples; set the cap to
# None to run over the whole split.
N_REFERENCE_SAMPLES = 100

score = []

for i, data in enumerate(train_dataset):
    if N_REFERENCE_SAMPLES is not None and i >= N_REFERENCE_SAMPLES:
        break

    q = data['question']
    ans = data['response']

    tokenized_q = set(tokenize(q))
    tokenized_ans = set(tokenize(ans))

    if not tokenized_q or not tokenized_ans:
        # Nothing left to compare after tokenization: record a zero score so
        # `score` stays aligned with the dataset index.
        score.append(0)
        print(i)
        continue

    similarity = similarity_score(tokenized_q, tokenized_ans)
    score.append(similarity)

    print(i, similarity)


0 0.63455343
1 0.7215075
2 0.5001706
3 0.41853094
4 0.75507814
5 0.44518703
6 0.5705846
7 0.46203622
8 0.36956504
9 0.67863536
10 0.7184166
11 0.3569332
12 0.319293
13 0.41948196
14 0.4369383
15 0.64993525
16 0.3881169
17 0.44186518
18 0.44386274
19 0.45325783
20 0.54947627
21 0.73187596
22 0.5714581
23 0.42114848
24 0.3197536
25 0.43226904
26 0.51541793
27 0.527988
28 0.6153087
29 0.34466535
30 0.34241438
31 0.16293049
32 0.44855025
33 0.37260857
34 0.32334316
35 0.5296607
36 0.723636
37 0.39401466
38 0.41276968
39 0.4620061
40 0.3958286
41 0.4276206
42 0.54934555
43 0.48174268
44 0.39043102
45 0.47597933
46 0.48277044
47 0.3851124
48 0.44256866
49 0.6258595
50 0.40524024
51 0.4786667
52 0.25549248
53 0.4695482
54 0.37520918
55 0.39859876
56 0.371849
57 0.6719513
58 0.5659782
59 0.40509832
60 0.3691469
61 0.4300068
62 0.43142635
63 0.39136615
64 0.54346895
65 0.3335378
66 0.40410262
67 0.38244513
68 0.33467835
69 0.585711
70 0.35310003
71 0.43522033
72 0.37569007
73 0.37667918
74 0.50

KeyboardInterrupt: 

### Flan-T5-Large

In [None]:
# Identifier of the seq2seq checkpoint whose answers will be scored.
model_name = "google/flan-t5-large"

# Load the matching tokenizer and model. The model is bound to `llm` so it
# does not overwrite `model`, which holds the FastText embeddings.
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Score answers generated by the LLM against their questions.
#
# Generation is slow and the train split holds millions of rows, so cap the
# number of evaluated examples; set the cap to None to run the whole split.
N_LLM_SAMPLES = 100

score = []
llm_ans = []

for i, data in enumerate(train_dataset):
    if N_LLM_SAMPLES is not None and i >= N_LLM_SAMPLES:
        break

    q = data['question']
    tokenized_q = set(tokenize(q))

    # Truncate to the tokenizer's maximum length; some prompts are longer
    # than the model's configured maximum sequence length.
    inputs = tokenizer.encode(q, return_tensors="pt", truncation=True)

    # Generate output
    outputs = llm.generate(inputs)

    # Drop special tokens such as "<pad>" and "</s>": once punctuation is
    # stripped they would become bogus tokens ("pad", a trailing "s") and
    # distort the similarity score.
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    tokenized_ans = set(tokenize(output_text))

    # Record the answer before any early exit so that llm_ans[i], score[i]
    # and train_dataset[i] always refer to the same example.
    llm_ans.append(output_text)

    if not tokenized_q or not tokenized_ans:
        score.append(0)
        print(i)
        continue

    similarity = similarity_score(tokenized_q, tokenized_ans)
    score.append(similarity)

    print(i, similarity)




0 0.43789995
1 0.55974334
2 0.27267492
3 0.320248
4 0.83472466
5 0.62109196
6 0.5776144


Token indices sequence length is longer than the specified maximum sequence length for this model (1030 > 512). Running this sequence through the model will result in indexing errors


7 0.42615637
8 0.35319573
9 0.2925676
10 0.19607279
11 0.33860868
12 0.33148032
13 0.6159
14 0.3829325
15 0.21657857
16 0.24361515
17 0.53316396
18 0.43947634
19 0.17991035
20 0.35122898
21 0.879655
22 0.34119412
23 0.17510363
24 0.31755152
25 0.37599966
26 0.7518408
27 0.640014
28 0.17072597
29 0.36289206
30 0.3890201
31 0.24465513
32 0.3944393
33 0.436052
34 0.21475467
35 0.27051073
36 0.22590354
37 0.27801812
38 0.5587281
39 0.39656395
40 0.17036775
41 0.25370505
42 0.37617782
43 0.20242378
44 0.27483928
45 0.7958326
46 0.21875982
47 0.15971427
48 0.46482572
49 0.36202264
50 0.34886745
51 0.5254971
52 0.16075017
53 0.2148688
54 0.21200311
55 0.1978098
56 0.19609846
57 0.39134932
58 0.67553824
59 0.36043847
60 0.9334779
61 0.546513
62 0.42008233
63 0.17582256
64 0.30723855
65 0.65156716
66 0.18909311
67 0.17952955
68 0.3659422
69 0.7199086
70 0.37540373
71 0.5155116
72 0.22984813
73 0.19197664
74 1.0
75 0.43008605
76 0.32758424
77 0.28310588
78 0.27705634
79 0.8865869
80 0.38745853
8

KeyboardInterrupt: 

In [None]:
train_dataset[10]['question']

'Q: Answer the following question given this paragraph:   The kidneys also secrete hormones that help maintain homeostasis. For example, they produce a hormone that stimulates bone marrow to produce red blood cells when more are needed. They also secrete a hormone that regulates blood pressure and keeps it in a normal range.   Q: What organs secrete hormones that help maintain homeostasis?   A:\nThe answer is:'

In [None]:
train_dataset[10]['response']

'The kidneys are the organs that secrete hormones to help maintain homeostasis. They produce a hormone that stimulates bone marrow to produce red blood cells when needed, and they also secrete a hormone that regulates blood pressure, keeping it within a normal range.'

In [None]:
llm_ans[10]

'<pad> kidneys</s>'