In [None]:
import openai
import pandas as pd
import torch
import numpy as np
import importlib

from sentence_transformers import SentenceTransformer
from transformers import AdaptiveEmbedding, AutoModelForSeq2SeqLM, AutoTokenizer, DistilBertConfig

In [None]:
# Checkpoint name handed to sentence-transformers; every embedding in this
# notebook is produced by the encoder built from it.
model_name = 'bert-base-nli-mean-tokens'

# Shared encoder instance; later cells call `st1.encode(...)`.
st1 = SentenceTransformer(model_name_or_path=model_name)

### Preprocess
- Each prompt should end with a fixed separator to inform the model when the prompt ends and the completion begins. A simple separator which generally works well is `\n\n###\n\n`. The separator should not appear elsewhere in any prompt.

- Each completion should start with a whitespace due to our tokenization, which tokenizes most words with a preceding whitespace.

- Each completion should end with a fixed stop sequence to inform the model when the completion ends. A stop sequence could be `\n`, `###`, or any other token that does not appear in any completion.

- For inference, you should format your prompts in the same way as you did when creating the training dataset, including the same separator. Also specify the same stop sequence to properly truncate the completion.


In [None]:
import sys
import os
import importlib

# Make the parent of the current working directory importable so that the
# project-local `src` package can be found from this notebook.
path = os.path.abspath(os.path.pardir)
if path not in sys.path:
    sys.path.append(path)

import src.utils as utils

# Reload BEFORE the star import. `from ... import *` copies the module's names
# into the notebook namespace at that moment, so reloading afterwards would
# leave those names bound to the stale, pre-reload objects.
utils = importlib.reload(utils)
from src.utils import *

In [None]:
def process(df):
    """Clean a question/answer table and append sentence embeddings.

    The input is not modified. Rows containing missing values are dropped,
    text columns are normalized (surrounding whitespace removed, newlines
    replaced by spaces), exact duplicate rows are removed and the index is
    reset. Normalization happens *before* de-duplication so that rows which
    differ only in surrounding whitespace or line breaks collapse into one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``question`` and ``answer``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows plus:
        ``q_a`` (question and answer joined by a single space),
        ``prompt`` (``prep_question(question) + prep_answer(answer)``), and
        one ``embedding_<i>`` column per component of the vector returned by
        ``st1.encode`` for ``q_a``.

    Notes
    -----
    Relies on names defined outside this function: the encoder ``st1`` and
    the helpers ``prep_question`` / ``prep_answer`` (not defined in this
    notebook; presumably provided by the ``src.utils`` star import).
    """
    # Drop incomplete rows first so every remaining text cell can be normalized.
    tdf = df.dropna().copy()

    # Only text columns support the `.str` accessor; other dtypes are left as-is
    # instead of raising.
    for col in tdf.columns:
        column = tdf[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # After strip() the first/last characters are non-whitespace, so
            # replacing inner newlines cannot re-introduce edge whitespace.
            tdf[col] = column.str.strip().str.replace('\n', ' ', regex=False)

    # De-duplicate on the normalized text, then renumber rows 0..n-1 so the
    # embedding frame built below aligns on the same index.
    tdf = tdf.drop_duplicates().reset_index(drop=True)

    tdf['q_a'] = tdf['question'] + ' ' + tdf['answer']
    # Prompt
    tdf['prompt'] = prep_question(tdf['question']) + prep_answer(tdf['answer'])

    # One sentence embedding per combined question+answer string.
    vecs = st1.encode(
        tdf['q_a'].values,
        batch_size=16,
        show_progress_bar=True,
        output_value='sentence_embedding',
    )
    vec_df = pd.DataFrame(vecs, index=tdf.index).add_prefix('embedding_')
    return pd.concat((tdf, vec_df), axis=1)

In [None]:
%%time
jokes = pd.read_csv('../data/anti_jokes_raw.csv', sep='\t')
jokes = process(jokes.head(20))
print(jokes.shape)
jokes.head()

#### Testing

In [None]:
# First six combined question+answer strings, used as a small sample below.
temp = jokes['q_a'].iloc[:6].to_numpy()
temp

In [None]:
# Encode the sample with `output_value=None`; the cells below treat the result
# as a sequence of dict-like objects, one per input string.
vecs = st1.encode(temp, batch_size=20, show_progress_bar=True, output_value=None)
vecs[0].keys()

In [None]:
# Shape of every entry returned for the first sample.
{name: value.shape for name, value in vecs[0].items()}

In [None]:
# Peek at the first five components of the first sample's sentence embedding.
vecs[0]['sentence_embedding'][:5]

In [None]:
# Stack each sample's sentence embedding row-wise into a single tensor.
v2 = torch.vstack([item['sentence_embedding'] for item in vecs])
v2

## Cosine similarity 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Square matrix of cosine similarities between every pair of rows' embedding vectors.
emb_cols = jokes.columns[jokes.columns.str.contains('embedding_')]
cs = pd.DataFrame(cosine_similarity(jokes[emb_cols].values))
print(cs.shape)

# Long format: one row per ordered (index, other_index) pair, annotated with
# both texts, an order-independent pair key ("smaller_larger"), and the rank of
# `index` within that key.
cs2 = (
    cs.reset_index()
    .melt(id_vars='index', var_name='other_index', value_name='cosine_sim')
    .assign(
        left=lambda d: jokes.loc[d['index'], 'q_a'].values,
        right=lambda d: jokes.loc[d['other_index'], 'q_a'].values,
        joined_index=lambda d: np.where(
            d['index'] <= d['other_index'],
            d['index'].astype('str') + '_' + d['other_index'].astype('str'),
            d['other_index'].astype('str') + '_' + d['index'].astype('str'),
        ),
        joined_index_rank=lambda d: d.groupby('joined_index')['index'].rank(),
    )
)

cs2.head()

### Remove duplicates using embeddings

In [None]:
# Cosine-similarity cut-off: pairs scoring above this value are listed as
# possible duplicates by the `cosine_sim > @threshold` query in the next cell.
threshold = 0.95

In [None]:
# Import from the public `IPython.display` namespace rather than the internal
# `IPython.core.display` module.
from IPython.display import display_html

# Candidate duplicates: pairs of different rows whose cosine similarity exceeds
# `threshold`, most similar first. A pair normally shows up once per direction
# (index -> other_index and other_index -> index).
possible_dups = (
    cs2
    .query('index != other_index')
    .query('cosine_sim > @threshold')
    .sort_values('cosine_sim', ascending=False)
)

# Temporarily widen text columns so the compared strings are not cut short in
# the rendered table. The fully qualified option name avoids relying on
# pandas' pattern matching of abbreviated option keys.
with pd.option_context('display.max_colwidth', 100):
    display_html(possible_dups.to_html(), raw=True)

In [None]:
# For each flagged pair keep the row ranked first within the pair key and mark
# its partner (`other_index`) for removal.
is_first_of_pair = possible_dups['joined_index_rank'] == 1
drop_indices = possible_dups.loc[is_first_of_pair, 'other_index'].to_numpy()

# Work on an independent copy so `jokes` itself stays untouched.
temp_df = jokes.copy(deep=True).drop(index=drop_indices)
len(jokes), len(temp_df)

In [None]:
# Build the two-column prompt/completion table and save it without the index.
# Each prompt is the stripped question followed by PROMPT_TEXT (not defined in
# this notebook; presumably supplied by the `src.utils` star import). Each
# completion is a leading space, the stripped answer, and the '###' suffix.
pd.DataFrame({
    'prompt': temp_df['question'].str.strip() + PROMPT_TEXT,
    'completion': ' ' + temp_df['answer'].str.strip() + '###',
}).to_csv('../data/anti_jokes_clean.csv', index=False)

## Process for openai

In [None]:
import openai

In [None]:
# Read the saved prompt/completion file back in to check what was written.
df2 = pd.read_csv('../data/anti_jokes_clean.csv')
df2.head()

In [None]:
# Approximate total word count of the kept rows: spaces per string plus one, summed.
temp_df['q_a'].str.count(' ').add(1).sum()

In [None]:
!openai tools fine_tunes.prepare_data -f "data/anti_jokes_clean.csv"

In [None]:
# !openai api fine_tunes.create -t "../data/anti_jokes_clean_prepared.jsonl" -m "davinci"

In [None]:
# List fine-tune jobs via the OpenAI CLI.
!openai api fine_tunes.list

In [None]:
# Fetch a single fine-tune job via the OpenAI CLI.
# NOTE(review): `<ID>` is a placeholder, not a real value. Substitute an actual
# fine-tune job id before running; as written the shell would presumably
# interpret `<` and `>` as redirections.
!openai api fine_tunes.get -i <ID>