This notebook uses [GPTNeo](https://github.com/EleutherAI/GPTNeo) by [EleutherAI](https://www.eleuther.ai) to fine-tune the model and generate predictions for multiple files.

# Fine tune
To fine-tune the model, copy the Excel file containing the labeled dataset to the "train/raw" folder on the drive.

When `gcloud init` (run in the next cell) prompts you, choose the following options:
1. re-initialize this configuration [1]
2. the google account with the cloud storage [1]
3. gpt project [10]
4. No [n]

In [None]:
# Authenticate this Colab session with a Google account.
from google.colab import auth
auth.authenticate_user()
#!gcloud auth login
# Interactive gcloud setup; answer the prompts with the options listed above.
!gcloud init

Mount the drive where the generated predictions will be stored.

In [None]:
# Mount Google Drive at /content/drive; later cells read and write under
# /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir /content/drive/MyDrive/train/train_text

In [None]:
import pandas as pd
from pandas import read_excel
from pathlib import Path
import numpy as np
import lxml.html
import string
import os
import re

In [None]:
def read_batch(read_dir):
    """Read every regular file in ``read_dir`` and return one combined DataFrame.

    Each file is loaded with ``read_csv``; sub-directories are skipped. The
    returned frame has a fresh 0..n-1 index.

    Args:
        read_dir: directory containing the input workbooks.

    Returns:
        A single DataFrame with the rows of all files, in file-name order.

    Raises:
        ValueError: if ``read_dir`` contains no regular files.
    """
    frames = []
    # sorted(): os.listdir returns entries in arbitrary order; sorting makes
    # the row order of the result deterministic.
    for name in sorted(os.listdir(read_dir)):
        full_path = os.path.join(read_dir, name)
        if not os.path.isfile(full_path):
            continue
        frames.append(read_csv(full_path))
        print("read file #" + str(len(frames)))

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("no input files found in " + repr(read_dir))

    # ignore_index=True gives the same result as concat + reset_index(drop=True).
    return pd.concat(frames, ignore_index=True)

def read_csv(path):
    """Load the 'Sheet1' worksheet of the workbook at ``path``.

    Despite the name, the file is read with ``pd.read_excel``. The frame gets
    a fresh positional index and is then passed through ``clean``.
    """
    sheet = pd.read_excel(path, sheet_name='Sheet1')
    sheet = sheet.reset_index(drop=True)
    return clean(sheet)

def clean(df):
    """Drop, in place, every column of ``df`` that is not a kept product field.

    Returns the same (mutated) DataFrame object that was passed in.
    """
    wanted = ("brand", "category", "description", "color", "gender",
              "pattern", "neckline", "sleeves", "material")
    unwanted = [name for name in df.columns if name not in wanted]
    df.drop(columns=unwanted, inplace=True)
    return df

In [None]:
def write_as_txt(df, write_dir, seed=None):
    """Shuffle the rows of ``df`` and write all of them as training text files.

    No train/validation split is made: the whole shuffled frame is handed to
    ``write`` as the "train" set.

    Args:
        df: DataFrame of products to write.
        write_dir: base output directory, passed through to ``write``.
        seed: optional integer; when given, the shuffle is reproducible.
            When None (default) NumPy's global random state is used, as before.
    """
    if seed is None:
        order = np.random.permutation(len(df))
    else:
        # A private RandomState keeps the global NumPy RNG untouched.
        order = np.random.RandomState(seed).permutation(len(df))
    write(df.iloc[order], "train", write_dir)

def write(df, t, write_dir):
    """Write each row of ``df`` to its own ``product<N>.txt`` file.

    Rows go to ``<write_dir>/train_text`` when ``t`` is "train" and to
    ``<write_dir>/val_text`` when ``t`` is "validate". The directory is
    created if it does not exist. Fields whose string form is blank, 'nan',
    'null' or '[]' are dropped before the row is passed to ``write_dict``.

    ``N`` is the row's position in ``df``; it advances for every row, including
    rows for which ``write_dict`` writes nothing, so numbering may have gaps.

    Args:
        df: DataFrame with a unique index (required by ``to_dict('index')``).
        t: split name, "train" or "validate".
        write_dir: base output directory, with or without a trailing slash.

    Raises:
        ValueError: if ``t`` is not a known split name.
    """
    subdirs = {"train": "train_text", "validate": "val_text"}
    if t not in subdirs:
        raise ValueError("unknown split " + repr(t) + "; expected 'train' or 'validate'")

    # os.path.join works whether or not write_dir ends with a separator.
    out_dir = os.path.join(write_dir, subdirs[t])
    os.makedirs(out_dir, exist_ok=True)

    placeholders = ('nan', 'null', '[]')
    rows = df.to_dict('index')
    for c, row in enumerate(rows.values()):
        fields = {
            key: val for key, val in row.items()
            if str(val).strip() != '' and str(val) not in placeholders
        }
        write_dict(fields, os.path.join(out_dir, "product" + str(c) + ".txt"), "n")
    print("writing "+t+ " successful")

def write_dict(dict, path, type):
    """Write one product to ``path`` as UTF-8 text.

    When ``type`` is "n" (normal) the file contains::

        tag1: value1, tag2: value2, ...
        description: <description>
        ###

    For any other ``type`` only the feature line is written.
    Nothing is written when the product has no (truthy) description.
    Quotes and square brackets are removed from the text by ``clean_txt``.

    Note: ``get_data`` removes the "description" key from ``dict`` in place.

    Args:
        dict: mapping of field name to value for one product.
        path: destination file path.
        type: "n" to include the description and the "###" separator.
    """
    feat, desc = get_data(dict)  # feature line and the description
    if not desc:
        return

    txt = feat + "\n"
    if type == "n":
        # str(): a non-string description (e.g. a number) must not raise.
        txt += "description: " + str(desc) + "\n###\n"

    # Build the text before opening so a failure cannot leave an empty file;
    # explicit encoding keeps non-ASCII descriptions platform-independent.
    with open(path, 'w', encoding='utf-8') as f:
        print(clean_txt(txt), file=f)
            
def clean_txt(st):
    """Trim surrounding whitespace from ``st``, then delete every double quote,
    single quote and square bracket."""
    unwanted = str.maketrans('', '', '"[]\'')
    return st.strip().translate(unwanted)

def get_data(dict):
    """Split a product mapping into ``(feature_text, description)``.

    The "description" entry is removed from ``dict`` in place (None if absent).
    The remaining mapping is rendered with ``str`` and stripped of braces and
    single quotes.
    """
    desc = dict.pop("description", None)
    strip_chars = str.maketrans('', '', "{}'")
    feat = str(dict).translate(strip_chars)
    return feat, desc

In [None]:
# Folder holding the raw labeled Excel files.
read_dir = "/content/drive/MyDrive/train/raw/"
# Base output folder; write() appends "train_text/" to it.
write_dir = "/content/drive/MyDrive/train/"
# Read all raw files, shuffle the rows and write one text file per product.
write_as_txt(read_batch(read_dir),write_dir)

In [None]:
!cat /content/drive/MyDrive/train/train_text/*.txt > /content/drive/MyDrive/train/concat.txt

In [None]:
import os
# Colab magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
!git clone https://github.com/EleutherAI/gpt-neo
# Later cells use paths relative to the repository root (data/, configs/, main.py).
%cd gpt-neo
!pip3 install -q -r requirements.txt
# Placeholders: pretrained_model is assigned in the "Pretrained Model" section;
# dataset is only read by the fine-tuning config cell.
pretrained_model = None
dataset = None


In [None]:
# Install pinned versions of tensorflow-gcs-config and tensorflow-text.
# NOTE(review): t5 is not pinned — consider pinning it for reproducibility.
!pip install -U tensorflow-gcs-config==2.1.3
!pip install -q t5 tensorflow-text==2.3

In [None]:
# Cloud Storage bucket that receives the tokenized datasets and the model weights.
# The same bucket name is hard-coded in the JSON config cells.
path_to_cloud_bucket = 'gs://test-gpt-j/' 

# Dataset

In [None]:
# Select a Dataset:
import os
# Directory handed to create_tfrecords.py as --input_dir.
dataset_path = "/content/drive/MyDrive/train/concat/"
# Name passed as --name; the dataset config cell writes configs/dataset_configs/dataset_name.json to match.
dataset_name = 'dataset_name'
# Local output directory for the generated tfrecords.
out_name = dataset_name + "_tokenized"

In [None]:
# Tokenize Data
# Runs the repository's create_tfrecords.py on $dataset_path; output goes to the local $out_name directory.
!python data/create_tfrecords.py --input_dir $dataset_path --name $dataset_name --files_per 1000 --output_dir $out_name --write_dataset_config --processes 1

# copy the data to your bucket
# Normalize the bucket path to end with '/' before appending the sub-folder.
if not path_to_cloud_bucket.endswith('/'):
  path_to_cloud_bucket += '/'
copy_loc = path_to_cloud_bucket + "dataset/"
!gsutil -m cp -r /content/gpt-neo/$out_name $copy_loc
# List the bucket contents to confirm the upload.
!gsutil ls $path_to_cloud_bucket

# Configs
dataset configs
If dataset_name was changed, change the name of the file being written accordingly.

In [None]:
%%writefile configs/dataset_configs/dataset_name.json

{
  "path": "gs://test-gpt-j/dataset/dataset_name_*.tfrecords",
  "eval_path": "",
  "n_vocab": 50256,
  "tokenizer_is_pretrained": true,
  "tokenizer_path": "gpt2",
  "eos_id": 50256,
  "padding_id": 50257
}


Model configs

If dataset_name was changed, change the value for "dataset" in the config accordingly.

In [None]:
%%writefile configs/GPT3_XL.json

{
    "n_head": 16,
    "n_vocab": 50257,
    "embed_dropout": 0,
    "lr": 0.0002,
    "lr_decay": "cosine",
    "warmup_steps": 3000,
    "beta1": 0.9,
    "beta2": 0.95,
    "epsilon": 1e-8,
    "opt_name": "adam",
    "weight_decay": 0,
    "train_batch_size": 256,
    "attn_dropout": 0,
    "train_steps": 600000,
    "eval_steps": 0,
    "predict_steps": 1,
    "res_dropout": 0,
    "eval_batch_size": 4,
    "predict_batch_size": 1,
    "iterations": 100,
    "n_embd": 2048,
    "datasets": [["dataset_name", null, null, null]],
    "model": "GPT",
    "model_path": "gs://test-gpt-j/",
    "n_ctx": 2048,
    "n_layer": 24,
    "scale_by_depth": true,
    "scale_by_in": false,
    "attention_types" :  [[["global", "local"],12]],
    "mesh_shape": "x:4,y:2",
    "layout": "intermediate_expanded:x,heads:x,vocab:n_vocab,memory_length:y,embd:y",
    "activation_function": "gelu",
    "recompute_grad": true,
    "gradient_clipping": 1.0,
    "tokens_per_mb_per_replica": 2048,
    "precision": "bfloat16"
}

# Pretrained Model

In [None]:
# Sets the model name and the local weights path without downloading anything
# (the next cell repeats these assignments and also runs the download).
pretrained_model = 'GPT3_XL' 
path_to_local_weights = f"/content/gpt-neo/the-eye.eu/public/AI/gptneo-release/{pretrained_model}"

In [None]:
pretrained_model = 'GPT3_XL' 
# Mirror the released weights; wget -m stores them under ./the-eye.eu/public/AI/gptneo-release/.
!wget -m -np -c -U "eye02" -w 2 -R "index.html*" "https://the-eye.eu/public/AI/gptneo-release/$pretrained_model/"
# Local directory of the downloaded weights (expects the current directory to be /content/gpt-neo).
path_to_local_weights = f"/content/gpt-neo/the-eye.eu/public/AI/gptneo-release/{pretrained_model}"


In [None]:

# Reduce the bucket path to "gs://<bucket-name>" (drops any sub-path and trailing slash).
bucket_base = "gs://" + path_to_cloud_bucket.replace('gs://', '').split('/')[0]
# Upload the downloaded weights directory to the bucket root.
!gsutil -m cp -r $path_to_local_weights $bucket_base

If dataset_name was modified, change the value of "datasets" in "mods" accordingly.

In [None]:
import json
from pprint import pprint

path_to_model = "" 
batch_size = 8 
# Name of the dataset config to train on; must match a file in
# configs/dataset_configs/ (the notebook writes dataset_name.json).
# This single value now drives the "datasets" entry written below.
dset = "dataset_name"
mesh_shape = "x:4,y:2"
train_steps = 1000 
steps_per_checkpoint = 500 
# Step count of the released checkpoint; training continues from there.
start_step = 400000 if pretrained_model == "GPT3_2-7B" else 362000

if path_to_model == "":
  path_to_model = f'{bucket_base.strip("/")}/{pretrained_model}'
print(f'MODEL PATH: {path_to_model}\n')

# Fallbacks when dset is left empty: "pile" if no dataset was selected,
# otherwise the selected dataset (unless it is "Sampling_Only", in which case
# the datasets of the downloaded config are kept).
if dset == "":
  if dataset is None:
    dset = "pile"
  elif dataset != "Sampling_Only":
    dset = dataset

def pad_to_multiple_of(n, mult):
  """
  pads n to a multiple of mult
  """
  extra = n % mult
  if extra > 0:
      n = n + mult - extra
  return n

# Read the released config first, then close it before writing the new one.
with open(f'{path_to_local_weights}/config.json', 'r') as f:
  data = json.load(f)
pprint(data)

dset_val = [[dset, None, None, None]] if dset != "" else data["datasets"]
mods = {
        "mesh_shape": mesh_shape,
        "layout": "intermediate_expanded:x,heads:x,memory_length:y,embd:y",
        "model_path": path_to_model,
        "datasets": dset_val,
        "train_steps": start_step + train_steps,
        "eval_steps": 0,
        "train_batch_size": batch_size,
        "predict_batch_size": batch_size
      }
data.update(mods)
print('\n--->\n')
pprint(data)

# main.py is started with --model <pretrained_model>, so the file is named after it.
with open(f'configs/{pretrained_model}.json', 'w') as outfile:
  json.dump(data, outfile, indent=2)

### Begin Fine-Tuning


If permission-denied errors arise, grant the service access by adding it as a storage admin of the bucket.

In [None]:
# Start fine-tuning on the Colab TPU using configs/<pretrained_model>.json written above.
!python3 main.py --model $pretrained_model --steps_per_checkpoint $steps_per_checkpoint --tpu colab

# Evaluate the model

To evaluate the model, create another concat.txt file from the evaluation data, tokenize it and upload it to the bucket as before, then run the model in eval mode.

In [None]:
# Select a Dataset:
import os
# Directory handed to create_tfrecords.py as --input_dir for the evaluation data.
dataset_path = "/content/drive/MyDrive/eval_dataset/"
# Overwrites the training dataset_name; the eval dataset config is written as eval_dataset.json.
dataset_name = 'eval_dataset'
out_name = dataset_name + "_tokenized"

In [None]:
# Tokenize Data
# Same tokenization step as for training, now on the evaluation files.
!python data/create_tfrecords.py --input_dir $dataset_path --name $dataset_name --files_per 1000 --output_dir $out_name --write_dataset_config --processes 1

# copy the data to your bucket
# Normalize the bucket path to end with '/' before appending the sub-folder.
if not path_to_cloud_bucket.endswith('/'):
  path_to_cloud_bucket += '/'
copy_loc = path_to_cloud_bucket + "eval_datasets/"
!gsutil -m cp -r /content/gpt-neo/$out_name $copy_loc
# List the bucket contents to confirm the upload.
!gsutil ls $path_to_cloud_bucket

create dataset configs

In [None]:
%%writefile configs/dataset_configs/eval_dataset.json

{
  "path": "",
  "eval_path": "gs://test-gpt-j/eval_datasets/*.tfrecords",
  "n_vocab": 50256,
  "tokenizer_is_pretrained": true,
  "tokenizer_path": "gpt2",
  "eos_id": 50256,
  "padding_id": 50257
}


update the configs to point to the dataset

In [None]:
import json
from pprint import pprint

batch_size = 8
# NOTE(review): 139 is presumably the number of examples in the evaluation
# set — confirm, and update it whenever the evaluation data changes.
n_eval_examples = 139
assert pretrained_model is not None

config_path = f'configs/{pretrained_model}.json'

# Read the current model config, then close it before rewriting the same file.
with open(config_path, 'r') as f:
  data = json.load(f)
pprint(data)

dset_val = [["eval_dataset", None, None, None]]
mods = {
        "datasets": dset_val,
        # Whole batches only; a trailing partial batch is not evaluated.
        "eval_steps": n_eval_examples // batch_size,
        "train_batch_size": batch_size,
        "eval_batch_size": batch_size,
      }
data.update(mods)
print('\n--->\n')
pprint(data)

with open(config_path, 'w') as outfile:
  json.dump(data, outfile, indent=2)

Run evaluation

In [None]:
# Run main.py in evaluation mode on the Colab TPU with the updated configs/<pretrained_model>.json.
!python3 main.py --eval --tpu colab --model $pretrained_model