In [1]:
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True, as_frame=True)
Xy = X.assign(y=y)

X.shape, y.shape, Xy.shape

((150, 4), (150,), (150, 5))

In [2]:
%%time

from openai import OpenAI
import pandas as pd

def get_dimensions(r):
    return r['sepal length (cm)'], r['sepal width (cm)'], r['petal length (cm)'], r['petal width (cm)'], r['y']
    
def get_messages(sepal_length, sepal_width, petal_length, petal_width):
    messages = [
        {
            'role': 'system',
            'content': f'''You are a helpful assistant that will generate a description of an iris flowers based on its sepal length and width as well as its petal length and width. All dimensions of length and width are in centimeters. The dimensions of the flower will be given in triple backticks.

Iris flower dimensions
```
sepal length: {sepal_length}
sepal width: {sepal_width}
petal length: {petal_length}
petal width: {petal_width}
```

Your response:'''
        }
    ]
    return messages

def get_description(sepal_length, sepal_width, petal_length, petal_width, y, completion_model='gpt-4', embedding_model='text-embedding-ada-002'):
    client = OpenAI()
    response = client.chat.completions.create(
        model=completion_model,
        messages=get_messages(sepal_length, sepal_width, petal_length, petal_width),
        temperature=0
    )
    
    content = response.choices[0].message.content

    response = client.embeddings.create(input=[content], model=embedding_model)
    embedding = response.data[0].embedding

    return {
        'sepal_length': sepal_length,
        'sepal_width': sepal_width,
        'petal_length': petal_length,
        'petal_width': petal_width,
        'y': y,
        'content': content,
        'embedding': embedding
    }

df = pd.DataFrame((get_description(*get_dimensions(r)) for _, r in Xy.iterrows()))
df.shape

CPU times: user 2.01 s, sys: 9.69 ms, total: 2.02 s
Wall time: 22min 2s


(150, 7)

In [4]:
df.to_pickle('./data/iris-gpt.pickle')

In [5]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,y,content,embedding
0,5.1,3.5,1.4,0.2,0.0,This iris flower has a sepal that measures 5.1...,"[-0.00509683508425951, 0.022241873666644096, -..."
1,4.9,3.0,1.4,0.2,0.0,This iris flower has a sepal length of 4.9 cen...,"[-0.0008553705411031842, 0.02272571250796318, ..."
2,4.7,3.2,1.3,0.2,0.0,This iris flower has a sepal that measures 4.7...,"[-0.005902666598558426, 0.022460468113422394, ..."
3,4.6,3.1,1.5,0.2,0.0,This iris flower has a sepal length of 4.6 cen...,"[-0.0018561771139502525, 0.023472873494029045,..."
4,5.0,3.6,1.4,0.2,0.0,This iris flower has a sepal that measures 5.0...,"[-0.0070672184228897095, 0.02132122591137886, ..."


In [38]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

columns = [f'x{i}' for i in range(len(df.iloc[0]['embedding']))]
_X = pd.DataFrame(df['embedding'].tolist(), columns=columns)
_y = df['y']

_tr, _te = next(StratifiedShuffleSplit(n_splits=10, test_size=0.5, random_state=37).split(_X, _y))

_X_tr, _y_tr = _X.iloc[_tr], _y.iloc[_tr]
_X_te, _y_te = _X.iloc[_te], _y.iloc[_te]

m = RandomForestClassifier(n_jobs=-1, random_state=37, class_weight='balanced')
m.fit(_X_tr, _y_tr)

roc_auc_score(_y_te, m.predict_proba(_X_te), multi_class='ovo')

0.9516

In [41]:
_X = df.drop(columns=['y', 'content', 'embedding'])
_y = df['y']

_tr, _te = next(StratifiedShuffleSplit(n_splits=10, test_size=0.5, random_state=37).split(_X, _y))

_X_tr, _y_tr = _X.iloc[_tr], _y.iloc[_tr]
_X_te, _y_te = _X.iloc[_te], _y.iloc[_te]

m = RandomForestClassifier(n_jobs=-1, random_state=37, class_weight='balanced')
m.fit(_X_tr, _y_tr)

roc_auc_score(_y_te, m.predict_proba(_X_te), multi_class='ovo')

0.9989333333333333