#### Set Up

In [121]:
from pathlib import Path
import json

# Numeric label produced by the classification prompt -> name of the function
# to run. Label 0 ('none') is what postprocess_classification_respond falls
# back to. The names double as keys into `system_prompts` and
# `structured_outputs` (see process_user_request) and as the names dispatched
# in map_json_to_function, so their spelling must stay in sync with the
# prompt/schema file stems and the function definitions.
CLASSIFICATIONS = {
    0: 'none',
    1: 'remove_duplicate_values',
    2: 'fill_missing_values',
    3: 'perform_dimensionality_reduction',
    4: 'perform_correalation_analysis',
}

# One system prompt per *.txt file, keyed by the file stem.
system_prompts = {f.stem: f.read_text() for f in Path("system_prompt").glob("*.txt")}
# The classification prompt has a `{functions_dict}` placeholder that receives the label table.
system_prompt_classification = system_prompts['classification'].format(functions_dict=str(CLASSIFICATIONS))

# glob() yields files in arbitrary order; sort so the prompts are processed in
# the same order on every run.
user_prompts = [f.read_text() for f in sorted(Path("user_prompt").glob("*.txt"))]
# read_text() opens and closes each file itself, so no file handle is left open.
structured_outputs = {f.stem: json.loads(f.read_text()) for f in Path("structured_output").glob("*.json")}

#### Main System

In [122]:
import re

def postprocess_classification_respond(respond):
    """Turn the classifier's raw text reply into a classification label.

    The first run of digits in ``respond`` is read as an integer. It is
    returned only when it is greater than 0 and smaller than
    ``len(CLASSIFICATIONS)``; in every other case (no digits, or a number
    outside that range) the result is 0.
    """
    found = re.search(r'\d+', respond)
    if not found:
        return 0

    label = int(found.group(0))
    upper_bound = len(CLASSIFICATIONS)
    in_range = label < upper_bound and label > 0
    return label if in_range else 0

def perform_classification(client, user_request):
    """Classify ``user_request`` into one of the CLASSIFICATIONS labels.

    Sends the classification system prompt followed by the user's request to
    the "solar-pro" chat model through ``client`` and converts the text of the
    first choice into an integer label with
    ``postprocess_classification_respond``.
    """
    system_message = {"role": "system", "content": system_prompt_classification}
    user_message = {"role": "user", "content": user_request}

    completion = client.chat.completions.create(
        model="solar-pro",
        messages=[system_message, user_message],
    )
    reply_text = completion.choices[0].message.content
    return postprocess_classification_respond(reply_text)

def perform_function_mapping(client, user_request, system_prompt, structured_output):
    """Ask the model to extract the parameters for one function.

    Args:
        client: Object exposing ``chat.completions.create``.
        user_request: Text of the user's request, sent as the user message.
        system_prompt: Function-specific instructions, sent as the system message.
        structured_output: Passed unchanged as ``response_format``.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    system_message = {'role': 'system', 'content': system_prompt}
    user_message = {'role': 'user', 'content': user_request}

    completion = client.chat.completions.create(
        model="solar-pro",
        messages=[system_message, user_message],
        response_format=structured_output,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def process_user_request(client, user_request):
    '''
    Work out which function the user wants to call and with which parameters.

    Output: a 3-tuple.
      - On success: (True, function_name (string), function_parameters (dict)).
        function_parameters is an empty dict when the request is classified
        as 0; otherwise it is the parsed JSON returned by the function-mapping
        call.
      - On any exception: (False, the exception object, {}).
    '''
    try:
        label = perform_classification(client, user_request)
        function_name = CLASSIFICATIONS[label]

        # Label 0 has no parameters to extract, so no second model call is made.
        if label == 0:
            return True, function_name, {}

        raw_parameters = perform_function_mapping(
            client,
            user_request,
            system_prompt=system_prompts[function_name],
            structured_output=structured_outputs[function_name],
        )
        return True, function_name, json.loads(raw_parameters)
    except Exception as error:
        return False, error, {}

In [123]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
import base64
from sklearn.decomposition import PCA

def check_cols_in_dataframe(df: pd.DataFrame, cols):
    """Report whether every label in ``cols`` is a column of ``df``.

    ``None`` and an empty list are treated as "nothing to check" and give
    ``True``.
    """
    nothing_requested = cols is None or cols == []
    if nothing_requested:
        return True

    available = df.columns
    return all(label in available for label in cols)

def convert_subset_to_message(subset):
    """Return ``subset`` for display, or ``'all'`` when it is ``None`` or an empty list."""
    if subset in [None, []]:
        return 'all'
    return subset

'''
Each of the functions below returns a tuple: success (bool), message (str), processed_dataframe (pandas.DataFrame), plot.
'''
def remove_duplicate_values(df: pd.DataFrame, subset, keep):
    """Drop duplicate rows from ``df``.

    Args:
        df: Frame to de-duplicate. It is not modified.
        subset: Column labels used to identify duplicates. ``None`` or an
            empty list means every column.
        keep: ``'first'``, ``'last'`` or ``False``; forwarded to
            ``DataFrame.drop_duplicates``.

    Returns:
        ``(success, message, processed_dataframe, plot)``. ``plot`` is always
        ``None``; ``processed_dataframe`` is ``None`` on failure and
        ``message`` then describes the problem.
    """
    if keep not in ['first', 'last', False]:
        return False, f'Keep is not recognized: {keep}', None, None

    if not check_cols_in_dataframe(df, subset):
        return False, f'Columns not in the DataFrame: {subset}', None, None

    try:
        return_message = f'Here is your data with duplicate values remove in subset: {convert_subset_to_message(subset)}, keep: {keep}'
        # An empty list is reported as 'all' columns, so hand pandas None
        # (its way of saying "all columns") instead of [].
        effective_subset = None if subset in (None, []) else subset
        deduplicated = df.drop_duplicates(subset=effective_subset, keep=keep)
        return True, return_message, deduplicated, None
    except Exception as e:
        # The message slot is documented as a str, so do not leak the exception object.
        return False, str(e), None, None
    
def fill_missing_values(df: pd.DataFrame, subset, metric):
    """Fill missing values column by column with a summary statistic.

    Args:
        df: Input frame. It is not modified; a copy is filled and returned.
        subset: Column labels to fill. ``None`` or an empty list means all
            columns; for ``'mean'`` and ``'median'`` that is narrowed to the
            numeric columns, since those statistics are undefined for text.
        metric: ``'mean'``, ``'median'`` or ``'mode'``. Mean and median are
            rounded to two decimals before filling.

    Returns:
        ``(success, message, processed_dataframe, plot)``. ``plot`` is always
        ``None``; ``processed_dataframe`` is ``None`` on failure and
        ``message`` then describes the problem.
    """
    if metric not in ['mean', 'median', 'mode']:
        return False, f'Metric is not recognized: {metric}', None, None

    if not check_cols_in_dataframe(df, subset):
        return False, f'Columns not in the DataFrame: {subset}', None, None

    try:
        return_message = f'Here is your data with missing values remove in subset: {convert_subset_to_message(subset)}, metric: {metric}'
        df_result = df.copy()

        # Resolve the 'all' case explicitly; iterating None would raise and
        # iterating [] would silently fill nothing.
        if subset not in [None, []]:
            columns = list(subset)
        elif metric == 'mode':
            columns = list(df_result.columns)
        else:
            columns = list(df_result.select_dtypes(include='number').columns)

        for col in columns:
            if metric == 'mean':
                fill_value = round(df_result[col].mean(), 2)
            elif metric == 'median':
                fill_value = round(df_result[col].median(), 2)
            else:
                # mode() returns a Series (there may be ties). Passing that
                # Series to fillna would align on the index instead of
                # filling every gap, so pick the first mode as a scalar.
                modes = df_result[col].mode()
                if modes.empty:
                    # Column has no non-missing values; nothing to fill with.
                    continue
                fill_value = modes.iloc[0]
            # Assign back instead of fillna(inplace=True) on a column
            # selection, which is chained assignment and may not update
            # df_result.
            df_result[col] = df_result[col].fillna(fill_value)
        return True, return_message, df_result, None
    except Exception as e:
        # The message slot is documented as a str, so do not leak the exception object.
        return False, str(e), None, None
    
    
def perform_correalation_analysis(df: pd.DataFrame, subset):
    """Compute a correlation matrix and render it as a heatmap.

    Args:
        df: Input frame. It is not modified.
        subset: Column labels to correlate. ``None`` or an empty list means
            all numeric columns of ``df``.

    Returns:
        ``(success, message, processed_dataframe, plot)`` where
        ``processed_dataframe`` is the correlation matrix and ``plot`` is the
        heatmap as a base64-encoded PNG string. Both are ``None`` on failure
        and ``message`` then describes the problem.
    """
    if not check_cols_in_dataframe(df, subset):
        return False, f'Columns not in the DataFrame: {subset}', None, None

    fig = None
    try:
        # The message reports None/[] as 'all', but df[None] raises and df[[]]
        # is empty, so resolve that case to the numeric columns explicitly.
        if subset in (None, []):
            data = df.select_dtypes(include='number')
        else:
            data = df[subset]
        corr = data.corr()

        fig, ax = plt.subplots()
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        ax.set_title('Correlation Heatmap')

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
        return True, f'Here is the correalation analysis in subset: {convert_subset_to_message(subset)}', corr, img_base64
    except Exception as e:
        # The message slot is documented as a str, so do not leak the exception object.
        return False, str(e), None, None
    finally:
        # Close the figure on every path so a failed plot does not stay open.
        if fig is not None:
            plt.close(fig)
    
def perform_dimensionality_reduction(df: pd.DataFrame, features, target):
    """Project ``features`` onto two principal components and plot them.

    Args:
        df: Input frame. It is not modified.
        features: Column labels fed to PCA. The caller's list is not modified.
        target: Column label used to colour the scatter plot.

    Returns:
        ``(success, message, result, plot)``. On success ``result`` is
        ``[pca_df, y, pca_params]``: the two-component projection, the target
        column, and a dict with ``explained_variance_ratio`` and
        ``components``. ``plot`` is the scatter plot as a base64-encoded PNG
        string. Both are ``None`` on failure and ``message`` then describes
        the problem.
    """
    # list.append() returns None and mutates its receiver. Using it here left
    # `subset` as None (so the column check always passed) and pushed the
    # target into the caller's feature list and into the PCA inputs. Work on
    # a copy and build the checked list separately.
    features = list(features or [])
    subset = [*features, target]

    if not check_cols_in_dataframe(df, subset):
        return False, f'Columns not in the DataFrame: {subset}', None, None

    fig = None
    try:
        return_message = f'Here is the pca result of features: {features}, target: {target}'
        # PCA
        X = df[features]
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'], index=df.index)
        y = df[target]

        # Plot
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.scatterplot(x='PC1', y='PC2', hue=y, palette='viridis', data=pca_df.join(y), ax=ax)
        ax.set_title('PCA Scatter Plot (PC1 vs PC2)')
        ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
        ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
        ax.legend(title=target)

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')

        pca_params = {
            'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
            'components': pca.components_.tolist()
        }
        return True, return_message, [pca_df, y, pca_params], img_base64
    except Exception as e:
        # The message slot is documented as a str, so do not leak the exception object.
        return False, str(e), None, None
    finally:
        # Close the figure on every path so a failed plot does not stay open.
        if fig is not None:
            plt.close(fig)
        
def map_json_to_function(df: pd.DataFrame, function_name, function_parameters):
    """Run the data function called ``function_name`` on ``df``.

    Args:
        df: Frame handed to the selected function.
        function_name: One of ``'remove_duplicate_values'``,
            ``'fill_missing_values'``, ``'perform_correalation_analysis'`` or
            ``'perform_dimensionality_reduction'``.
        function_parameters: Dict holding the keyword arguments for that
            function.

    Returns:
        The selected function's ``(success, message, data, plot)`` tuple. An
        unknown ``function_name`` (for example ``'none'``) or a missing
        parameter gives ``(False, message, None, None)`` instead of ``None``
        or a ``KeyError``, so callers can always unpack four values.
    """
    required_parameters = {
        'remove_duplicate_values': ('subset', 'keep'),
        'fill_missing_values': ('subset', 'metric'),
        'perform_correalation_analysis': ('subset',),
        'perform_dimensionality_reduction': ('features', 'target'),
    }

    if function_name not in required_parameters:
        return False, f'Function is not recognized: {function_name}', None, None

    # Anything that is not a dict is treated as "no parameters supplied".
    supplied = function_parameters if isinstance(function_parameters, dict) else {}
    needed = required_parameters[function_name]
    missing = [key for key in needed if key not in supplied]
    if missing:
        return False, f'Missing function parameters: {missing}', None, None

    kwargs = {key: supplied[key] for key in needed}
    if function_name == 'remove_duplicate_values':
        return remove_duplicate_values(df, **kwargs)
    if function_name == 'fill_missing_values':
        return fill_missing_values(df, **kwargs)
    if function_name == 'perform_correalation_analysis':
        return perform_correalation_analysis(df, **kwargs)
    return perform_dimensionality_reduction(df, **kwargs)
    

In [124]:
import pandas as pd

def read_data(file_path_train, file_path_test):
    """Read the train and test CSV files and return them as ``(train_df, test_df)``."""
    return pd.read_csv(file_path_train), pd.read_csv(file_path_test)

# Load the training and test CSVs from the data directory two levels above
# the notebook's working directory.
train_df, test_df = read_data("../../data/train.csv", "../../data/test.csv")

In [125]:
# Preview the first 20 rows of the training data.
train_df.head(20)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,0,1,1
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0,2,0,2
7,2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,0.0,1,2,3
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0,1,7,8
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0,8,6,14


In [126]:

from openai import OpenAI
import toml

# Read the API key from a TOML file instead of hard-coding it in the notebook.
parsed_toml = toml.load('../../../secrets.toml')

client = OpenAI(
    api_key=parsed_toml['upstage_api_key'],
    base_url="https://api.upstage.ai/v1"
)

# Run every stored user prompt through classification -> parameter extraction
# -> the matching data function. `message` and `graph` keep the values from
# the last dispatched prompt and are read by the cells below.
for user_prompt in user_prompts:
    print(f"user_prompt: {user_prompt}")
    success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
    print(f"function_name: {function_name}")
    print(f"function_parameters: {function_parameters}")
    if not success_function_info:
        # On failure `function_name` holds the exception, which was printed above.
        continue

    result = map_json_to_function(train_df, function_name, function_parameters)
    if result is None:
        # No function was dispatched (e.g. the 'none' classification), so
        # there is nothing to unpack.
        print()
        continue

    success_request, message, data, graph = result
    if success_request:
        print(message)
    else:
        # Surface the reason instead of dropping failed requests silently.
        print(f"request failed: {message}")
    print()

user_prompt: I want you to output the correlation map of columns atemp, temp, and humidity.
function_name: perform_correalation_analysis
function_parameters: {'subset': ['atemp', 'temp', 'humidity']}
Here is the correalation analysis in subset: ['atemp', 'temp', 'humidity']



In [127]:
# Show the message left over from the last prompt handled by the request loop above.
message

"Here is the correalation analysis in subset: ['atemp', 'temp', 'humidity']"

In [128]:
from PIL import Image
import io
import base64

# `graph` is the base64-encoded PNG left by the last prompt handled in the
# request loop; functions that produce no plot return None for it.
if graph is None:
    print("No plot to save.")
else:
    img = Image.open(io.BytesIO(base64.b64decode(graph)))
    # JPEG cannot store an alpha channel or a palette, so convert those modes to RGB first.
    if img.mode in ("RGBA", "LA", "P"):
        img = img.convert("RGB")
    img.save('my-image.jpeg')
