#### Set Up

In [13]:
from pathlib import Path
import json

CLASSIFICATIONS = {
    0: 'none',
    1: 'remove_duplicate_values',
    2: 'fill_missing_values',
    3: 'perform_dimensionality_reduction',
    4: 'perform_correlation_analysis',
}

system_prompts = {f.stem: f.read_text() for f in Path("system_prompt").glob("*.txt")}
system_prompt_classification = system_prompts['classification'].format(functions_dict=str(CLASSIFICATIONS))

user_prompts = [f.read_text() for f in Path("user_prompt").glob("*.txt")]
structured_outputs = {f.stem: json.load(f.open('r')) for f in Path("structured_output").glob("*.json")}

#### Main System

In [14]:
import re

def postprocess_classification_respond(respond):
    match = re.search(r'\d+', respond)
    if match:
        res = int(match.group(0))
        if res < len(CLASSIFICATIONS) and res > 0:
            return res
    return 0

def perform_classification(client, user_request):
    messages = [
        {
            "role": "system",
            "content": system_prompt_classification,
        },
        {
            "role": "user",
            "content": user_request,
        }
    ]
    response = client.chat.completions.create(
        model="solar-pro",
        messages=messages,
    )
    return postprocess_classification_respond(response.choices[0].message.content)

def perform_function_mapping(client, user_request, system_prompt, structured_output):
    messages = [
            {
                'role': 'system',
                'content': system_prompt
            },
            {
                'role': 'user',
                'content': user_request
            }
        ]
    response = client.chat.completions.create(
            model="solar-pro",
            messages=messages,
            response_format=structured_output
        )
    return response.choices[0].message.content

def process_user_request(client, user_request):
    '''
    Outpupt: function_name (string), function_parameters (JSON), model_respond (string)
    Return the JSON string with the information of the function that the user want to call together the respond of the system.
    '''
    try:
        classification = perform_classification(client, user_request)
        function_name = CLASSIFICATIONS[classification]
        function_parameters = {}
        
        if classification != 0:
            function_parameters = perform_function_mapping(
                client, user_request, 
                system_prompt = system_prompts[CLASSIFICATIONS[classification]], 
                structured_output = structured_outputs[CLASSIFICATIONS[classification]])
            function_parameters = json.loads(function_parameters)
        return True, function_name, function_parameters
    except Exception as e:
        return False, e, {}

In [15]:
import io
import base64
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

def check_cols_in_dataframe(df: pd.DataFrame, cols):
    # Edge Cases
    if cols is None or cols == []:
        return True
    df_columns = df.columns
    
    for col in cols:
        if col not in df_columns:
            return False
    return True

def convert_subset_to_message(subset):
    return subset if subset not in [None, []] else 'all'

'''
The funcitons below return success (bool), message (str), processed_data (pandas.DataFrame), image
'''
def remove_duplicate_values(df: pd.DataFrame, subset, keep):
    if keep not in ['first', 'last', False]:
        return False, f'Keep is not recognized: {keep}', None, None
    
    if check_cols_in_dataframe(df,subset):
        try:
            return_message = f'Here is your data with duplicate values remove in subset: {convert_subset_to_message(subset)}, keep: {keep}'
            return True, return_message, df.drop_duplicates(subset=subset, keep=keep), None
        except Exception as e:
            return False, e, None, None
    else:
        return False, f'Columns not in the DataFrame: {subset}', None, None
    
def fill_missing_values(df: pd.DataFrame, subset, metric):
    if metric not in ['mean', 'median', 'mode']:
        return False, f'Metric is not recognized: {metric}', None, None
    
    if check_cols_in_dataframe(df, subset):
        try:
            return_message = f'Here is your data with missing values remove in subset: {convert_subset_to_message(subset)}, metric: {metric}'
            df_result = df.copy()
            
            for col in subset:
                if metric == 'mean':
                    fill_value = round(df_result[col].mean(), 3)
                elif metric == 'median':
                    fill_value = round(df_result[col].median(), 3)
                elif metric == 'mode':
                    fill_value = df_result[col].mode()
                df_result[col].fillna(fill_value, inplace=True)
            return True, return_message, df_result, None
        except Exception as e:
            return False, e, None, None
    else:
        return False, f'Columns not in the DataFrame: {subset}', None, None
    
    
def perform_correlation_analysis(df: pd.DataFrame, subset):
    if check_cols_in_dataframe(df, subset):
        try:
            corr = df[subset].corr()
            plt.figure()
            sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
            plt.title('Correlation Heatmap')
            buf = io.BytesIO()
            plt.savefig(buf, format='png')
            plt.close()
            buf.seek(0)
            img_base64 = base64.b64encode(buf.read()).decode('utf-8')
            return True, f'Here is the correlation analysis in subset: {convert_subset_to_message(subset)}', corr, img_base64
        except Exception as e:
            return False, e, None, None
    else:
        return False, f'Columns not in the DataFrame: {subset}', None, None
    
def perform_dimensionality_reduction(df: pd.DataFrame, features, target):
    subset = features.append(target)
    
    if check_cols_in_dataframe(df, subset):
        try:
            return_message = f'Here is the pca result of features: {features}, target: {target}'
            # PCA
            X = df[features]
            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X)
            pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'], index=df.index)
            y = df[target]
            
            # Plot
            plt.figure(figsize=(8,6))
            sns.scatterplot(x='PC1', y='PC2', hue=y, palette='viridis', data=pca_df.join(y))
            plt.title('PCA Scatter Plot (PC1 vs PC2)')
            plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
            plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
            plt.legend(title=target)
            
            buf = io.BytesIO()
            plt.savefig(buf, format='png')
            plt.close()
            buf.seek(0)
            img_base64 = base64.b64encode(buf.read()).decode('utf-8')
            
            pca_params = {
                'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
                'components': pca.components_.tolist()
            }
            return True, return_message, [pca_df, y, pca_params], img_base64 
        except Exception as e:
            return False, e, None, None
    else:
        return False, f'Columns not in the DataFrame: {subset}', None, None
        
def map_json_to_function(df: pd.DataFrame, function_name, function_parameters):
    if function_name == 'remove_duplicate_values':
        return remove_duplicate_values(df, subset=function_parameters['subset'], keep=function_parameters['keep'])
    elif function_name == 'fill_missing_values':
        return fill_missing_values(df, subset=function_parameters['subset'], metric=function_parameters['metric'])
    elif function_name == 'perform_correlation_analysis':
        return perform_correlation_analysis(df, subset=function_parameters['subset'])
    elif function_name == 'perform_dimensionality_reduction':
        return perform_dimensionality_reduction(df, features=function_parameters['features'], target=function_parameters['target'])
    else:
        return False, f'No function recognized.', None, None
    

In [4]:
import pandas as pd

train_df = pd.read_csv("../../data/train.csv")
test_df = pd.read_csv("../../data/test.csv")
modified_train_df = pd.read_csv("../../data/modified_train.csv")

In [5]:
from openai import OpenAI
import toml

parsed_toml = toml.load('../../../secrets.toml')

client = OpenAI(
	api_key=parsed_toml['upstage_api_key'], 
 	base_url="https://api.upstage.ai/v1"
)

# User Prompt 1: NA Data
user_prompt = user_prompts[0]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_1, graph_1 = map_json_to_function(modified_train_df, function_name, function_parameters)
        print(message)
        print()

# User Prompt 2: NA Data
user_prompt = user_prompts[1]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_2, graph_2 = map_json_to_function(modified_train_df, function_name, function_parameters)
        print(message)
        print()

# User Prompt 3: Non NA Data
user_prompt = user_prompts[2]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_3, graph_3 = map_json_to_function(train_df, function_name, function_parameters)
        print(message)
        print()

# User Prompt 4: Non NA Data
user_prompt = user_prompts[3]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_4, graph_4 = map_json_to_function(train_df, function_name, function_parameters)
        print(message)
        print()

# User Prompt 5: 
user_prompt = user_prompts[4]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_5, graph_5 = map_json_to_function(train_df, function_name, function_parameters)
        print(message)
        print()

user_prompt: Please remove duplication values in columns temp, atemp, and humidity; don't keep any value.
function_name: remove_duplicate_values
function_parameters: {'keep': 'last', 'subset': ['temp', 'atemp', 'humidity']}
Here is your data with duplicate values remove in subset: ['temp', 'atemp', 'humidity'], keep: last

user_prompt: Fill the NULL values in column atemp using the column's mode.
function_name: fill_missing_values
function_parameters: {'metric': 'mode', 'subset': ['atemp']}
Here is your data with missing values remove in subset: ['atemp'], metric: mode

user_prompt: I want you to perform PCA on columns temp, atemp, and humidity. The target column is holiday.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_result[col].fillna(fill_value, inplace=True)


function_name: perform_dimensionality_reduction
function_parameters: {'features': ['temp', 'atemp', 'humidity'], 'target': 'holiday'}
Here is the pca result of features: ['temp', 'atemp', 'humidity', 'holiday'], target: holiday

user_prompt: I want you to output the correlation map of columns atemp, temp, and humidity.
function_name: 'perform_correlation_analysis'
function_parameters: {}
user_prompt: After spending most of his life uniting the Mongol tribes, he launched a series of military campaigns, conquering large parts of China and Central Asia.
function_name: none
function_parameters: {}
No function recognized.



In [16]:
# User Prompt 4: Non NA Data
user_prompt = user_prompts[3]
print(f"user_prompt: {user_prompt}")
success_function_info, function_name, function_parameters = process_user_request(client, user_request=user_prompt)
print(f"function_name: {function_name}")
print(f"function_parameters: {function_parameters}")
if success_function_info:
        success_request, message, data_4, graph_4 = map_json_to_function(train_df, function_name, function_parameters)
        print(message)
        print()

user_prompt: I want you to output the heat map of columns atemp, temp, and humidity.
function_name: perform_correlation_analysis
function_parameters: {'subset': ['atemp', 'temp', 'humidity']}
Here is the correlation analysis in subset: ['atemp', 'temp', 'humidity']



#### Validation

In [6]:
duplications = data_1.duplicated()
duplications[duplications == True]

Series([], dtype: bool)

In [7]:
data_2['atemp']

0        14.395
1        13.635
2        13.635
3        14.395
4        14.395
          ...  
11425    28.790
11426    25.760
11427    22.725
11428    12.880
11429    25.760
Name: atemp, Length: 11430, dtype: float64

In [8]:
data_3

[             PC1        PC2
 0      19.781460 -12.888486
 1      18.838205 -14.050746
 2      18.838205 -14.050746
 3      13.788877 -13.184217
 4      13.788877 -13.184217
 ...          ...        ...
 10881 -11.568141  -6.640351
 10882  -4.472997  -8.520107
 10883  -0.398288  -9.991826
 10884  -0.446737  -8.876341
 10885   4.602591  -9.742871
 
 [10886 rows x 2 columns],
 0        0
 1        0
 2        0
 3        0
 4        0
         ..
 10881    0
 10882    0
 10883    0
 10884    0
 10885    0
 Name: holiday, Length: 10886, dtype: int64,
 {'explained_variance_ratio': [0.7376217211210322, 0.2603843045546407],
  'components': [[-0.03805444855608231,
    -0.031979085849069694,
    0.9987638343051585,
    1.724551398659321e-05],
   [0.674864570261991,
    0.736293726279616,
    0.04928852878252111,
    -3.710566027481271e-05]]}]

In [9]:
from PIL import Image

img = Image.open(io.BytesIO(base64.decodebytes(bytes(graph_3, "utf-8"))))
if img.mode in ("RGBA", "P"):
    img = img.convert("RGB")
img.save('pca.jpeg')

In [17]:
data_4

Unnamed: 0,atemp,temp,humidity
atemp,1.0,0.984948,-0.043536
temp,0.984948,1.0,-0.064949
humidity,-0.043536,-0.064949,1.0


In [18]:
img = Image.open(io.BytesIO(base64.decodebytes(bytes(graph_4, "utf-8"))))
if img.mode in ("RGBA", "P"):
    img = img.convert("RGB")
img.save('correlation-analysis.jpeg')