In [None]:
#pip install ucimlrepo
#pip install aif360

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo 
from openai import OpenAI
import os
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import BinaryLabelDataset
from aif360.explainers import MetricTextExplainer

# Import Data

In [2]:
# Download the Statlog German Credit dataset (UCI id 144).
statlog_german_credit_data = fetch_ucirepo(id=144)

# Show the dataset-level metadata first, then the per-variable description.
for section in ('metadata', 'variables'):
    print(getattr(statlog_german_credit_data, section))


{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

# Reload the clean dataset automatically

In [3]:
def load_clean_dataset_mappingv1():
    """Fetch the Statlog German Credit data and decode its categorical codes.

    Every qualitative attribute is stored as a code such as 'A11'; this
    function replaces each code with its human-readable label.

    Returns:
        (X, y): ``X`` is a decoded copy of the feature frame, ``y`` is the
        target frame exactly as delivered by ``fetch_ucirepo``.

    Codes that are missing from a mapping become NaN (``Series.map``
    semantics).
    """
    statlog_german_credit_data = fetch_ucirepo(id=144)

    # Work on a copy: assigning columns on the frame owned by the dataset
    # object can trigger SettingWithCopyWarning and would overwrite the raw
    # codes held by that object.
    X = statlog_german_credit_data.data.features.copy()
    y = statlog_german_credit_data.data.targets

    # Code -> label tables, keyed by the column they apply to.
    code_labels = {
        'Attribute1': {
            'A11': '< 0 DM',
            'A12': '0 <= ... < 200 DM',
            'A13': '>= 200 DM / salary assignments for at least 1 year',
            'A14': 'no checking account',
        },
        'Attribute3': {
            'A30': 'no credits taken/ all credits paid back duly',
            'A31': 'all credits at this bank paid back duly',
            'A32': 'existing credits paid back duly till now',
            'A33': 'delay in paying off in the past',
            'A34': 'critical account/ other credits existing (not at this bank)',
        },
        'Attribute4': {
            'A40': 'car (new)',
            'A41': 'car (used)',
            'A42': 'furniture/equipment',
            'A43': 'radio/television',
            'A44': 'domestic appliances',
            'A45': 'repairs',
            'A46': 'education',
            'A47': '(vacation - does not exist?)',
            'A48': 'retraining',
            'A49': 'business',
            'A410': 'others',
        },
        'Attribute6': {
            'A61': '< 100 DM',
            'A62': '100 <= ... < 500 DM',
            'A63': '500 <= ... < 1000 DM',
            'A64': '>= 1000 DM',
            'A65': 'unknown/ no savings account',
        },
        'Attribute7': {
            'A71': 'unemployed',
            'A72': '< 1 year',
            'A73': '1 <= ... < 4 years',
            'A74': '4 <= ... < 7 years',
            'A75': '>= 7 years',
        },
        'Attribute9': {
            'A91': 'male: divorced/separated',
            'A92': 'female: divorced/separated/married',
            'A93': 'male: single',
            'A94': 'male: married/widowed',
            'A95': 'female: single',
        },
        'Attribute10': {
            'A101': 'none',
            'A102': 'co-applicant',
            'A103': 'guarantor',
        },
        'Attribute12': {
            'A121': 'real estate',
            'A122': 'building society savings agreement/ life insurance',
            'A123': 'car or other, not in attribute 6',
            'A124': 'unknown / no property',
        },
        'Attribute14': {
            'A141': 'bank',
            'A142': 'stores',
            'A143': 'none',
        },
        'Attribute15': {
            'A151': 'rent',
            'A152': 'own',
            'A153': 'for free',
        },
        'Attribute17': {
            'A171': 'unemployed/ unskilled - non-resident',
            'A172': 'unskilled - resident',
            'A173': 'skilled employee / official',
            'A174': 'management/ self-employed/ highly qualified employee/ officer',
        },
        'Attribute19': {
            'A191': 'none',
            'A192': 'yes, registered under the customer\'s name',
        },
        'Attribute20': {
            'A201': 'yes',
            'A202': 'no',
        },
    }

    # Apply the mappings to the copied dataframe
    for attribute, mapping in code_labels.items():
        X[attribute] = X[attribute].map(mapping)
    return X, y
    

In [None]:
X, y = load_clean_dataset_mappingv1()

# Hold out 99% of the rows as the test set (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.99
)
print(X)

# Frequency table for every feature column, then for every target column.
for frame in (X, y):
    for column in frame.columns:
        print(f"Valeurs uniques dans {column}:")
        print(frame[column].value_counts())
        print("\n")

In [5]:
# Preview the first two rows of the feature frame.
X.head(2)

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Attribute11,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20
0,< 0 DM,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,>= 7 years,4,male: single,none,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customer's name",yes
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1 <= ... < 4 years,2,female: divorced/separated/married,none,2,real estate,22,none,own,1,skilled employee / official,1,none,yes


In [6]:
# Preview the first two target labels.
y.head(2)

Unnamed: 0,class
0,1
1,2


In [7]:
def features_to_text(row):
    """Render one applicant row as a single English sentence for the LLM prompt.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by
            'Attribute1' ... 'Attribute20'.

    Returns:
        str: comma-separated description of all twenty attributes.

    The telephone and foreign-worker flags accept three representations:
    the decoded labels ('yes...' / 'none' / 'no'), the raw dataset codes
    (A192 = telephone, A201 = foreign worker) and the numeric form where 1
    means yes. Previously only ``== 1`` was tested, so rows carrying labels
    or codes were always rendered as 'no'.
    """
    def _yes_no(value, yes_code):
        # Strings: decoded label starting with "yes", or the raw "yes" code.
        if isinstance(value, str):
            text = value.strip()
            return 'yes' if text == yes_code or text.lower().startswith('yes') else 'no'
        # Non-strings keep the original numeric convention.
        return 'yes' if value == 1 else 'no'

    telephone = _yes_no(row['Attribute19'], 'A192')
    foreign_worker = _yes_no(row['Attribute20'], 'A201')

    return (
        f"Applicant with a checking account status: {row['Attribute1']}, "
        f"loan duration: {row['Attribute2']} months, "
        f"credit history: {row['Attribute3']}, "
        f"loan purpose: {row['Attribute4']}, "
        f"credit amount: {row['Attribute5']}, "
        f"savings account/bonds: {row['Attribute6']}, "
        f"employment since: {row['Attribute7']}, "
        f"installment rate as percentage of disposable income: {row['Attribute8']}, "
        f"personal status and sex: {row['Attribute9']}, "
        f"other debtors/guarantors: {row['Attribute10']}, "
        f"present residence since: {row['Attribute11']}, "
        f"property: {row['Attribute12']}, "
        f"age: {row['Attribute13']} years, "
        f"other installment plans: {row['Attribute14']}, "
        f"housing: {row['Attribute15']}, "
        f"existing credits at this bank: {row['Attribute16']}, "
        f"job: {row['Attribute17']}, "
        f"number of people liable for maintenance: {row['Attribute18']}, "
        f"telephone: {telephone}, "
        f"foreign worker: {foreign_worker}."
    )

# Analysis of Standard Classifiers

In [None]:
# Baseline classifiers are trained on a one-hot encoded copy of the raw data.
# Dedicated *_clf names are used on purpose: reusing X / y / X_train / X_test /
# y_train / y_test here would overwrite the split that features_to_text and
# predo rely on (both index the original 'AttributeN' columns, which
# get_dummies renames for every categorical attribute).
X_clf = pd.get_dummies(statlog_german_credit_data.data.features)
y_clf = statlog_german_credit_data.data.targets.values.ravel()  # 1-D labels for sklearn

# Split the encoded dataset into training and testing sets
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.3, random_state=42
)

# Initialize the models
# max_iter is raised because the features are unscaled and lbfgs may stop at
# the default iteration cap before converging; the forest is seeded so the
# reported numbers are reproducible.
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)

# Train the models
logreg.fit(X_train_clf, y_train_clf)
rf.fit(X_train_clf, y_train_clf)

# Predict on the test set
logreg_pred = logreg.predict(X_test_clf)
rf_pred = rf.predict(X_test_clf)

# Evaluate the models
logreg_accuracy = accuracy_score(y_test_clf, logreg_pred)
logreg_f1 = f1_score(y_test_clf, logreg_pred, average='weighted')
logreg_confusion = confusion_matrix(y_test_clf, logreg_pred)

rf_accuracy = accuracy_score(y_test_clf, rf_pred)
rf_f1 = f1_score(y_test_clf, rf_pred, average='weighted')
rf_confusion = confusion_matrix(y_test_clf, rf_pred)

# Print the metrics in a tabular form
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [logreg_accuracy, rf_accuracy],
    'F1 Score': [logreg_f1, rf_f1],
    'Confusion Matrix': [logreg_confusion, rf_confusion]
})

print(results)

# Plot confusion matrices
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
sns.heatmap(logreg_confusion, annot=True, fmt='d', cmap='Purples', ax=ax[0])
ax[0].set_title('Logistic Regression Confusion Matrix')
sns.heatmap(rf_confusion, annot=True, fmt='d', cmap='Purples', ax=ax[1])
ax[1].set_title('Random Forest Confusion Matrix')
# Label both panels (plt.xlabel / plt.ylabel only reach the last axes).
for panel in ax:
    panel.set_xlabel('Predicted Labels')
    panel.set_ylabel('True Labels')
plt.show()


# Analysis of GPT models

In [None]:
def chat_with_gpt(description):
    """Send one applicant description to gpt-3.5-turbo and return the raw chat completion."""
    # The OPENAI_API_KEY environment variable takes precedence; the literal is
    # only a placeholder fallback. Do not commit a real key here.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Classify the credit risk based on the following description as 'Good' or as 'Bad': {description}\n\nClassification:",
            }
        ],
        model="gpt-3.5-turbo",
        max_tokens=10,  # cap on completion length, counted in tokens (not words)
        temperature=0.3
    )
    return chat_completion


def normalize_gpt_label(text):
    """Reduce a model reply to 'Good' or 'Bad' when one of the two words occurs in it.

    The first whole-word, case-insensitive occurrence of "good" or "bad"
    decides the label, so replies such as ' Good.' or 'bad' still match the
    {'Good': 1, 'Bad': 2} encoding applied afterwards. A reply containing
    neither word is returned unchanged so that it stays visible to the caller.
    """
    import re

    match = re.search(r'\b(good|bad)\b', text or '', flags=re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).capitalize()


# Function to classify using GPT
def classify_with_gpt(description):
    """Print the description, query the model and return its normalized label."""
    print(description)
    response = chat_with_gpt(description)
    return normalize_gpt_label(response.choices[0].message.content)

In [None]:
# Build one English description per test applicant, then classify each one.
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment for a cheap smoke run

# One API request per description.
predictions = X_test_descriptions.apply(classify_with_gpt)
#print(predictions)
# Encode replies with the dataset's label convention: 'Good' -> 1, 'Bad' -> 2.
# replace() leaves any reply that is not exactly one of these strings untouched.
predictions = predictions.replace({'Good': 1, 'Bad': 2})
# Keep only the ground-truth rows that were actually classified
# (relevant when the subset line above is enabled).
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

In [None]:
# Heatmap of the LLM confusion matrix (rows: true label, columns: predicted label).
llm_confusion = np.array(llm_confusion)
plt.figure(figsize=(8,6))
sns.set(style="whitegrid", font_scale=1.2)

class_names = ['Good', 'Bad']
cm_ax = sns.heatmap(
    llm_confusion,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_xlabel('Predicted Labels')
cm_ax.set_ylabel('True Labels')
cm_ax.set_title('GPT 3.5 Confusion Matrix')
plt.show()


# Bias Measurement

In [None]:
# Put features and target side by side so each split is a single frame.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

from sklearn.preprocessing import LabelEncoder

def predo(data):
    """Encode a credit frame as all-numeric values for the AIF360 datasets.

    Steps, applied to a copy (``data`` itself is left untouched):

    * every non-numeric column is replaced by integer codes numbered in the
      sorted order of its distinct values;
    * ``Attribute13`` (age) becomes 0 for ages <= 45 and 1 for ages > 45;
    * ``Attribute9`` (personal status and sex) becomes 0 = male, 1 = female.

    The sex flag is read from the value itself ('female...' labels or the raw
    codes A92/A95) rather than from the code a label happens to receive after
    sorting. The previous remap of codes 2/3/5 only matched the raw A9x codes;
    on decoded text labels it marked 'male: divorced/separated' as female.
    The assignments also use ``.loc`` instead of chained indexing.

    NOTE(review): ``Attribute20`` still receives order-dependent codes
    ('no' -> 0, 'yes' -> 1 for decoded labels; 'A201' -> 0, 'A202' -> 1 for
    raw codes). Confirm which value the 'foreign worker' group definitions
    used with this frame are meant to treat as unprivileged.

    Returns:
        DataFrame with positional integer column labels and a fresh 0..n-1 index.
    """
    from pandas.api.types import is_numeric_dtype

    pre_data = data.copy()

    status = data['Attribute9']
    if is_numeric_dtype(status):
        # Already numeric upstream: keep the historical remapping rule.
        sex_flag = status.replace({2: 0, 3: 0, 5: 1})
    else:
        status_text = status.astype(str)
        is_female = status_text.str.startswith('female') | status_text.isin(['A92', 'A95'])
        sex_flag = is_female.astype(int)

    for col in pre_data.columns:
        if not is_numeric_dtype(pre_data[col]):
            pre_data[col] = pre_data[col].astype('category').cat.codes.astype(int)

    # Masks are computed on the untouched input so the two writes cannot interact.
    age = data['Attribute13']
    pre_data.loc[age <= 45, 'Attribute13'] = 0
    pre_data.loc[age > 45, 'Attribute13'] = 1

    pre_data['Attribute9'] = sex_flag
    return pd.DataFrame(pre_data.values.tolist())
    
# Descriptive names for the 20 attributes plus the label, in column order.
# NOTE: ' Other debtors / guarantors' starts with a space; the string is used
# verbatim as the column name.
mean_list = ['Status of existing checking account', 'Duration in month', 'Credit history', 'Purpose',
             'Credit amount', 'Savings account/bonds', 'Present employment since',
             'Installment rate in percentage of disposable income', 'Personal status and sex',
             ' Other debtors / guarantors', 'Present residence since', 'Property', 'Age in years',
             'Other installment plans', 'Housing', 'Number of existing credits at this bank' ,'Job',
             'Number of people being liable to provide maintenance for' , 'Telephone' , 'foreign worker',
             'target']

# Encode both splits numerically; predo returns frames with positional columns.
train = predo(train)
test = predo(test)

train.columns = mean_list
test.columns = mean_list
# `result` is the test frame with the true label swapped for the model's prediction.
result = test.copy()
# reset_index() on the unnamed predictions Series yields its values under column 0.
result['target'] = predictions.reset_index()[0]


In [None]:
# ---- data bias test ----
# (attribute, unprivileged value, privileged value), in reporting order.
protected_names = ['Personal status and sex', 'Age in years', 'foreign worker']
group_specs = [
    ('foreign worker', 0, 1),
    ('Age in years', 1, 0),
    ('Personal status and sex', 1, 0),
]


def _to_binary_label_dataset(frame):
    """Wrap a numeric frame as an AIF360 dataset (label 1 favorable, 2 unfavorable)."""
    return BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=frame,
                              label_names=['target'],
                              protected_attribute_names=protected_names)


# Disparate impact of the true labels, test split first.
test_data = _to_binary_label_dataset(test)
for attr, unpriv_value, priv_value in group_specs:
    metric = BinaryLabelDatasetMetric(test_data,
                                      unprivileged_groups=[{attr: unpriv_value}],
                                      privileged_groups=[{attr: priv_value}])
    text_res = MetricTextExplainer(metric)
    print('DI:', text_res.disparate_impact())

# Same three measurements on the train split.
train_data = _to_binary_label_dataset(train)
for attr, unpriv_value, priv_value in group_specs:
    metric = BinaryLabelDatasetMetric(train_data,
                                      unprivileged_groups=[{attr: unpriv_value}],
                                      privileged_groups=[{attr: priv_value}])
    text_res = MetricTextExplainer(metric)
    print('DI:', text_res.disparate_impact())

# ---- method bias test ----
# Compare the predicted labels (result) against the true labels (test).
result_data = _to_binary_label_dataset(result)
for attr, unpriv_value, priv_value in group_specs:
    metric = ClassificationMetric(test_data, result_data,
                                  unprivileged_groups=[{attr: unpriv_value}],
                                  privileged_groups=[{attr: priv_value}])
    text_res = MetricTextExplainer(metric)

    print('EOD:', text_res.equal_opportunity_difference())
    # The 'ERR:' line reports the average odds difference.
    print('ERR:', text_res.average_odds_difference())

print('down')

In [None]:
# Disparate-impact values copied by hand from an earlier run of the bias cell.
di_results = {
    'Attribute': ['Foreign Worker', 'Age', 'Sex', 'Foreign Worker', 'Age', 'Sex'],
    'DI Value': [1.305, 0.9473684210526315, 0.7380952380952381, 1.2809897692124672, 1.1463128602663486, 0.9313997662185856],
    'Dataset': ['Test', 'Test', 'Test', 'Train', 'Train', 'Train']
}

# Convert to DataFrame
df = pd.DataFrame(di_results)

# Print the DataFrame to verify its correctness
print(df)

# Dedicated names: `test_data` / `train_data` hold the AIF360 datasets built in
# the bias cell and must not be rebound to these small plotting frames.
di_test = df[df['Dataset'] == 'Test']
di_train = df[df['Dataset'] == 'Train']

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
width = 0.35  # thickness of each horizontal bar

# Position of bars on the y-axis (one slot per attribute)
indices = range(len(di_test))

# Plotting both test and train data
rects1 = ax.barh([x - width/2 for x in indices], di_test['DI Value'], width, label='Test', color='mediumpurple')
rects2 = ax.barh([x + width/2 for x in indices], di_train['DI Value'], width, label='Train', color='plum')

# Adding a vertical line at x = 1
ax.axvline(x=1, color='red', linestyle='--', label='No Bias Threshold')

# Labeling and aesthetics
ax.set_xlabel('Disparate Impact (DI)')
ax.set_title('DI by Attribute and Dataset')
ax.set_yticks(indices)
ax.set_yticklabels(di_test['Attribute'])
ax.legend()

# Function to add labels on the bars
def add_labels(rects):
    """Write each bar's value (two decimals) just right of its end."""
    for rect in rects:
        bar_length = rect.get_width()  # for barh the "width" is the plotted value
        ax.annotate(f'{bar_length:.2f}',
                    xy=(bar_length, rect.get_y() + rect.get_height() / 2),
                    xytext=(3, 0),  # 3 points horizontal offset
                    textcoords="offset points",
                    ha='left', va='center')

# Add labels to the bars
add_labels(rects1)
add_labels(rects2)

plt.show()

# Hyperparameter tuning with the Optuna library to increase accuracy

In [None]:
#pip install optuna

In [None]:
import optuna
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from openai import OpenAI

In [None]:
def objective(trial):
    """Optuna objective: accuracy of GPT-4 on the test descriptions for one sampled configuration.

    Reads ``X_test_descriptions`` and ``y_test`` from the notebook namespace.
    Every trial sends one request per description.

    Args:
        trial: the Optuna trial used to sample the decoding parameters.

    Returns:
        float: accuracy of the predicted labels against ``y_test``.
    """
    # Suggesting parameters
    temperature = trial.suggest_float('temperature', 0.1, 1.0)
    top_p = trial.suggest_float('top_p', 0.1, 1.0)
    max_tokens = trial.suggest_int('max_tokens', 5, 50)
    frequency_penalty = trial.suggest_float('frequency_penalty', 0.0, 2.0)
    presence_penalty = trial.suggest_float('presence_penalty', 0.0, 2.0)

    # Resolve the key as the other GPT cells do (environment variable first,
    # placeholder fallback) and build the client once per trial, not per row.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))

    # Redefining the classifier with new parameters
    def classify_with_gpt(description):
        prompt = f"Classify the credit risk based on the following description as 'Good' or as 'Bad': {description}\n\nClassification:"

        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="gpt-4",
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty
        )

        response = chat_completion.choices[0].message.content.strip()
        # Anything that does not contain 'Good' is counted as 'Bad'.
        return 'Good' if 'Good' in response else 'Bad'

    # Predicting using the classifier; labels are always 'Good' or 'Bad' here,
    # so a plain map to the dataset's 1/2 encoding is exhaustive.
    predictions = X_test_descriptions.apply(classify_with_gpt).map({'Good': 1, 'Bad': 2})

    # Evaluation (accuracy is the optimized quantity)
    return accuracy_score(y_test, predictions)

In [None]:
# The objective reads X_test_descriptions from the notebook namespace, so build it first.
X_test_descriptions = X_test.apply(features_to_text, axis=1)

# Accuracy is returned by the objective, hence direction='maximize'.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, timeout=600)  # Run for 10 trials or stop after 600 seconds

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Focus on Gender Bias

In [None]:
# Put features and target side by side so each split is a single frame.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

from sklearn.preprocessing import LabelEncoder

def predo(data):
    """Encode a credit frame as all-numeric values for the AIF360 datasets.

    Steps, applied to a copy (``data`` itself is left untouched):

    * every non-numeric column is replaced by integer codes numbered in the
      sorted order of its distinct values;
    * ``Attribute13`` (age) becomes 0 for ages <= 45 and 1 for ages > 45;
    * ``Attribute9`` (personal status and sex) becomes 0 = male, 1 = female.

    The sex flag is read from the value itself ('female...' labels or the raw
    codes A92/A95) rather than from the code a label happens to receive after
    sorting. The previous remap of codes 2/3/5 only matched the raw A9x codes;
    on decoded text labels it marked 'male: divorced/separated' as female.
    The assignments also use ``.loc`` instead of chained indexing.

    NOTE(review): ``Attribute20`` still receives order-dependent codes
    ('no' -> 0, 'yes' -> 1 for decoded labels; 'A201' -> 0, 'A202' -> 1 for
    raw codes). Confirm which value the 'foreign worker' group definitions
    used with this frame are meant to treat as unprivileged.

    Returns:
        DataFrame with positional integer column labels and a fresh 0..n-1 index.
    """
    from pandas.api.types import is_numeric_dtype

    pre_data = data.copy()

    status = data['Attribute9']
    if is_numeric_dtype(status):
        # Already numeric upstream: keep the historical remapping rule.
        sex_flag = status.replace({2: 0, 3: 0, 5: 1})
    else:
        status_text = status.astype(str)
        is_female = status_text.str.startswith('female') | status_text.isin(['A92', 'A95'])
        sex_flag = is_female.astype(int)

    for col in pre_data.columns:
        if not is_numeric_dtype(pre_data[col]):
            pre_data[col] = pre_data[col].astype('category').cat.codes.astype(int)

    # Masks are computed on the untouched input so the two writes cannot interact.
    age = data['Attribute13']
    pre_data.loc[age <= 45, 'Attribute13'] = 0
    pre_data.loc[age > 45, 'Attribute13'] = 1

    pre_data['Attribute9'] = sex_flag
    return pd.DataFrame(pre_data.values.tolist())
    
# Descriptive names for the 20 attributes plus the label, in column order.
# NOTE: ' Other debtors / guarantors' starts with a space; the string is used
# verbatim as the column name.
mean_list = ['Status of existing checking account', 'Duration in month', 'Credit history', 'Purpose',
             'Credit amount', 'Savings account/bonds', 'Present employment since',
             'Installment rate in percentage of disposable income', 'Personal status and sex',
             ' Other debtors / guarantors', 'Present residence since', 'Property', 'Age in years',
             'Other installment plans', 'Housing', 'Number of existing credits at this bank' ,'Job',
             'Number of people being liable to provide maintenance for' , 'Telephone' , 'foreign worker',
             'target']

# Encode both splits numerically; predo returns frames with positional columns.
train = predo(train)
test = predo(test)

train.columns = mean_list
test.columns = mean_list
# `result` is the test frame with the true label swapped for the model's prediction.
result = test.copy()
# reset_index() on the unnamed predictions Series yields its values under column 0.
result['target'] = predictions.reset_index()[0]

In [None]:
# True labels (test) vs. predicted labels (result), with sex as the only protected attribute.
test_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=test, label_names=['target'], protected_attribute_names=['Personal status and sex'])
result_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=result, label_names=['target'], protected_attribute_names=['Personal status and sex'])

# Value 1 of the sex column is treated as the unprivileged group, value 0 as privileged.
metric = ClassificationMetric(test_data, result_data, unprivileged_groups=[{'Personal status and sex':1}], privileged_groups=[{'Personal status and sex':0}])
text_res = MetricTextExplainer(metric)

print('EOD:', text_res.equal_opportunity_difference())
# The 'ERR:' line reports the average odds difference.
print('ERR:', text_res.average_odds_difference())


In [None]:
def chat_with_gpt(description):
    """Send one applicant description to gpt-4 with the debiasing prompt (including a worked example) and return the raw chat completion."""
    # The OPENAI_API_KEY environment variable takes precedence; the literal is
    # only a placeholder fallback. Do not commit a real key here.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Evaluate the financial details provided and determine the credit risk as only 'Good' or 'Bad' based solely on financial metrics. Avoid considering personal demographics unrelated to creditworthiness. For instance, if the provided data shows consistent income, no existing debts, and asset ownership, the decision should be 'Good'. Please focus strictly on the financial profile provided. Financial Profile: {description}\nYou can only answer with 'Good' or 'Bad'",
            }
        ],
        model="gpt-4",  # model name
        max_tokens=45,  # cap on completion length, counted in tokens (not words)
        temperature=0.74,
        top_p=0.17,
        frequency_penalty=0.45,
        presence_penalty=1.37
    )
    return chat_completion


def normalize_gpt_label(text):
    """Reduce a model reply to 'Good' or 'Bad' when one of the two words occurs in it.

    The first whole-word, case-insensitive occurrence of "good" or "bad"
    decides the label, so replies such as ' Good.' or 'bad' still match the
    {'Good': 1, 'Bad': 2} encoding applied afterwards. A reply containing
    neither word is returned unchanged so that it stays visible to the caller.
    """
    import re

    match = re.search(r'\b(good|bad)\b', text or '', flags=re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).capitalize()


# Function to classify using GPT
def classify_with_gpt(description):
    """Query the model for one description and return its normalized label."""
    response = chat_with_gpt(description)
    return normalize_gpt_label(response.choices[0].message.content)

# Build one English description per test applicant, then classify each one.
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment for a cheap smoke run

# One API request per description.
predictions = X_test_descriptions.apply(classify_with_gpt)
#print(predictions)
# Encode replies with the dataset's label convention: 'Good' -> 1, 'Bad' -> 2.
# replace() leaves any reply that is not exactly one of these strings untouched.
predictions = predictions.replace({'Good': 1, 'Bad': 2})
# Keep only the ground-truth rows that were actually classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

In [None]:
# Rebuild `result` from the encoded test frame, using the latest predictions as the label.
result = test.copy()
# reset_index() on the unnamed predictions Series yields its values under column 0.
result['target'] = predictions.reset_index()[0]

test_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=test, label_names=['target'], protected_attribute_names=['Personal status and sex'])
result_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=result, label_names=['target'], protected_attribute_names=['Personal status and sex'])

# Value 1 of the sex column is treated as the unprivileged group, value 0 as privileged.
metric = ClassificationMetric(test_data, result_data, unprivileged_groups=[{'Personal status and sex':1}], privileged_groups=[{'Personal status and sex':0}])
text_res = MetricTextExplainer(metric)

print('EOD:', text_res.equal_opportunity_difference())
# The 'ERR:' line reports the average odds difference.
print('ERR:', text_res.average_odds_difference())

In [None]:
def chat_with_gpt(description):
    """Send one applicant description to gpt-4 with the debiasing prompt (no worked example) and return the raw chat completion."""
    # The OPENAI_API_KEY environment variable takes precedence; the literal is
    # only a placeholder fallback. Do not commit a real key here.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Evaluate the financial details provided and determine the credit risk as only 'Good' or 'Bad' based solely on financial metrics. Avoid considering personal demographics unrelated to creditworthiness. Financial Profile: {description}\nYou can only answer with 'Good' or 'Bad'",
            }
        ],
        model="gpt-4",  # model name
        max_tokens=45,  # cap on completion length, counted in tokens (not words)
        temperature=0.74,
        top_p=0.17,
        frequency_penalty=0.45,
        presence_penalty=1.37
    )
    return chat_completion


def normalize_gpt_label(text):
    """Reduce a model reply to 'Good' or 'Bad' when one of the two words occurs in it.

    The first whole-word, case-insensitive occurrence of "good" or "bad"
    decides the label, so replies such as ' Good.' or 'bad' still match the
    {'Good': 1, 'Bad': 2} encoding applied afterwards. A reply containing
    neither word is returned unchanged so that it stays visible to the caller.
    """
    import re

    match = re.search(r'\b(good|bad)\b', text or '', flags=re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).capitalize()


# Function to classify using GPT
def classify_with_gpt(description):
    """Query the model for one description and return its normalized label."""
    response = chat_with_gpt(description)
    return normalize_gpt_label(response.choices[0].message.content)

# Build one English description per test applicant, then classify each one.
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment for a cheap smoke run

# One API request per description.
predictions = X_test_descriptions.apply(classify_with_gpt)
#print(predictions)
# Encode replies with the dataset's label convention: 'Good' -> 1, 'Bad' -> 2.
# replace() leaves any reply that is not exactly one of these strings untouched.
predictions = predictions.replace({'Good': 1, 'Bad': 2})
# Keep only the ground-truth rows that were actually classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

In [None]:
# Rebuild `result` from the encoded test frame, using the latest predictions as the label.
result = test.copy()
# reset_index() on the unnamed predictions Series yields its values under column 0.
result['target'] = predictions.reset_index()[0]

test_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=test, label_names=['target'], protected_attribute_names=['Personal status and sex'])
result_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=result, label_names=['target'], protected_attribute_names=['Personal status and sex'])

# Value 1 of the sex column is treated as the unprivileged group, value 0 as privileged.
metric = ClassificationMetric(test_data, result_data, unprivileged_groups=[{'Personal status and sex':1}], privileged_groups=[{'Personal status and sex':0}])
text_res = MetricTextExplainer(metric)

print('EOD:', text_res.equal_opportunity_difference())
# The 'ERR:' line reports the average odds difference.
print('ERR:', text_res.average_odds_difference())

# Correlation matrix

In [None]:
# Reload the original dataset (raw category codes, no label mapping) for the
# correlation analysis. This rebinds the notebook-level X and y.

statlog_german_credit_data = fetch_ucirepo(id=144) 
    
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets

In [None]:
# One-hot encode every object-typed column so all features are numeric.
categorical_columns = X.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(X, columns=categorical_columns)
print(df_encoded)

# Compute the correlation matrix
corr_matrix = df_encoded.corr()

# Display the correlation matrix with Seaborn
plt.figure(figsize=(16,12))
corr_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
corr_ax.set_title('Correlation Matrix')
plt.show()

# GPT-4o

In [None]:
def chat_with_gpt(description):
    """Send one applicant description to gpt-4o (short completion, temperature 0.3) and return the raw chat completion."""
    # The OPENAI_API_KEY environment variable takes precedence; the literal is
    # only a placeholder fallback. Do not commit a real key here.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Classify the credit risk based on the following description as Good or as Bad: {description}\nYou can only answer with 'Good' or 'Bad'",
            }
        ],
        model="gpt-4o",  # model name
        max_tokens=5,  # cap on completion length, counted in tokens (not words)
        temperature=0.3
    )
    return chat_completion


def normalize_gpt_label(text):
    """Reduce a model reply to 'Good' or 'Bad' when one of the two words occurs in it.

    The first whole-word, case-insensitive occurrence of "good" or "bad"
    decides the label, so replies such as ' Good.' or 'bad' still match the
    {'Good': 1, 'Bad': 2} encoding applied afterwards. A reply containing
    neither word is returned unchanged so that it stays visible to the caller.
    """
    import re

    match = re.search(r'\b(good|bad)\b', text or '', flags=re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).capitalize()


# Function to classify using GPT
def classify_with_gpt(description):
    """Query the model for one description and return its normalized label."""
    response = chat_with_gpt(description)
    return normalize_gpt_label(response.choices[0].message.content)

# Build one English description per test applicant, then classify each one.
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment for a cheap smoke run

# One API request per description.
predictions = X_test_descriptions.apply(classify_with_gpt)
#print(predictions)
# Encode replies with the dataset's label convention: 'Good' -> 1, 'Bad' -> 2.
# replace() leaves any reply that is not exactly one of these strings untouched.
predictions = predictions.replace({'Good': 1, 'Bad': 2})
# Keep only the ground-truth rows that were actually classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

In [None]:
def chat_with_gpt(description):
    """Send one applicant description to gpt-4o with explicit sampling parameters and return the raw chat completion."""
    # The OPENAI_API_KEY environment variable takes precedence; the literal is
    # only a placeholder fallback. Do not commit a real key here.
    api_key = 'x'
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Classify the credit risk based on the following description as Good or as Bad: {description}\nYou can only answer with 'Good' or 'Bad'",
            }
        ],
        model="gpt-4o",  # model name
        max_tokens=45,  # cap on completion length, counted in tokens (not words)
        temperature=0.74,
        top_p=0.17,
        frequency_penalty=0.45,
        presence_penalty=1.37
    )
    return chat_completion


def normalize_gpt_label(text):
    """Reduce a model reply to 'Good' or 'Bad' when one of the two words occurs in it.

    The first whole-word, case-insensitive occurrence of "good" or "bad"
    decides the label, so replies such as ' Good.' or 'bad' still match the
    {'Good': 1, 'Bad': 2} encoding applied afterwards. A reply containing
    neither word is returned unchanged so that it stays visible to the caller.
    """
    import re

    match = re.search(r'\b(good|bad)\b', text or '', flags=re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).capitalize()


# Function to classify using GPT
def classify_with_gpt(description):
    """Query the model for one description and return its normalized label."""
    response = chat_with_gpt(description)
    return normalize_gpt_label(response.choices[0].message.content)

# Build one English description per test applicant, then classify each one.
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment for a cheap smoke run

# One API request per description.
predictions = X_test_descriptions.apply(classify_with_gpt)
#print(predictions)
# Encode replies with the dataset's label convention: 'Good' -> 1, 'Bad' -> 2.
# replace() leaves any reply that is not exactly one of these strings untouched.
predictions = predictions.replace({'Good': 1, 'Bad': 2})
# Keep only the ground-truth rows that were actually classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

### Bias measurement

In [None]:
# Put features and target side by side so each split is a single frame.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

from sklearn.preprocessing import LabelEncoder

def predo(data):
    """Encode a credit DataFrame numerically for the bias metrics.

    Works on a copy of ``data``:
      * every object-typed column is integer-encoded with ``LabelEncoder``;
      * ``Attribute13`` is binarised: values <= 45 become 0, values > 45 become 1;
      * ``Attribute9`` codes 2 and 3 become 0 and code 5 becomes 1; all other
        codes are left as they are.

    Args:
        data: DataFrame containing at least the columns ``Attribute9`` and
            ``Attribute13``.

    Returns:
        A new DataFrame holding the encoded values, with default integer
        column labels and a fresh 0..n-1 index (column names are not kept).
    """
    s = (data.dtypes == 'object')
    object_cols = list(s[s].index)
    pre_data = data.copy()
    for col in object_cols:
        # A fresh encoder per column: each column gets its own code table.
        pre_data[col] = LabelEncoder().fit_transform(data[col])

    # Use .loc for the masked writes. The chained form df[col][mask] = v
    # writes into a temporary under pandas Copy-on-Write and leaves the
    # frame unchanged.
    pre_data.loc[pre_data['Attribute13'] <= 45, 'Attribute13'] = 0
    pre_data.loc[pre_data['Attribute13'] > 45, 'Attribute13'] = 1

    # NOTE(review): which integer stands for which category depends on the
    # LabelEncoder ordering of the values present -- verify against the data.
    pre_data.loc[pre_data['Attribute9'] == 2, 'Attribute9'] = 0  # male
    pre_data.loc[pre_data['Attribute9'] == 3, 'Attribute9'] = 0  # male
    pre_data.loc[pre_data['Attribute9'] == 5, 'Attribute9'] = 1  # female
    return pd.DataFrame(pre_data.values.tolist())
    
# Readable column names, applied by position to the frames returned by predo().
# NOTE(review): assumes the frame columns are Attribute1..Attribute20 followed
# by the target, in this order -- confirm against the loaded dataset.
mean_list = ['Status of existing checking account', 'Duration in month', 'Credit history', 'Purpose',
             'Credit amount', 'Savings account/bonds', 'Present employment since',
             'Installment rate in percentage of disposable income', 'Personal status and sex',
             ' Other debtors / guarantors', 'Present residence since', 'Property', 'Age in years',
             'Other installment plans', 'Housing', 'Number of existing credits at this bank' ,'Job',
             'Number of people being liable to provide maintenance for' , 'Telephone' , 'foreign worker',
             'target']

train = predo(train)
test = predo(test)

train.columns = mean_list
test.columns = mean_list

# Same features as `test`, with the model predictions as the label column.
result = test.copy()
# Assign by position: predo() gave `result` a fresh 0..n-1 index, so row i of
# `predictions` belongs to row i of `result`. Unlike reset_index()[0], this
# does not depend on the Series being unnamed, and a length mismatch raises
# instead of silently filling NaN.
result['target'] = predictions.to_numpy()

In [None]:
# Columns treated as protected attributes in every AIF360 dataset below.
protected_attributes = ['Personal status and sex', 'Age in years', 'foreign worker']

# (attribute, unprivileged value, privileged value), in report order.
# The values refer to the 0/1 encoding produced by predo().
group_definitions = [
    ('foreign worker', 0, 1),
    ('Age in years', 1, 0),
    ('Personal status and sex', 1, 0),
]

# --- Data bias test: disparate impact of the true labels --------------------
test_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=test, label_names=['target'], protected_attribute_names=protected_attributes)
train_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=train, label_names=['target'], protected_attribute_names=protected_attributes)

for dataset_name, dataset in [('test', test_data), ('train', train_data)]:
    for attribute, unprivileged_value, privileged_value in group_definitions:
        metric = BinaryLabelDatasetMetric(
            dataset,
            unprivileged_groups=[{attribute: unprivileged_value}],
            privileged_groups=[{attribute: privileged_value}],
        )
        text_res = MetricTextExplainer(metric)
        # Each line names the dataset and attribute it belongs to.
        print(f'DI [{dataset_name} / {attribute}]:', text_res.disparate_impact())


# --- Method bias test: true test labels vs. model predictions ---------------
result_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=result, label_names=['target'], protected_attribute_names=protected_attributes)

for attribute, unprivileged_value, privileged_value in group_definitions:
    metric = ClassificationMetric(
        test_data,
        result_data,
        unprivileged_groups=[{attribute: unprivileged_value}],
        privileged_groups=[{attribute: privileged_value}],
    )
    text_res = MetricTextExplainer(metric)

    print(f'EOD [{attribute}]:', text_res.equal_opportunity_difference())
    # Labelled AOD: the value printed is the average odds difference.
    print(f'AOD [{attribute}]:', text_res.average_odds_difference())

print('down')

## Other tests with GPT-4o

In [None]:
def chat_with_gpt(description):
    """Ask GPT-4o to rate one customer profile, using a prompt with an example.

    Args:
        description: Text description of one applicant; it is appended to the
            user prompt after ``Client:``.

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    prompt = f"Evaluate the creditworthiness of a customer with the following financial profile and respond with only Good or Bad. For example, a client described as having a stable income, no previous debts, and owning property should be classified as Good. Client:{description}"
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
        max_tokens=5,  # upper bound on generated tokens (not words)
        temperature=0.3,
    )
    return chat_completion
    

# Function to classify using GPT
def classify_with_gpt(description):
    """Classify one applicant description and return the reply text.

    The text of the first choice returned by ``chat_with_gpt`` is cleaned of
    surrounding whitespace, quote characters and a trailing period, so that
    replies such as ``" Good"``, ``"'Good'"`` or ``"Good."`` all come back as
    ``"Good"``. Any other reply is returned as cleaned text, not rejected.

    Args:
        description: Text description of one applicant.

    Returns:
        The cleaned reply string; ``""`` when the reply has no text content.
    """
    response = chat_with_gpt(description)
    # `content` can be None (no text in the reply); treat that as empty text
    # instead of passing None on to the label mapping.
    content = response.choices[0].message.content or ""
    return content.strip().strip("'\"`.").strip()

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment to try the pipeline on 5 rows only

raw_predictions = X_test_descriptions.apply(classify_with_gpt)
# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

In [None]:
def objective(trial):
    """Optuna objective: accuracy of GPT-4o for one sampled parameter set.

    Samples generation parameters from ``trial``, classifies every entry of
    the notebook-level ``X_test_descriptions`` with them, and scores the
    result against the notebook-level ``y_test``.

    Args:
        trial: Optuna trial used to sample ``temperature``, ``top_p``,
            ``max_tokens``, ``frequency_penalty`` and ``presence_penalty``.

    Returns:
        Accuracy of the predictions (the value the study optimises).

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # Fail before any parameter is sampled or any request is sent. The key is
    # read from the environment only, never from a literal in the notebook.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")

    # Suggesting parameters
    temperature = trial.suggest_float('temperature', 0.1, 1.0)
    top_p = trial.suggest_float('top_p', 0.1, 1.0)
    max_tokens = trial.suggest_int('max_tokens', 5, 50)
    frequency_penalty = trial.suggest_float('frequency_penalty', 0.0, 2.0)
    presence_penalty = trial.suggest_float('presence_penalty', 0.0, 2.0)

    # One client per trial instead of one per classified row.
    client = OpenAI(api_key=api_key)

    def classify_with_gpt(description):
        """Return 1 ('Good') or 2 ('Bad') for one description."""
        prompt = f"Classify the credit risk based on the following description as 'Good' or as 'Bad': {description}\n\nClassification:"

        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gpt-4o",
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
        )

        response = (chat_completion.choices[0].message.content or "").strip()
        # Any reply that does not contain 'Good' counts as 'Bad'. The numeric
        # codes are returned directly so the prediction column is numeric.
        return 1 if 'Good' in response else 2

    # Predicting using the classifier
    predictions = X_test_descriptions.apply(classify_with_gpt)

    # Evaluation
    return accuracy_score(y_test, predictions)

In [None]:
# NOTE(review): optuna is not among the imports in the notebook's import cell;
# importing it here keeps this cell runnable on a fresh kernel.
import optuna

# objective() reads X_test_descriptions from the notebook namespace.
X_test_descriptions = X_test.apply(features_to_text, axis=1)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, timeout=600)  # Run for 10 trials or stop after 600 seconds

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
#test with best parameter 
def chat_with_gpt(description):
    """Send one credit-risk classification request to GPT-4o.

    Uses the sampling parameters hard-coded in the call below.

    Args:
        description: Text description of one applicant; it is interpolated
            into the user prompt.

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    prompt = f"Classify the credit risk based on the following description as Good or as Bad: {description}\nYou can only answer with 'Good' or 'Bad'"
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
        max_tokens=32,  # upper bound on generated tokens (not words)
        temperature=0.5,
        top_p=0.74,
        frequency_penalty=0.18,
        presence_penalty=0.98,
    )
    return chat_completion
    

# Function to classify using GPT
def classify_with_gpt(description):
    """Classify one applicant description and return the reply text.

    The text of the first choice returned by ``chat_with_gpt`` is cleaned of
    surrounding whitespace, quote characters and a trailing period, so that
    replies such as ``" Good"``, ``"'Good'"`` or ``"Good."`` all come back as
    ``"Good"``. Any other reply is returned as cleaned text, not rejected.

    Args:
        description: Text description of one applicant.

    Returns:
        The cleaned reply string; ``""`` when the reply has no text content.
    """
    response = chat_with_gpt(description)
    # `content` can be None (no text in the reply); treat that as empty text
    # instead of passing None on to the label mapping.
    content = response.choices[0].message.content or ""
    return content.strip().strip("'\"`.").strip()

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
#X_test_descriptions = X_test_descriptions[:5]  # uncomment to try the pipeline on 5 rows only

raw_predictions = X_test_descriptions.apply(classify_with_gpt)
# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]
# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)
print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

## Gender bias focus

In [None]:
# Put features and target side by side so each split is a single frame.
train = pd.concat((X_train, y_train), axis="columns")
test = pd.concat((X_test, y_test), axis="columns")

# LabelEncoder is used by predo() to integer-encode object-typed columns.
from sklearn.preprocessing import LabelEncoder

def predo(data):
    """Encode a credit DataFrame numerically for the bias metrics.

    Works on a copy of ``data``:
      * every object-typed column is integer-encoded with ``LabelEncoder``;
      * ``Attribute13`` is binarised: values <= 45 become 0, values > 45 become 1;
      * ``Attribute9`` codes 2 and 3 become 0 and code 5 becomes 1; all other
        codes are left as they are.

    Args:
        data: DataFrame containing at least the columns ``Attribute9`` and
            ``Attribute13``.

    Returns:
        A new DataFrame holding the encoded values, with default integer
        column labels and a fresh 0..n-1 index (column names are not kept).
    """
    s = (data.dtypes == 'object')
    object_cols = list(s[s].index)
    pre_data = data.copy()
    for col in object_cols:
        # A fresh encoder per column: each column gets its own code table.
        pre_data[col] = LabelEncoder().fit_transform(data[col])

    # Use .loc for the masked writes. The chained form df[col][mask] = v
    # writes into a temporary under pandas Copy-on-Write and leaves the
    # frame unchanged.
    pre_data.loc[pre_data['Attribute13'] <= 45, 'Attribute13'] = 0
    pre_data.loc[pre_data['Attribute13'] > 45, 'Attribute13'] = 1

    # NOTE(review): which integer stands for which category depends on the
    # LabelEncoder ordering of the values present -- verify against the data.
    pre_data.loc[pre_data['Attribute9'] == 2, 'Attribute9'] = 0  # male
    pre_data.loc[pre_data['Attribute9'] == 3, 'Attribute9'] = 0  # male
    pre_data.loc[pre_data['Attribute9'] == 5, 'Attribute9'] = 1  # female
    return pd.DataFrame(pre_data.values.tolist())
    
# Readable column names, applied by position to the frames returned by predo().
# NOTE(review): assumes the frame columns are Attribute1..Attribute20 followed
# by the target, in this order -- confirm against the loaded dataset.
mean_list = ['Status of existing checking account', 'Duration in month', 'Credit history', 'Purpose',
             'Credit amount', 'Savings account/bonds', 'Present employment since',
             'Installment rate in percentage of disposable income', 'Personal status and sex',
             ' Other debtors / guarantors', 'Present residence since', 'Property', 'Age in years',
             'Other installment plans', 'Housing', 'Number of existing credits at this bank' ,'Job',
             'Number of people being liable to provide maintenance for' , 'Telephone' , 'foreign worker',
             'target']

train = predo(train)
test = predo(test)

train.columns = mean_list
test.columns = mean_list

# Same features as `test`, with the model predictions as the label column.
result = test.copy()
# Assign by position: predo() gave `result` a fresh 0..n-1 index, so row i of
# `predictions` belongs to row i of `result`. Unlike reset_index()[0], this
# does not depend on the Series being unnamed, and a length mismatch raises
# instead of silently filling NaN.
result['target'] = predictions.to_numpy()

In [None]:
# Gender-only fairness check: true test labels vs. model predictions.
# Group values refer to the 0/1 encoding of 'Personal status and sex'
# produced by predo().
gender_attribute = 'Personal status and sex'

test_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=test, label_names=['target'], protected_attribute_names=[gender_attribute])
result_data = BinaryLabelDataset(favorable_label=1, unfavorable_label=2, df=result, label_names=['target'], protected_attribute_names=[gender_attribute])

metric = ClassificationMetric(test_data, result_data, unprivileged_groups=[{gender_attribute: 1}], privileged_groups=[{gender_attribute: 0}])
text_res = MetricTextExplainer(metric)

print('EOD:', text_res.equal_opportunity_difference())
# Labelled AOD: the value printed is the average odds difference.
print('AOD:', text_res.average_odds_difference())


# Testing prompts

## GPT-4

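Sampling parameters used for the GPT-4 run in the next cell:

- `max_tokens = 45` (maximum number of generated tokens)
- `temperature = 0.74`
- `top_p = 0.17`
- `frequency_penalty = 0.45`
- `presence_penalty = 1.37`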

In [9]:
def chat_with_gpt(description):
    """Send one credit-risk classification request to GPT-4.

    The instructions and the applicant description are sent as two separate
    user messages.

    Args:
        description: Text description of one applicant (second message).

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    # Plain string: the instruction text has no placeholders to interpolate.
    instructions = "Your task is to Classify the credit risk of individuals based on detailed financial and demographic data as 'Good' or as 'Bad'. Ensure fairness and avoid bias based on age, gender, or nationality. \n The Dataset used is 'Statlog German Credit Data', assessing creditworthiness using 20 attributes related to financial stability and history. Directly following this instruction, analyze the provided data attributes and determine the credit risk as either 'Good' or 'Bad', your answer should only be 'Good' or 'Bad', nothing else."
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": instructions},
            {"role": "user", "content": description},
        ],
        model="gpt-4",
        max_tokens=45,  # upper bound on generated tokens
        temperature=0.74,
        top_p=0.17,
        frequency_penalty=0.45,
        presence_penalty=1.37,
    )
    return chat_completion

def classify_with_gpt(description):
    """Return the model's verdict for one applicant description.

    The reply text of the first choice from ``chat_with_gpt`` is stripped of
    surrounding whitespace and must then be exactly 'Good' or 'Bad'.

    Raises:
        ValueError: If the stripped reply is anything else.
    """
    label = chat_with_gpt(description).choices[0].message.content.strip()
    if label in ('Good', 'Bad'):
        return label
    raise ValueError(f"Unexpected output: {label}")

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
raw_predictions = X_test_descriptions.apply(classify_with_gpt)

# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]

# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)

print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

Accuracy: 0.706060606060606
F1 Score: 0.6490992893823093
Confusion: [[649  45]
 [246  50]]


In [None]:
def chat_with_gpt(description):
    """Send one credit-risk classification request to GPT-4 (guideline prompt).

    The task guidelines and the applicant description are sent as two
    separate user messages.

    Args:
        description: Text description of one applicant (second message).

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    # Plain string: the instruction text has no placeholders to interpolate.
    instructions = "Task: Accurately classify the credit risk of individuals as ’Good’ or ’Bad’ based on the detailed description provided below. Your response must strictly adhere to this binary classification.Dataset: ’Statlog (German Credit Data)’, comprising attributes relevant to determining creditworthiness, including demographic information, financial history, and employment details. Guidelines: 1. Your classification must reflect a rigorous analysis of the financial and demographic data without bias.2. Do not provide explanations or justifications. Your response should categorically state ’Good’ or ’Bad’ based on the credit risk assessment, do not explain the reason behind the classification.3. Ensure fairness and impartiality in every assessment, avoiding discrimination based on age, gender, or nationality.Directly following this instruction, analyze the provided data attributes and determine the credit risk as either ’Good’ or ’Bad’."
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": instructions},
            {"role": "user", "content": description},
        ],
        model="gpt-4",
        max_tokens=2,  # the expected reply is a single word
        temperature=0.0,  # zero temperature to make replies as repeatable as possible
    )
    return chat_completion

def classify_with_gpt(description):
    """Return the model's verdict for one applicant description.

    The reply text of the first choice from ``chat_with_gpt`` is stripped of
    surrounding whitespace and must then be exactly 'Good' or 'Bad'.

    Raises:
        ValueError: If the stripped reply is anything else.
    """
    label = chat_with_gpt(description).choices[0].message.content.strip()
    if label in ('Good', 'Bad'):
        return label
    raise ValueError(f"Unexpected output: {label}")

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
raw_predictions = X_test_descriptions.apply(classify_with_gpt)

# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]

# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)

print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

## GPT-3.5-Turbo

In [18]:
def chat_with_gpt(description):
    """Send one credit-risk classification request to GPT-3.5-Turbo.

    A short instruction and the applicant description are sent as two
    separate user messages.

    Args:
        description: Text description of one applicant (second message).

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    # Plain string: the instruction text has no placeholders to interpolate.
    instructions = "Given the input below, classify the credit risk as either 'Good' or 'Bad'. Your answer must strictly be 'Good' or 'Bad'. Do not provide any additional text or explanation."
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": instructions},
            {"role": "user", "content": description},
        ],
        model="gpt-3.5-turbo",
        max_tokens=2,  # the expected reply is a single word
        temperature=0.0,  # zero temperature to make replies as repeatable as possible
    )
    return chat_completion

def classify_with_gpt(description):
    """Return the model's verdict for one applicant description.

    The reply text of the first choice from ``chat_with_gpt`` is stripped of
    surrounding whitespace and must then be exactly 'Good' or 'Bad'.

    Raises:
        ValueError: If the stripped reply is anything else.
    """
    label = chat_with_gpt(description).choices[0].message.content.strip()
    if label in ('Good', 'Bad'):
        return label
    raise ValueError(f"Unexpected output: {label}")

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
raw_predictions = X_test_descriptions.apply(classify_with_gpt)

# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]

# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)

print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

Accuracy: 0.6515151515151515
F1 Score: 0.5906810544444271
Confusion: [[616  78]
 [267  29]]


In [11]:
def chat_with_gpt(description):
    """Send one credit-risk classification request to GPT-3.5-Turbo.

    The instructions and the applicant description are sent as two separate
    user messages.

    Args:
        description: Text description of one applicant (second message).

    Returns:
        The chat completion object returned by
        ``client.chat.completions.create``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # The key is read from the environment only: a key literal kept in the
    # notebook ends up in every shared or committed copy of it.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before calling the OpenAI API")
    client = OpenAI(api_key=api_key)

    # Plain string: the instruction text has no placeholders to interpolate.
    instructions = "Your task is to Classify the credit risk of individuals based on detailed financial and demographic data as 'Good' or as 'Bad'. Ensure fairness and avoid bias based on age, gender, or nationality. \n The Dataset used is 'Statlog German Credit Data', assessing creditworthiness using 20 attributes related to financial stability and history. Directly following this instruction, analyze the provided data attributes and determine the credit risk as either 'Good' or 'Bad', your answer should only be 'Good' or 'Bad', nothing else."
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": instructions},
            {"role": "user", "content": description},
        ],
        model="gpt-3.5-turbo",
        max_tokens=39,  # upper bound on generated tokens
        temperature=0.66,
        top_p=0.95,
        frequency_penalty=1.06,
        presence_penalty=1.38,
    )
    return chat_completion

def classify_with_gpt(description):
    """Return the model's verdict for one applicant description.

    The reply text of the first choice from ``chat_with_gpt`` is stripped of
    surrounding whitespace and must then be exactly 'Good' or 'Bad'.

    Raises:
        ValueError: If the stripped reply is anything else.
    """
    label = chat_with_gpt(description).choices[0].message.content.strip()
    if label in ('Good', 'Bad'):
        return label
    raise ValueError(f"Unexpected output: {label}")

# Convert features to text and classify
X_test_descriptions = X_test.apply(features_to_text, axis=1)
raw_predictions = X_test_descriptions.apply(classify_with_gpt)

# Encode the replies as 'Good' -> 1 and 'Bad' -> 2.
# Series.map yields a numeric column directly. Series.replace on an object
# column depends on a downcast that pandas has deprecated, and an object-typed
# prediction column is rejected by the sklearn metrics below.
predictions = raw_predictions.map({'Good': 1, 'Bad': 2})
unmapped = raw_predictions[predictions.isna()]
if not unmapped.empty:
    raise ValueError(f"Unexpected model outputs: {unmapped.unique().tolist()}")
predictions = predictions.astype(int)
# Keep the ground truth row-aligned with the rows that were classified.
y_test = y_test.loc[predictions.index]

# Evaluate the model
llm_accuracy = accuracy_score(y_test, predictions)
llm_f1 = f1_score(y_test, predictions, average='weighted')
llm_confusion = confusion_matrix(y_test, predictions)

print("Accuracy:", llm_accuracy)
print("F1 Score:", llm_f1)
print('Confusion:', llm_confusion)

Accuracy: 0.693939393939394
F1 Score: 0.5831443412159858
Confusion: [[682  12]
 [291   5]]
