In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score, recall_score, mean_absolute_error
import scipy.stats as stats



In [6]:
def cv_shuffle(df, n_splits=5, random_state=None):
    """Shuffle ``df`` row-wise and cut it into ``n_splits`` consecutive partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are shuffled and partitioned. It is not modified.
    n_splits : int, default 5
        Number of partitions to return. Must be at least 1.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        shuffle (and therefore the partitions) reproducible.

    Returns
    -------
    list of pandas.DataFrame
        ``n_splits`` frames, each with a fresh 0-based index. The first
        ``n_splits - 1`` frames hold ``len(df) // n_splits`` rows each and the
        last frame holds all remaining rows.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be a positive integer, got {n_splits}")

    # Shuffle every row, then renumber 0..n-1 so positional slicing is simple.
    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    total_length = len(shuffled_df)
    base_length = total_length // n_splits

    # Partition i starts at i * base_length; the final partition runs to the
    # end of the frame so it absorbs the rows left over by integer division.
    starts = [i * base_length for i in range(n_splits)]
    stops = starts[1:] + [total_length]

    return [
        shuffled_df.iloc[start:stop].reset_index(drop=True)
        for start, stop in zip(starts, stops)
    ]

In [7]:
# CI
def calc_ci(data, confidence=0.95, ddof=0):
    """Compute the mean and a normal-approximation confidence interval per key.

    Parameters
    ----------
    data : dict
        Maps a metric name to a sequence of numeric scores.
    confidence : float, default 0.95
        Confidence level passed to ``scipy.stats.norm.interval``.
    ddof : int, default 0
        Delta degrees of freedom for the standard deviation. The default
        (population standard deviation) keeps the historical results; use
        ``ddof=1`` for the sample standard deviation, which is usually the
        better choice when only a handful of scores are available.

    Returns
    -------
    dict
        ``{key: {'mean': float, 'confidence_interval': (low, high)}}``.
        The same dictionary is also printed.
    """
    result = {}

    for key, values in data.items():
        mean_value = np.mean(values)
        std_dev = np.std(values, ddof=ddof)
        # Standard error of the mean: spread of the scores scaled by sqrt(n).
        std_err = std_dev / np.sqrt(len(values))

        if std_err == 0:
            # All scores are identical. scipy yields (nan, nan) for scale=0,
            # so report the degenerate interval explicitly instead.
            confidence_interval = (mean_value, mean_value)
        else:
            confidence_interval = stats.norm.interval(confidence, loc=mean_value, scale=std_err)

        result[key] = {'mean': mean_value, 'confidence_interval': confidence_interval}

    print(result)
    return result

## Classification

In [9]:
# Data import: load the Adult dataset from the cleaned-data directory.
# NOTE(review): the file name suggests the features are already scaled — confirm.
adult = pd.read_csv('datasets/clean/adult_scaled.csv')

In [10]:
# Rebuild the model's train/validation split: a seeded 70% sample of the rows
# becomes the training set and every remaining row goes to the validation set.
train_set = adult.sample(frac=0.7, random_state=42)
val_set = adult.drop(index=train_set.index)

#### No Feedback

In [17]:
# Read the synthetic "no feedback" Adult sample and discard the 'Unnamed: 0'
# column in one chained expression.
syn_data_nof = (
    pd.read_csv('sampled/adult_nof_100e_500b.csv')
    .drop(columns='Unnamed: 0')
)

In [18]:
# Train one logistic regression per synthetic partition and score each model
# on val_set, collecting precision and recall for every partition.
dfs = cv_shuffle(syn_data_nof)
res_dict = {'precision':[], 'recall':[]}

# The validation features/labels are the same for every partition, so slice
# them once. The last column is used as the label, all others as features.
X_val = val_set.iloc[:, :-1]
y_val = val_set.iloc[:, -1]

for df in dfs:
    logreg = LogisticRegression()
    logreg.fit(df.iloc[:, :-1], df.iloc[:, -1])
    # Predict once and reuse the result for both metrics.
    y_pred = logreg.predict(X_val)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    res_dict['precision'].append(precision)
    res_dict['recall'].append(recall)
    print(f'precision:{precision}, recall:{recall}')

precision:0.572289156626506, recall:0.4498493327593629
precision:0.5700619020821609, recall:0.4360740421868274
precision:0.5719844357976653, recall:0.4429616874730951
precision:0.5740011254924029, recall:0.4390873869995695
precision:0.5795257374204743, recall:0.4313387860525183


In [19]:
# Summarise the per-partition scores: mean and confidence interval per metric
# (calc_ci also prints the resulting dictionary).
res = calc_ci(res_dict)

{'precision': {'mean': 0.5735724714838419, 'confidence_interval': (0.570742566600792, 0.5764023763668918)}, 'recall': {'mean': 0.4398622470942747, 'confidence_interval': (0.4343610823662003, 0.44536341182234906)}}


#### Feedback

In [None]:
# Read the synthetic "feedback" Adult sample and discard the 'Unnamed: 0'
# column in one chained expression.
syn_data_feedback = (
    pd.read_csv('sampled/adult_feedback_100e_500b.csv')
    .drop(columns='Unnamed: 0')
)

In [None]:
# Shuffle the synthetic "feedback" sample and split it into partitions; each
# partition is later used to train a separate model.
dfs = cv_shuffle(syn_data_feedback)

In [None]:
# Train one logistic regression per synthetic partition and score each model
# on val_set, collecting precision and recall for every partition.
res_dict = {'precision':[], 'recall':[]}

# The validation features/labels are the same for every partition, so slice
# them once. The last column is used as the label, all others as features.
X_val = val_set.iloc[:, :-1]
y_val = val_set.iloc[:, -1]

for df in dfs:
    logreg = LogisticRegression()
    logreg.fit(df.iloc[:, :-1], df.iloc[:, -1])
    # Predict once and reuse the result for both metrics.
    y_pred = logreg.predict(X_val)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    res_dict['precision'].append(precision)
    res_dict['recall'].append(recall)
    print(f'precision:{precision}, recall:{recall}')

In [None]:
# Summarise the per-partition scores: mean and confidence interval per metric
# (calc_ci also prints the resulting dictionary).
res = calc_ci(res_dict)

## Regression

In [4]:
# Data import: load the house-price dataset from the cleaned-data directory.
house = pd.read_csv('datasets/clean/house_price.csv')

In [5]:
# Rebuild the model's train/validation split: a seeded 70% sample of the rows
# becomes the training set and every remaining row goes to the validation set.
# This rebinds train_set/val_set, replacing the values from the classification section.
train_set = house.sample(frac=0.7, random_state=42)
val_set = house.drop(index=train_set.index)

#### No Feedback

In [6]:
# Read the synthetic "no feedback" house-price sample and discard the
# 'Unnamed: 0' column in one chained expression.
syn_data_nof = (
    pd.read_csv('sampled/house_nof_500e_200b.csv')
    .drop(columns='Unnamed: 0')
)

In [7]:
# Load the house-price CSV from the raw-data directory.
# NOTE(review): `data` is not referenced by any other cell visible in this
# notebook — confirm whether this load is still needed.
data = pd.read_csv('datasets/raw/house_price.csv')

In [8]:
# Shuffle the synthetic "no feedback" sample and split it into partitions; each
# partition is later used to train a separate model.
dfs = cv_shuffle(syn_data_nof)

In [9]:
# Fit one linear regression per synthetic partition and score each model on
# val_set, collecting RMSE and R^2 for every partition.
res_dict = {'rmse':[], 'r2':[]}

# The validation features/targets are the same for every partition, so slice
# them once. The last column is used as the target, all others as features.
X_val = val_set.iloc[:, :-1]
y_val = val_set.iloc[:, -1]

for df in dfs:
    linreg = LinearRegression()
    linreg.fit(df.iloc[:, :-1], df.iloc[:, -1])
    # Predict once and reuse the result for both metrics.
    y_pred = linreg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # Despite its name, rsqrt holds the coefficient of determination (R^2).
    rsqrt = r2_score(y_val, y_pred)
    res_dict['rmse'].append(rmse)
    res_dict['r2'].append(rsqrt)
    print(f'rmse:{rmse}, r2:{rsqrt}')

rmse:0.011593275383325295, r2:0.3806921606739512
rmse:0.011790829085683758, r2:0.3594058601632941
rmse:0.012123981225061274, r2:0.32269421726899383
rmse:0.011613713248734208, r2:0.37850667181156883
rmse:0.01176469542093801, r2:0.36224239007582903


In [10]:
# Summarise the per-partition scores: mean and confidence interval per metric
# (calc_ci also prints the resulting dictionary).
res = calc_ci(res_dict)

{'rmse': {'mean': 0.01177729887274851, 'confidence_interval': (0.011610457478498537, 0.011944140266998481)}, 'r2': {'mean': 0.3607082599987274, 'confidence_interval': (0.3424679249360426, 0.3789485950614122)}}


#### Feedback

In [15]:
# Read the synthetic "feedback" house-price sample and discard the
# 'Unnamed: 0' column in one chained expression.
syn_data_feedback = (
    pd.read_csv('sampled/house_feedback_500e_200b_new.csv')
    .drop(columns='Unnamed: 0')
)

In [16]:
# Shuffle the synthetic "feedback" sample and split it into partitions; each
# partition is later used to train a separate model.
dfs = cv_shuffle(syn_data_feedback)

In [17]:
# Fit one linear regression per synthetic partition and score each model on
# val_set, collecting RMSE and R^2 for every partition.
res_dict = {'rmse':[], 'r2':[]}

# The validation features/targets are the same for every partition, so slice
# them once. The last column is used as the target, all others as features.
X_val = val_set.iloc[:, :-1]
y_val = val_set.iloc[:, -1]

for df in dfs:
    linreg = LinearRegression()
    linreg.fit(df.iloc[:, :-1], df.iloc[:, -1])
    # Predict once and reuse the result for both metrics.
    y_pred = linreg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # Despite its name, rsqrt holds the coefficient of determination (R^2).
    rsqrt = r2_score(y_val, y_pred)
    res_dict['rmse'].append(rmse)
    res_dict['r2'].append(rsqrt)
    print(f'rmse:{rmse}, r2:{rsqrt}')

rmse:0.011470106099226779, r2:0.39378155866730014
rmse:0.011456215973495672, r2:0.39524891246133453
rmse:0.011457711581260717, r2:0.3950910017753292
rmse:0.01163397269274467, r2:0.37633646298299817
rmse:0.011495882484004313, r2:0.39105382839258496


In [18]:
# Summarise the per-partition scores: mean and confidence interval per metric
# (calc_ci also prints the resulting dictionary).
res = calc_ci(res_dict)

{'rmse': {'mean': 0.011502777766146432, 'confidence_interval': (0.011443944814425643, 0.01156161071786722)}, 'r2': {'mean': 0.39030235285590936, 'confidence_interval': (0.3840413151194081, 0.39656339059241064)}}
