In [1]:
import pandas as pd
import numpy as np
import sklearn
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization

Selection of procedure:

In [2]:
# TODO: Implement!

In [3]:
# Select preprocessing methods
# You can choose between: None, ...
preprocess_method = ['None']

# Select regression model
# You can choose between: ...
regression_model = ''

# Select hyperparameter optimization method
# You can choose between: ...
hyp_opt_method = ''

Basic functions:

In [4]:
def create_one_hot_encoded_vector(mapping: dict, data: np.ndarray) -> np.ndarray:
    """
    Creates one-hot encoded vector
    
    :param mapping: A dictionary mapping category labels to unique indices.
    :param data: Should have shape (n_samples,)
    :return: returns one-hot encoded vector with shape (n_samples, len(mapping))
    """
    num_samples = len(data)
    num_categories = len(mapping)
    one_hot_encoded = np.zeros((num_samples, num_categories), dtype=int)

    for i, value in enumerate(data):
        index = int(mapping[value])
        one_hot_encoded[i, index] = 1

    return one_hot_encoded
    

## Load Data and make temporary train-test-split

In [5]:
df_train = pd.read_csv('C:/Users/Johannes/Documents/A_KIT/Informatik/Praktikum Smart Data Analytics/Übung 1/Aufgabe 1/teco-psda-exercisesheet1-credit-2024/credit_train.csv')
df_test = pd.read_csv('C:/Users/Johannes/Documents/A_KIT/Informatik/Praktikum Smart Data Analytics/Übung 1/Aufgabe 1/teco-psda-exercisesheet1-credit-2024/credit_test.csv')

print(df_test.columns)
print(df_train.columns)

Index(['Unnamed: 0', 'Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
       'Gender', 'Student', 'Married', 'Ethnicity'],
      dtype='object')
Index(['Unnamed: 0', 'Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
       'Gender', 'Student', 'Married', 'Ethnicity', 'Balance'],
      dtype='object')


In [6]:
train = df_train.values
train_ids = train[:, 0]
train_balance = train[:, -1]
train_data = train[:, 1:-1]

test = df_test.values 

print(train.shape)
print(train_ids.shape)
print(train_balance.shape)
print(train_data.shape)
print(test.shape)

(350, 12)
(350,)
(350,)
(350, 10)
(50, 11)


In [7]:
np.mean(train_balance)

521.7085714285714

Encoding for Strings in data:

In [8]:
map_gender = np.vectorize(lambda x: 0 if x == 'Female' else 1)
map_no_and_yes = np.vectorize(lambda x: 0 if x == 'No' else 1)

# One-hot encoding for ethnicity
different_ethnicities = set(test[:, -1]).union(set(train_data[:, -1]))
dict_different_ethnicities = dict(zip(different_ethnicities, range(len(different_ethnicities))))

In [9]:
# TODO: Make function!
converted_train_data = np.zeros((train_data.shape[0], train_data[:, :9].shape[1] + len(different_ethnicities)))

print(converted_train_data.shape)

converted_train_data[:, :6] = train_data[:, :6]
converted_train_data[:, 6] = map_gender(train_data[:, 6])
converted_train_data[:, 7] = map_no_and_yes(train_data[:, 7])
converted_train_data[:, 8] = map_no_and_yes(train_data[:, 8])
converted_train_data[:, 9:] = create_one_hot_encoded_vector(dict_different_ethnicities, train_data[:, 9])

(350, 12)


In [10]:
temp_train_ids, temp_test_ids, temp_train_data, temp_test_data, temp_train_balance, temp_test_balance = train_test_split(train_ids, converted_train_data, train_balance, test_size=50, random_state=4)

## Preprocess data

In [11]:
snv = sklearn.preprocessing.StandardScaler()
prep_temp_train_data = snv.fit_transform(temp_train_data)
prep_temp_test_data = snv.transform(temp_test_data)

## Create regression models 

In [39]:
logistic_regression = sklearn.linear_model.LogisticRegression()
logistic_regression.fit(prep_temp_train_data, temp_train_balance.astype(int))

In [40]:
temp_test_balance_pred = logistic_regression.predict(prep_temp_test_data)

print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

165.76


In [41]:
gaussian_process = sklearn.gaussian_process.GaussianProcessRegressor()
gaussian_process.fit(prep_temp_train_data, temp_train_balance)

In [42]:
temp_test_balance_pred = gaussian_process.predict(prep_temp_test_data)

print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

274.9256429764146


In [43]:
def hyperparameter_function_gp(x, y):
    """ Function for hyperparameter optimization
    """
    pass

In [44]:
svm = sklearn.svm.SVR()
svm.fit(prep_temp_train_data, temp_train_balance)

In [45]:
temp_test_balance_pred = svm.predict(prep_temp_test_data)

print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

370.76696834150954


In [46]:
def hyperparameter_function_svr(gamma, C, epsilon):
    """ Function for hyperparameter optimization
    """
    svm = sklearn.svm.SVR(gamma=gamma, C=C, epsilon=epsilon)
    svm.fit(prep_temp_train_data, temp_train_balance)
    temp_test_balance_pred = svm.predict(prep_temp_test_data)
    return - sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred)

# Bounded region of parameter space
pbounds = {'gamma': (0, 1.5), 'C': (0.00001, 2), 'epsilon': (0, 0.5)}

optimizer = BayesianOptimization(
    f=hyperparameter_function_svr,
    pbounds=pbounds,
    random_state=6,
)

In [47]:
optimizer.maximize(
    init_points=50,
    n_iter=100,
)

|   iter    |  target   |     C     |  epsilon  |   gamma   |
-------------------------------------------------------------
| [0m1        [0m | [0m-377.6   [0m | [0m1.786    [0m | [0m0.166    [0m | [0m1.232    [0m |
| [0m2        [0m | [0m-377.9   [0m | [0m0.0834   [0m | [0m0.05383  [0m | [0m0.8926   [0m |
| [95m3        [0m | [95m-376.9   [0m | [95m1.06     [0m | [95m0.2094   [0m | [95m0.5031   [0m |
| [0m4        [0m | [0m-377.6   [0m | [0m1.245    [0m | [0m0.2191   [0m | [0m1.104    [0m |
| [0m5        [0m | [0m-377.6   [0m | [0m1.036    [0m | [0m0.2894   [0m | [0m0.968    [0m |
| [95m6        [0m | [95m-376.6   [0m | [95m1.98     [0m | [95m0.4099   [0m | [95m0.6198   [0m |
| [95m7        [0m | [95m-365.9   [0m | [95m1.753    [0m | [95m0.4119   [0m | [95m0.08171  [0m |
| [0m8        [0m | [0m-377.6   [0m | [0m1.437    [0m | [0m0.4011   [0m | [0m1.105    [0m |
| [0m9        [0m | [0m-373.0   [0m | [0

In [53]:
svm = sklearn.svm.SVR(gamma=2, C=0.3817, epsilon=0.4847)
svm.fit(prep_temp_train_data, temp_train_balance)
temp_test_balance_pred = svm.predict(prep_temp_test_data)
print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

377.8795372873753


In [58]:
random_forest = sklearn.ensemble.RandomForestRegressor(n_estimators=100, random_state=6)
random_forest.fit(prep_temp_train_data, temp_train_balance)

In [59]:
temp_test_balance_pred = random_forest.predict(prep_temp_test_data)

print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

55.192800000000005


In [51]:
adaboost = sklearn.ensemble.AdaBoostRegressor()
adaboost.fit(prep_temp_train_data, temp_train_balance)

In [52]:
temp_test_balance_pred = adaboost.predict(prep_temp_test_data)

print(sklearn.metrics.mean_absolute_error(temp_test_balance, temp_test_balance_pred))

115.56893756438748
