# PHW 2 : The California Housing Prices Dataset
> 201835503 이지민, 201835474 안해빈, 202037634 윤주은, 201835508 임윤수

# Table of Contents
> ## 1. Objective
> ## 2. End-To-End Process
>> ### 2-1. Data Inspection, Data Preparation
>> ### 2-2. Analysis Algorithms, Evaluation + Choose Parameters
> ## 3. Function - "AutoML"
> ## 4. Main Program
> ## 5. Conclusion

# 1. Objective

> We would like to show different combinations of algorithms to analyze a dataset
> We will create a program structure with a 
>    single major function **“AutoML”** that will automatically run different combinations of the following : 
> -	Data Scaling, Data Encoding
> -	Various Clustering Algorithms
> -	Various quality measuring tools
> -	Various values of parameters and hyperparameters
> -	Various subsets of the features of the dataset

# 2. End-to-End Process
# 2.1
> ## Dataset
>> https://www.kaggle.com/camnugent/california-housing-prices
> ## Dataset Inspection

In [None]:
import pandas as pd
import numpy as np

# Data exploration: load the raw Kaggle CSV from the working directory.
df = pd.read_csv('housing.csv', sep=',')
print("__California Housing Prices Dataset\n")
# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
from matplotlib import pyplot as plt
# Visualise how the target splits into N equal-width price bands, one
# scatter plot per N, with every record coloured by the band it falls in.
K = [4, 5, 6]

# One colour per band; six entries cover the largest N tried above.
colors = ['blue', 'red', 'green', 'yellow', 'black', 'magenta']

target = df['median_house_value']
# Deliberately NOT named min/max: rebinding the builtins in the notebook
# namespace breaks every later cell that calls min(...) / max(...).
value_min = target.min()
value_max = target.max()
delta = value_max - value_min

for N in K:
    # N + 1 evenly spaced edges -> N bands covering [value_min, value_max].
    intervals = np.linspace(value_min, value_max, N + 1)
    # Nudge the first edge down so the minimum lands inside the first band
    # (pd.cut intervals are open on the left).
    intervals[0] -= delta * 0.001
    print('for N = ', N, intervals)

    # Integer band index (0 .. N-1) for every record.
    band = pd.cut(target, bins=intervals, labels=False)

    for band_index in range(N):
        in_band = band == band_index
        plt.scatter(df.index[in_band], target[in_band],
                    color=colors[band_index],
                    label='band ' + str(band_index + 1))
    plt.legend()
    plt.show()

In [None]:
import pandas as pd
import numpy as np

def targetValueLabeling(n, values=None):
    """Label each target value with the equal-width level it falls into.

    The value range [min, max] is split into ``n`` bins of equal width and
    every value receives the index (0 .. n-1) of its bin.

    Parameters
    ----------
    n : int
        Number of levels; must be at least 1.
    values : sequence or pandas.Series, optional
        Values to label. When omitted, the ninth column (position 8) of
        'housing.csv' in the working directory is used.

    Returns
    -------
    pandas.Series
        Categorical Series named "level", aligned with the input.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    if values is None:
        values = pd.read_csv('housing.csv', sep=',').iloc[:, 8]
    values = pd.Series(values)

    # n + 1 evenly spaced edges from min to max give exactly n bins.  The
    # edges must start at the minimum; multiples of the bin width alone
    # are only correct when the minimum happens to be zero.
    bins = np.linspace(values.min(), values.max(), n + 1)

    # include_lowest=True keeps the minimum itself inside the first bin
    # instead of turning it into NaN.
    levels = pd.cut(values, bins, right=True, labels=list(range(n)),
                    include_lowest=True)
    return levels.rename("level")

# Lift the row cap so the notebook renders every labelled record, then
# display the 5-level labelling of the target.
pd.options.display.max_rows = None
targetValueLabeling(5)

In [None]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Display summary statistics of the frame's columns.
df.describe()

> ## Data Preprocessing
> - Drop target value : "median_house_value"
> - Clean dirty data

In [None]:
# Data preprocessing_1 : Drop attribute "median_house_value", because it is the answer value
# Dropped by name rather than by position 8 so that re-running this cell
# is harmless; a second positional drop would remove 'ocean_proximity'.
df.drop(columns=['median_house_value'], errors='ignore', inplace=True)
print("__Drop attribute 'median_house_value', because it is the answer value\n")
df.head()

In [None]:
# Data preprocessing_2 : find NA
print("__Find NA\n")
print(df.isna().sum(), "\n")
# Replace NA with the median of the column it occurs in.  A positional
# lookup (iloc[:, 5]) points at 'population' once the target has been
# dropped, which would fill 'total_bedrooms' gaps with the wrong median.
df = df.fillna(df.median(numeric_only=True))
print("__NA replaced to MEDIAN value. Find NA again\n")
print(df.isna().sum(), "\n")

# 2.2
> ## Analysis Algorithms, Evaluation
> ### Algo : K-means, EM(GMM), CLARANS, DBSCAN, Spectral Clustering
> ### Scaler : StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
> ### Encoder : LabelEncoder, OneHotEncoder

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from sklearn.mixture import GaussianMixture
from pyclustering.cluster.clarans import clarans
from pyclustering.utils import timedcall
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering

In [None]:
# Encoding (attribute 'ocean_proximity')
label = df['ocean_proximity']
# Select the categorical column by name: position 8 only points at it
# when the target-drop cell has been run exactly once beforehand.
df_encoded = df.drop(columns='ocean_proximity')

# Label encoding: one integer code per category
le = LabelEncoder()
label_encoded = le.fit_transform(label)
# Reuse df's index so the concat below pairs rows by label, not by accident.
df_label_encoded = pd.DataFrame(label_encoded,
                                columns=['labelEncoded_oceanProximity'],
                                index=df.index)
df_label_encoded = pd.concat([df_encoded, df_label_encoded], axis=1)

# One-hot encoding: one indicator column per category
df_oneHot_encoded = pd.get_dummies(label)
df_oneHot_encoded = pd.concat([df_encoded, df_oneHot_encoded], axis=1)

In [None]:
# Encoding + Scale
# Build every (encoding, scaler) combination.  The variable names matter:
# later cells look these frames up by name through data_list.
# One-hot encoded frame, five scalers.
df_oneHot_standard = StandardScaler().fit_transform(df_oneHot_encoded)
df_oneHot_robust = RobustScaler().fit_transform(df_oneHot_encoded)
df_oneHot_minmax = MinMaxScaler().fit_transform(df_oneHot_encoded)
df_oneHot_maxabs = MaxAbsScaler().fit_transform(df_oneHot_encoded)
# NOTE(review): Normalizer rescales each row (sample), whereas the other
# scalers work per column -- confirm this is the intended comparison.
df_oneHot_normalizer = Normalizer().fit_transform(df_oneHot_encoded)

# Label-encoded frame, same five scalers.
df_label_standard = StandardScaler().fit_transform(df_label_encoded)
df_label_robust = RobustScaler().fit_transform(df_label_encoded)
df_label_minmax = MinMaxScaler().fit_transform(df_label_encoded)
df_label_maxabs = MaxAbsScaler().fit_transform(df_label_encoded)
df_label_normalizer = Normalizer().fit_transform(df_label_encoded)

In [None]:
# Change format of scaled data (np.arrays -> DataFrame)
# Column labels and index come straight from the encoded frames, so this
# keeps working when the number of one-hot categories or features changes.
label_columns = df_label_encoded.columns
label_index = df_label_encoded.index
df_label_standard = pd.DataFrame(df_label_standard, columns=label_columns, index=label_index)
df_label_robust = pd.DataFrame(df_label_robust, columns=label_columns, index=label_index)
df_label_minmax = pd.DataFrame(df_label_minmax, columns=label_columns, index=label_index)
df_label_maxabs = pd.DataFrame(df_label_maxabs, columns=label_columns, index=label_index)
df_label_normalizer = pd.DataFrame(df_label_normalizer, columns=label_columns, index=label_index)

oneHot_columns = df_oneHot_encoded.columns
oneHot_index = df_oneHot_encoded.index
df_oneHot_standard = pd.DataFrame(df_oneHot_standard, columns=oneHot_columns, index=oneHot_index)
df_oneHot_robust = pd.DataFrame(df_oneHot_robust, columns=oneHot_columns, index=oneHot_index)
df_oneHot_minmax = pd.DataFrame(df_oneHot_minmax, columns=oneHot_columns, index=oneHot_index)
df_oneHot_maxabs = pd.DataFrame(df_oneHot_maxabs, columns=oneHot_columns, index=oneHot_index)
df_oneHot_normalizer = pd.DataFrame(df_oneHot_normalizer, columns=oneHot_columns, index=oneHot_index)

# Names (not objects) of the frames the clustering cells iterate over;
# those cells resolve each name with eval().
# NOTE(review): the two Normalizer frames are not listed -- confirm that
# leaving them out of the comparison is intentional.
data_list = ['df_label_standard', 'df_label_robust', 'df_label_minmax', 'df_label_maxabs',
             'df_oneHot_standard', 'df_oneHot_robust', 'df_oneHot_minmax', 'df_oneHot_maxabs']

In [None]:
# K-means
for K in [4, 5, 6] :
    print("\n__Result of K-Means Clustering, K : ", K)

    for data in data_list:
        # data_list holds variable names defined in this notebook;
        # resolve each one once instead of on every use.
        frame = eval(data)

        # algorithm='auto' is deprecated in recent scikit-learn releases;
        # the library default is used instead.
        clust_model = KMeans(n_clusters = K, random_state = 42)
        pred = clust_model.fit_predict(frame)
        centers = clust_model.cluster_centers_
        df_clusted = frame.copy()
        df_clusted['cluster'] = pred

        #Silhouette score (computed on the features only, not on df_clusted)
        score = silhouette_score(frame, pred, metric="euclidean")
        print(data, ": Silhouette Score:", score)

# Visualization
# Runs after the loops, so it shows only the last (K, data) combination,
# projected on the first three columns.
fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(111, projection = '3d')

X = df_clusted

ax.scatter(X.iloc[:,0],X.iloc[:,1],X.iloc[:,2], c = X.cluster, s = 10, cmap = 'rainbow', alpha = 1)
ax.scatter(centers[:,0],centers[:,1],centers[:,2], c = 'black', marker = '*')
plt.show()

In [None]:
# GMM: Gaussian mixture clustering for every prepared frame and K
for K in [4, 5, 6]:
    print("\n__Result of GMM Clustering, K : ", K)
    for data in data_list:
        # data_list stores variable names; look the frame up once.
        frame = eval(data)

        gmm = GaussianMixture(n_components=K, random_state=42)
        gmm_cluster_labels = gmm.fit(frame).predict(frame)

        # Copy of the features with the assigned component attached.
        df_clusted = frame.copy()
        df_clusted['gmm_cluster'] = gmm_cluster_labels

        # Silhouette score on the features only
        score = silhouette_score(frame, gmm_cluster_labels, metric="euclidean")
        print(data, "Silhouette Score:", score)

# Visualization of the last combination, using the first three columns
X = df_clusted

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2],
           c=X.gmm_cluster, s=10, cmap='rainbow', alpha=1)
plt.show()

In [None]:
# CLARANS

# The pyclustering clarans implementation requires a list of lists as its
# input dataset, so the scaled frame is converted first.  The list goes
# into its own variable: rebinding df_label_standard would turn it into a
# plain list and break every later cell that treats it as a DataFrame.
clarans_data = df_label_standard.values.tolist()

K = 3    # number of clusters
I = 100  # second clarans argument (prompted for as "amount of iterations" in autoML)
N = 10   # third clarans argument (prompted for as "max neighbors" in autoML)

clarans_instance = clarans(clarans_data, K, I, N)

clarans_instance.process()

clusters = clarans_instance.get_clusters()
medoids = clarans_instance.get_medoids()

print("Index of the points that are in a cluster : ", clusters)
print("The index of medoids that algorithm found to be best : ", medoids)

In [None]:
# DBSCAN

for e in [0.4, 0.6] :
    for threshold in [10, 15] :
        print("e : ", e, " threshold : " , threshold) 
        for data in data_list:
            # data_list stores variable names; resolve the frame once.
            features = eval(data)

            # fit_predict fits once and returns the labels (-1 = outlier).
            model = DBSCAN(eps=e, min_samples=threshold)
            labels = model.fit_predict(features)

            df_scaled = features.copy()
            df_scaled['cluster'] = labels

            #Silhouette score
            # Scored on the features only: including the 'cluster' column
            # would let the labels influence the distances.  The score is
            # undefined unless 2 <= number of labels <= n_samples - 1.
            if 2 <= len(set(labels)) <= len(labels) - 1:
                score = silhouette_score(features, labels, metric="euclidean")
                print(data, "Silhouette Score:", score)
            else:
                print(data, "Silhouette Score: undefined (needs at least 2 distinct labels)")

print("\n__Result of DBSCAN Clustering **Outlier = -1**\n")

# Visualization of the last combination, using the first three columns
fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(111, projection = '3d')

X = df_scaled

ax.scatter(X.iloc[:,0],X.iloc[:,1],X.iloc[:,2], c = X.cluster, s = 10, cmap = 'rainbow', alpha = 1)
plt.show()

In [None]:
# Spectral clustering with a fixed number of clusters
K = 4

for data in data_list:
    # data_list stores variable names; eval() resolves the frame.
    sc = SpectralClustering(n_clusters=K)
    sc.fit(eval(data))
    print(data, sc.labels_)
    


> ### Choose Parameters :

# 3. Function "AutoML"
> ### The function AutoML is an all-in-one clustering function. It automatically runs different combinations of the following :
> -	Data Scaling, Data Encoding
> -	Various Clustering Algorithms
> -	Various quality measuring tools
> -	Various values of parameters and hyperparameters
> -	Various subsets of the features of the dataset
***
> ### autoML(encoder_list, scaler_list, model_list, df)

In [None]:
def _encode_ocean_proximity(df, encoder_name):
    """Encode 'ocean_proximity' with the named encoder; return df as-is when the column is absent."""
    if 'ocean_proximity' not in df.columns:
        return df

    label = df['ocean_proximity']
    df_encoded = df.drop(labels='ocean_proximity', axis=1)
    if encoder_name == 'LabelEncoder()':
        # Reuse df's index so the concat pairs rows by label.
        encoded = pd.DataFrame(LabelEncoder().fit_transform(label),
                               columns=['labelEncoded_oceanProximity'],
                               index=df.index)
    else:  # 'OneHotEncoder()' -- the only other name that passes validation
        encoded = pd.get_dummies(label)
    print("__", encoder_name, " Finished.")
    return pd.concat([df_encoded, encoded], axis=1)


def _scale_frame(df_encoded, scaler_name):
    """Fit the named scaler and return the result as a DataFrame labelled like df_encoded."""
    # Built inside the function so the scaler classes are only looked up
    # when scaling is actually requested.
    scalers = {
        'StandardScaler()': StandardScaler,
        'MinMaxScaler()': MinMaxScaler,
        'RobustScaler()': RobustScaler,
        'MaxAbsScaler()': MaxAbsScaler,
        'Normalizer()': Normalizer,
    }
    scaled = scalers[scaler_name]().fit_transform(df_encoded)
    # Change format of scaled data (np.arrays -> DataFrame)
    return pd.DataFrame(scaled, columns=df_encoded.columns, index=df_encoded.index)


def _print_scores(features, labels, y=None):
    """Print the silhouette score and, when ground truth y is given, the purity."""
    # The silhouette is undefined unless 2 <= number of labels <= n_samples - 1.
    if 2 <= len(set(labels)) <= len(labels) - 1:
        score = silhouette_score(features, labels, metric="euclidean")
        print("Silhouette Score:", score)
    else:
        print("Silhouette Score: undefined (needs at least 2 distinct labels)")

    if y is not None:
        # Purity: every cluster votes for its majority class.
        table = pd.crosstab(np.asarray(y), np.asarray(labels)).to_numpy()
        print("Purity Score:", table.max(axis=0).sum() / table.sum())


def _plot_clusters(features, labels, centers=None):
    """3-D scatter of the first three feature columns, coloured by cluster label."""
    if features.shape[1] < 3:
        print("Visualization skipped: fewer than 3 feature columns.")
        return

    fig = plt.figure(figsize = (8, 8))
    ax = fig.add_subplot(111, projection = '3d')
    ax.scatter(features.iloc[:,0], features.iloc[:,1], features.iloc[:,2],
               c = labels, s = 10, cmap = 'rainbow', alpha = 1)
    if centers is not None:
        ax.scatter(centers[:,0], centers[:,1], centers[:,2], c = 'black', marker = '*')
    plt.show()


def autoML(encoder_list, scaler_list, model_list, df, y=None):
    """Run every encoder x scaler x model combination on df and report the results.

    Parameters
    ----------
    encoder_list : list of str
        Encoders for the 'ocean_proximity' column; each must be one of
        'LabelEncoder()' or 'OneHotEncoder()'.
    scaler_list : list of str
        Scalers to try; each must be one of 'StandardScaler()',
        'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()', 'Normalizer()'.
    model_list : list of str
        Clustering models to try; each must be one of 'KMeans', 'GMM',
        'CLARANS', 'DBSCAN', 'Spectral'.
    df : pandas.DataFrame
        Features to cluster.  When it has no 'ocean_proximity' column the
        encoding step is skipped.
    y : array-like, optional
        Ground-truth labels, one per row of df.  When given, a purity
        score is printed next to each silhouette score.

    Returns
    -------
    None
        Results are printed and plotted.  If any requested name is not
        supported, a message is printed and nothing is run.

    Notes
    -----
    'CLARANS' and 'Spectral' prompt for their parameters with input(), so
    they block until the user answers.
    """
    # Set supported scalers, encoders and models
    encoder = ['LabelEncoder()', 'OneHotEncoder()']
    scaler = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()', 'Normalizer()']
    model = ['KMeans', 'GMM', 'CLARANS', 'DBSCAN', 'Spectral']

    # Verify that the autoML function supports the scaler, encoder, and model entered as a function parameter.
    notspt = False
    for e in encoder_list:
        if e not in encoder:
            print("The entered encoder", e, "is not supported in the autoML function.\n")
            notspt = True
    for s in scaler_list:
        if s not in scaler:
            print("The entered scaler", s, "is not supported in the autoML function.\n")
            notspt = True
    for m in model_list:
        if m not in model:
            print("The entered model", m, "is not supported in the autoML function.\n")
            notspt = True
    if notspt:
        print("Try again with supported parameters.\n")
        return

    for e in encoder_list:
        # Encoding (attribute 'ocean_proximity')
        df_encoded = _encode_ocean_proximity(df, e)

        for s in scaler_list:
            # Scale.  df_scaled is treated as read-only below: every model
            # must see the same features, so none of them may add columns
            # to it or rebind it.
            df_scaled = _scale_frame(df_encoded, s)

            for m in model_list:
                # Clustering
                # K-Means
                if m == 'KMeans':
                    K = 6
                    # algorithm='auto' is deprecated in recent scikit-learn
                    # releases; the library default is used instead.
                    clust_model = KMeans(n_clusters = K, random_state = 42)
                    pred = clust_model.fit_predict(df_scaled)
                    centers = clust_model.cluster_centers_

                    print("\n__Result of K-Means Clustering\n")
                    print("Scaler : ",s,"Encoder : ", e)
                    print("Column list : ", df.columns.to_list())

                    _print_scores(df_scaled, pred, y)
                    _plot_clusters(df_scaled, pred, centers)

                # GMM
                if m == 'GMM':
                    K = 6
                    gmm = GaussianMixture(n_components = K, random_state = 42)
                    gmm.fit(df_scaled)
                    gmm_cluster_labels = gmm.predict(df_scaled)

                    print("\n__Result of GMM Clustering\n")
                    print("Scaler : ",s,"Encoder : ", e)
                    print("Column list : ", df.columns.to_list())

                    _print_scores(df_scaled, gmm_cluster_labels, y)
                    _plot_clusters(df_scaled, gmm_cluster_labels)

                # CLARANS
                if m == 'CLARANS':
                    # The pyclustering clarans implementation requires a
                    # list of lists; convert into a separate variable so
                    # df_scaled stays a DataFrame for the models that follow.
                    clarans_data = df_scaled.values.tolist()

                    K = 4
                    I = int(input("Input the amount of iterations: "))
                    N = int(input("Input the number of max neighbors: "))

                    clarans_instance = clarans(clarans_data, K, I, N)

                    clarans_instance.process()

                    clusters = clarans_instance.get_clusters()
                    medoids = clarans_instance.get_medoids()

                    print("Index of the points that are in a cluster : ", clusters)
                    print("The index of medoids that algorithm found to be best : ", medoids)

                # DBSCAN
                if m == 'DBSCAN':
                    eps = 0.6
                    threshold = 10

                    # fit_predict fits once and returns the labels (-1 = outlier).
                    dbscan_labels = DBSCAN(eps=eps, min_samples=threshold).fit_predict(df_scaled)

                    print("\n__Result of DBSCAN Clustering **Outlier = -1**\n")
                    print("Scaler : ",s,"Encoder : ", e)
                    print("Column list : ", df.columns.to_list())

                    # Scored on the features only, never on a frame that
                    # already contains the cluster column.
                    _print_scores(df_scaled, dbscan_labels, y)
                    _plot_clusters(df_scaled, dbscan_labels)

                # Spectral clustering
                if m == 'Spectral':
                    K = int(input("Input the number of clusters: "))

                    sc = SpectralClustering(n_clusters = K).fit(df_scaled)
                    print(sc.labels_)





> ## Parameters::
>> ### scaler_list (required)
>>  List of scaler names to apply to the encoded dataset. There is no default value.
>>  <br>**Supported Scaler = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()', 'Normalizer()']**
>>  <br> Only supported names are accepted. Otherwise a message is printed and the function returns without running anything.
>> ### encoder_list (required)
>>  List of encoder names to apply to the categorical attribute `ocean_proximity`. There is no default value.
>>  <br>**Supported Encoder = ['LabelEncoder()', 'OneHotEncoder()']**
>>  <br> Only supported names are accepted. Otherwise a message is printed and the function returns without running anything.
>> ### model_list (required)
>>  List of clustering model names to run on every encoded and scaled dataset. There is no default value.
>>  <br>**Supported Model = ['KMeans', 'GMM', 'CLARANS', 'DBSCAN', 'Spectral']**
>>  <br> Only supported names are accepted. Otherwise a message is printed and the function returns without running anything.
>> ### training_dataset (required, passed as `df`)
>>  The pandas DataFrame to cluster. There is no default value; the function cannot run without it.
> ## Attributes::
>> ### encoder
>>  encoder = ['LabelEncoder()', 'OneHotEncoder()']
>> ### scaler
>> scaler = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()', 'Normalizer()']
>> ### model
>> model = ['KMeans', 'GMM', 'CLARANS', 'DBSCAN', 'Spectral']

# 4. Main Program

In [None]:
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from sklearn.mixture import GaussianMixture
from pyclustering.cluster.clarans import clarans
from pyclustering.utils import timedcall
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
import pandas as pd
import numpy as np

warnings.filterwarnings(action='ignore')

def load_df(attList):
    """Load 'housing.csv', clean it and return the requested attributes.

    The target 'median_house_value' is removed and every missing numeric
    value is replaced by the median of its own column.

    Parameters
    ----------
    attList : iterable of str
        Column names to keep, in the desired order.  Names that are not
        in the cleaned frame (including the dropped target) are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy holding only the requested columns.
    """
    # Data exploration
    df = pd.read_csv('housing.csv', sep=',')
    print("__California Housing Prices Dataset\n")
    # Drop the target value: "median_house_value" (by name, so a change in
    # column order cannot make this remove a different attribute)
    df = df.drop(columns=['median_house_value'], errors='ignore')
    print("__Drop attribute 'median_house_value', because it is the answer value\n")
    # replace NA to MEDIAN of the column the NA occurs in; a positional
    # lookup (iloc[:, 5]) would pick the 'population' median instead
    df = df.fillna(df.median(numeric_only=True))
    print("__NA replaced to MEDIAN value.\n")

    # dict.fromkeys removes repeated names while keeping their order.
    selected = [name for name in dict.fromkeys(attList) if name in df.columns]
    return df[selected].copy()


#### This is how the function should be used
# First configuration: every encoder and scaler, all models except CLARANS.
encoder_list_1 = ['LabelEncoder()', 'OneHotEncoder()']
scaler_list_1 = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()', 'Normalizer()']
model_list_1 = ['KMeans', 'GMM', 'DBSCAN', 'Spectral']

# Second, smaller configuration (no MaxAbs/Normalizer scaling, no Spectral).
encoder_list_2 = ['LabelEncoder()', 'OneHotEncoder()']
scaler_list_2 = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()']
model_list_2 = ['KMeans', 'GMM', 'DBSCAN']

# Feature subsets.  'median_house_value' is listed here, but load_df drops
# that column before selecting, so it never reaches the dataset.
att_list1 = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income',
            'median_house_value','ocean_proximity']

att_list2 = ['longitude','latitude','housing_median_age','total_rooms']

dataset_1 = load_df(att_list1)

dataset_2 = load_df(att_list2)

# Full configuration, currently disabled.
# autoML(encoder_list_1, scaler_list_1, model_list_1, dataset_1)
# autoML(encoder_list_1, scaler_list_1, model_list_1, dataset_2)

# Run the smaller configuration on both feature subsets.
autoML(encoder_list_2, scaler_list_2, model_list_2, dataset_1)
autoML(encoder_list_2, scaler_list_2, model_list_2, dataset_2)

# 5. Final Code

In [None]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from sklearn.mixture import GaussianMixture
from pyclustering.cluster.clarans import clarans
from pyclustering.utils import timedcall
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import metrics
from sklearn.metrics import *
from pyclustering.cluster import cluster_visualizer_multidim

def load_df(attList):
    """Load 'housing.csv', clean it and return the requested attributes.

    The target 'median_house_value' is removed and every missing numeric
    value is replaced by the median of its own column.

    Parameters
    ----------
    attList : iterable of str
        Column names to keep, in the desired order.  Names that are not
        in the cleaned frame (including the dropped target) are ignored.

    Returns
    -------
    pandas.DataFrame
        A copy holding only the requested columns.
    """
    # Data exploration
    df = pd.read_csv('housing.csv', sep=',')
    print("__California Housing Prices Dataset\n")
    # Drop the target value: "median_house_value" (by name, so a change in
    # column order cannot make this remove a different attribute)
    df = df.drop(columns=['median_house_value'], errors='ignore')
    print("__Drop attribute 'median_house_value', because it is the answer value\n")
    # replace NA to MEDIAN of the column the NA occurs in; a positional
    # lookup (iloc[:, 5]) would pick the 'population' median instead
    df = df.fillna(df.median(numeric_only=True))
    print("__NA replaced to MEDIAN value.\n")

    # dict.fromkeys removes repeated names while keeping their order.
    selected = [name for name in dict.fromkeys(attList) if name in df.columns]
    return df[selected].copy()

def purity_score(y_true, y_pred):
    """Return the clustering purity of y_pred with respect to y_true.

    Each predicted cluster contributes the size of its largest true class;
    the total is divided by the number of counted samples.
    """
    # Rows: true classes, columns: predicted clusters.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    majority_per_cluster = table.max(axis=0)
    return majority_per_cluster.sum() / table.sum()
# Configuration shared by both runs.
encoder_list = ['LabelEncoder()', 'OneHotEncoder()']

scaler_list_1 = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()']
model_list_1 = ['KMeans', 'GMM','DBSCAN']

scaler_list_2 = ['StandardScaler()', 'MinMaxScaler()', 'RobustScaler()', 'MaxAbsScaler()']
model_list_2 = ['CLARANS', 'MeanShift']

# 'median_house_value' is listed, but load_df drops that column before
# selecting, so it never reaches the dataset.
att_list1 = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households',
             'median_income', 'median_house_value', 'ocean_proximity']
att_list2 = ['longitude', 'latitude', 'housing_median_age', 'total_rooms']

dataset_1 = load_df(att_list1)
dataset_2 = load_df(att_list2)

# Re-read the raw file: the ground-truth labels come from the target column
# that load_df removes.
df = pd.read_csv('housing.csv', sep=',')

# Divide target value in 4 quantiles
# pd.qcut labels every record 1..4.  A chain of ">= quantile" assignments
# leaves the lowest quarter unlabelled (NaN) and gives label 4 only to the
# records equal to the maximum.
df['quantiles'] = pd.qcut(df['median_house_value'], q=4, labels=[1, 2, 3, 4])

y = df['quantiles'].astype("category")

# NOTE(review): these calls need an autoML that accepts the ground-truth
# labels as a fifth argument and that supports 'MeanShift' -- verify the
# definition that is in scope when this cell runs.
autoML(encoder_list, scaler_list_1, model_list_1, dataset_1, y)
autoML(encoder_list, scaler_list_2, model_list_2, dataset_2, y)


