# Практикум №1: Попередня обробка даних і побудова навчальних наборів

0. Імпорт залежностей.

In [19]:
import random
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.datasets import fetch_california_housing
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline

from src.data.util.dataset import download_and_unzip
from src.definitions import EXTERNAL_DATA_FOLDER
from src.util.metrics import calculate_metrics

# One seed shared by every stochastic step in this notebook.
random_seed = 42

# Seed the stdlib generator first, then the global NumPy generator.
for seed_generator in (random.seed, np.random.seed):
    seed_generator(random_seed)

1. Обрати набір даних з категоріальними ознаками.

In [2]:
# Fetch the loan-approval dataset into the external data folder.
loan_data_path = download_and_unzip(
    "architsharma01", "loan-approval-prediction-dataset", EXTERNAL_DATA_FOLDER
)
loan_csv_path = loan_data_path / "loan_approval_dataset.csv"

loan_df = pd.read_csv(loan_csv_path)
# Remove any spaces from the column headers so that columns can be addressed
# by their bare names (e.g. "education").
loan_df.columns = [header.replace(" ", "") for header in loan_df.columns]

loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


2. Виконати обробку категоріальних даних: 
- відобразити номінальні ознаки за допомогою pandas,
- кодувати мітки класів,
- зробити one-hot кодування номінальних ознак,
- виконати кодування порядкових ознак (за їх наявності).

In [3]:
# Integer code assigned to each label of the two-valued categorical columns.
bool_to_int = {"Yes": 1, "No": 0}
status_to_int = {"Approved": 1, "Rejected": 0}
education_to_int = {"Graduate": 1, "Not Graduate": 0}

# Which mapping applies to which column.
column_encoders = {
    "education": education_to_int,
    "self_employed": bool_to_int,
    "loan_status": status_to_int,
}

for column_name, encoder in column_encoders.items():
    # Strip surrounding whitespace so padded labels still match the mapping
    # keys; labels missing from the mapping become NaN.
    loan_df[column_name] = loan_df[column_name].str.strip().map(encoder)

loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


3. Обрати великий набір даних з відсутніми значеннями. 2K прикладів у наборі буде достатньо. 
Набір можна сформувати самостійно, випадково вилучаючи по одному значенню з кожного рядка наборів fetch_california_housing або load_diabetes з sklearn.datasets.

In [4]:
housing = fetch_california_housing()

# Complete feature matrix and target. The baseline model further down is
# trained on these arrays, so they must stay free of missing values.
housing_x = housing.data
housing_y = housing.target

# copy=True is deliberate: for a 2-D ndarray pandas may wrap the input without
# copying, and then the NaNs written into housing_data later on would also
# show up in housing_x and contaminate the "complete data" baseline.
housing_data = pd.DataFrame(housing_x, columns=housing.feature_names, copy=True)

housing_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
# Summary statistics of the feature frame. This cell comes before the step
# that removes values, so the counts describe the complete data.
housing_data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [6]:
def randomly_set_one_none_per_row(
    df: pd.DataFrame, probability: float = 0.3
) -> pd.DataFrame:
    """Blank out at most one randomly chosen cell in each row of ``df``.

    Each row is selected independently with the given ``probability``. In a
    selected row one column is picked uniformly at random and its value is
    replaced with NaN. Randomness comes from the global NumPy generator, so
    seed ``np.random`` beforehand for reproducible results.

    Args:
        df: Frame to modify; it is changed in place. Columns are expected to
            be able to hold NaN (e.g. float dtype).
        probability: Chance that a given row loses one value.

    Returns:
        The same ``df`` object that was passed in, to allow chaining.
    """
    n_columns = df.shape[1]
    if n_columns == 0:
        # Nothing can be removed from a frame without columns.
        return df

    # Rows and columns are addressed by position, not by label: the frame may
    # carry any index (e.g. after filtering or shuffling), and a label-based
    # write with a missing label would append a new row instead.
    for row_pos in range(len(df)):
        if np.random.rand() < probability:
            col_pos = np.random.choice(n_columns)
            df.iat[row_pos, col_pos] = np.nan

    return df

# Works on housing_data in place (the return value is not used), so running
# this cell again removes additional values on top of the existing gaps.
randomly_set_one_none_per_row(housing_data, probability=0.3)

housing_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,,1.081081,565.0,2.181467,37.85,-122.25


In [7]:
# Number of missing entries per column after the random removal above.
housing_data.isnull().sum()

MedInc        797
HouseAge      772
AveRooms      778
AveBedrms     783
Population    717
AveOccup      767
Latitude      788
Longitude     772
dtype: int64

In [8]:
# NumPy array of the frame with gaps; this is what the imputers and the
# train/test split below receive.
np_housing_data = housing_data.to_numpy()

4. Заповнити відсутні значення шляхом інтерполяції:
   - алгоритмом KNNImputer з sklearn.impute,   
   - одновимірним алгоритмом SimpleImputer з sklearn.impute,
   - багатовимірним алгоритмом IterativeImputer з sklearn.impute.

In [9]:
# Fill the gaps with a KNNImputer configured for 5 neighbours.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed = knn_imputer.fit_transform(np_housing_data)
pd_knn_imputed = pd.DataFrame(data=knn_imputed, columns=housing.feature_names)

# Sanity check: count the gaps left in each column after imputation.
pd_knn_imputed.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [10]:
# Fill the gaps with a SimpleImputer using the "mean" strategy.
simple_imputer = SimpleImputer(strategy="mean")
simple_imputed = simple_imputer.fit_transform(np_housing_data)
pd_simple_imputed = pd.DataFrame(
    data=simple_imputed, columns=housing.feature_names
)

# Sanity check: count the gaps left in each column after imputation.
pd_simple_imputed.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [11]:
# Fill the gaps with an IterativeImputer, seeded so the result is repeatable.
iterative_imputer = IterativeImputer(random_state=random_seed)
iterative_imputed = iterative_imputer.fit_transform(np_housing_data)
pd_iterative_imputed = pd.DataFrame(
    data=iterative_imputed, columns=housing.feature_names
)

# Sanity check: count the gaps left in each column after imputation.
pd_iterative_imputed.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

5. Розглянути різні значення параметрів алгоритмів SimpleImputer та IterativeImputer:
   - параметрів strategy в SimpleImputer та initial_strategy в IterativeImputer, а також інших параметрів як експеримент,
   - параметра estimator в IterativeImputer.
Наприклад, параметр estimator може приймати значення: BayesianRidge, RandomForestRegressor, KNeighborsRegressor.

In [12]:
# There is no need to create *Imputer instances with different parameters here, because the parameters are selected via grid search.

6. Побудувати конвеєр з оцінювачів, використовуючи sklearn.pipeline.make_pipeline.

In [13]:
def _with_forest(imputer):
    """Chain ``imputer`` with a seeded random-forest regressor."""
    return make_pipeline(imputer, RandomForestRegressor(random_state=random_seed))


# One pipeline per imputation algorithm; each ends in the same regressor
# configuration, so the pipelines differ only in how gaps are filled.
simple_imputer_pipe = _with_forest(SimpleImputer())
knn_imputer_pipe = _with_forest(KNNImputer())
iterative_imputer_pipe = _with_forest(IterativeImputer(random_state=random_seed))

7. Порівняти результати за різними алгоритмами та при різних значеннях їх параметрів. Показником якості (score) обрати, наприклад, MSE.
Побудувати графік/діаграму значень показника якості для різних алгоритмів заповнення (та різних наборів значень параметрів цих алгоритмів). 

In [14]:
# Reference model: the same regressor trained on housing_x, without an
# imputation step.
# NOTE(review): this is only a fair reference if housing_x is still free of
# NaNs at this point - confirm that injecting gaps into housing_data did not
# write through to it.
baseline_split = train_test_split(housing_x, housing_y, random_state=random_seed)
baseline_x_train, baseline_x_test, baseline_y_train, baseline_y_test = baseline_split

baseline_model = RandomForestRegressor(random_state=random_seed).fit(
    baseline_x_train, baseline_y_train
)
baseline_y_pred = baseline_model.predict(baseline_x_test)

calculate_metrics(baseline_y_test, baseline_y_pred)

{'r2': 0.7859624314052124,
 'mae': 0.35187954222868234,
 'mse': 0.28321838589620607}

In [15]:
# Split the features with injected gaps, using the same seed as the baseline
# split.
housing_split = train_test_split(
    np_housing_data, housing_y, random_state=random_seed
)
housing_x_train, housing_x_test, housing_y_train, housing_y_test = housing_split

In [16]:
# Candidate fill strategies for the SimpleImputer step of the pipeline.
simple_imputer_param_grid = {
    "simpleimputer__strategy": ["mean", "median", "most_frequent"],
}

simple_imputer_search = GridSearchCV(
    estimator=simple_imputer_pipe,
    param_grid=simple_imputer_param_grid,
    scoring="r2",
)
simple_imputer_search.fit(housing_x_train, housing_y_train)

simple_imputer_best = (
    simple_imputer_search.best_score_,
    simple_imputer_search.best_params_,
)
print("Best parameter (CV score=%0.3f): %s" % simple_imputer_best)

# Score the selected configuration on the held-out part of the data.
simple_imputed_y_pred = simple_imputer_search.predict(housing_x_test)
simple_imputed_metrics = calculate_metrics(housing_y_test, simple_imputed_y_pred)
print(simple_imputed_metrics)

Best parameter (CV score=0.777): {'simpleimputer__strategy': 'median'}
{'r2': 0.7787266406422824, 'mae': 0.3553564154457366, 'mse': 0.29279291523707823}


In [17]:
# Candidate neighbour counts for the KNNImputer step: 2, 4, ..., 18.
knn_imputer_param_grid = {
    "knnimputer__n_neighbors": np.arange(2, 20, 2),
}

knn_imputer_search = GridSearchCV(
    estimator=knn_imputer_pipe,
    param_grid=knn_imputer_param_grid,
    scoring="r2",
)
knn_imputer_search.fit(housing_x_train, housing_y_train)

knn_imputer_best = (
    knn_imputer_search.best_score_,
    knn_imputer_search.best_params_,
)
print("Best parameter (CV score=%0.3f): %s" % knn_imputer_best)

# Score the selected configuration on the held-out part of the data.
knn_imputed_y_pred = knn_imputer_search.predict(housing_x_test)
knn_imputed_metrics = calculate_metrics(housing_y_test, knn_imputed_y_pred)
print(knn_imputed_metrics)

Best parameter (CV score=0.779): {'knnimputer__n_neighbors': 4}
{'r2': 0.7854584405591684, 'mae': 0.3489868494186048, 'mse': 0.28388527570840205}


In [None]:
# Grid for the IterativeImputer step: 3 initial strategies x 3 estimators.
# It is stored under its own name so that the KNN grid defined earlier is not
# overwritten.
iterative_imputer_param_grid = {
    "iterativeimputer__initial_strategy": ["mean", "median", "most_frequent"],
    "iterativeimputer__estimator": [
        BayesianRidge(),
        # Seeded like every other forest in the notebook, so repeated runs of
        # the search give the same result.
        RandomForestRegressor(random_state=random_seed),
        KNeighborsRegressor(),
    ],
}

# NOTE(review): nine candidates, each cross-validated with an IterativeImputer
# refit per fold - likely the slowest search in the notebook, which is
# presumably why this cell has no recorded output. Confirm that it finishes.
iterative_imputer_search = GridSearchCV(
    iterative_imputer_pipe, iterative_imputer_param_grid, scoring="r2"
)
iterative_imputer_search.fit(housing_x_train, housing_y_train)

print(
    "Best parameter (CV score=%0.3f): %s"
    % (iterative_imputer_search.best_score_, iterative_imputer_search.best_params_)
)

# Score the selected configuration on the held-out part of the data.
iterative_imputed_y_pred = iterative_imputer_search.predict(housing_x_test)

print(calculate_metrics(housing_y_test, iterative_imputed_y_pred))

8. Зробити висновки. Обрати найкращий алгоритм для заповнення пропусків у ваших даних.