In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import pandas as pd
import geopandas as gpd

from sklearn.ensemble import GradientBoostingClassifier

from gcnb_pkg.ml_logic.preprocessor import clean_data, processing_pipe
from gcnb_pkg.ml_logic.registry import save_data, save_data_geo, save_pipeline, load_data, load_model

In [3]:
# Project root relative to this notebook. Later cells build file paths by plain
# string concatenation (root_path + 'raw_data/...'), so the trailing slash is
# required.
root_path = '../'

# Exploring preprocess

Intermediate steps of the `preprocess()` function in `main.py`.

In [4]:
# Cut-off applied to the 'gruen20_p' column: rows strictly above it are
# labelled as green roofs.
green_thresshold = 15

# Read the raw project shapefile into a GeoDataFrame.
df_raw = gpd.read_file(root_path + 'raw_data/project_data.shp')

# Binary target column: 1 when 'gruen20_p' exceeds the cut-off, else 0.
df_raw['green_roof'] = df_raw['gruen20_p'].gt(green_thresshold).astype(int)

In [5]:
# Run the project's cleaning step on the raw frame. No save helper is called
# in this cell, so the cleaned result is only held in `df` (unless clean_data
# persists it internally, which is not visible from this notebook).
df = clean_data(df_raw)

In [6]:
# Split the cleaned frame into target and features, giving both a fresh
# 0..n-1 index so they stay row-aligned.
y = df['green_roof'].reset_index(drop=True)

# 'geometry' is dropped together with the target because it is not a
# model attribute.
X_clean = (
    df
    .drop(['green_roof', 'geometry'], axis='columns')
    .reset_index(drop=True)
)

In [7]:
# Build the preprocessing pipeline from the cleaned features. The next cell
# calls .transform() on it directly, so processing_pipe is presumably expected
# to return an already-fitted pipeline (TODO confirm). No save helper is
# called in this cell.
final_pipe = processing_pipe(X_clean)

In [8]:
# Apply the pipeline to the cleaned features and wrap the result in a
# DataFrame. No column names are passed explicitly here.
X = pd.DataFrame(data=final_pipe.transform(X_clean))

In [9]:
# Attach the target to the transformed features by aligning both on the row
# index (inner join, which is the merge default).
preprocessed_df = pd.merge(
    X,
    y,
    how='inner',
    left_index=True,
    right_index=True,
)

# Exploring model

Intermediate steps of the `initialize_train_model()` function in `main.py`.

In [18]:
# Load the preprocessed dataset through the registry helper, replacing any
# `preprocessed_df` already in memory. No cell in this notebook calls a save
# helper, so this presumably relies on data written beforehand (e.g. by
# main.py) -- TODO confirm.
preprocessed_df = load_data(root_path, '_preprocessed')

In [19]:
# Target is the 'green_roof' column; every other column is used as a feature.
y = preprocessed_df['green_roof']
X = preprocessed_df.drop(labels='green_roof', axis='columns')

In [22]:
# Full keyword set handed to GradientBoostingClassifier, grouped by purpose.
model_params = {
    # Boosting objective and step size.
    'loss': 'exponential',
    'learning_rate': 0.09,
    'n_estimators': 100,
    'subsample': 1.0,
    'init': None,

    # Shape and split rules of the individual trees.
    'criterion': 'squared_error',
    'max_depth': 6,
    'max_features': None,
    'max_leaf_nodes': None,
    'min_samples_split': 100,
    'min_samples_leaf': 6,
    'min_weight_fraction_leaf': 0.0,
    'min_impurity_decrease': 0.0,
    'ccp_alpha': 0.0,

    # Early stopping: n_iter_no_change is None, so it is switched off.
    'n_iter_no_change': None,
    'validation_fraction': 0.1,
    'tol': 0.0001,

    # Reproducibility and runtime behaviour.
    'random_state': 42,
    'verbose': 0,
    'warm_start': False,
}

# Build the classifier from the parameter dict, then fit it on the whole
# preprocessed feature matrix and target (no train/test split is made here).
model = GradientBoostingClassifier(**model_params)
model = model.fit(X, y)

Intermediate steps of the `evaluate_model()` function in `main.py`.

In [26]:
# Reload the stored model and the preprocessed dataset through the registry
# helpers; this replaces the `model` and `preprocessed_df` already in memory.
model = load_model(root_path)
preprocessed_df = load_data(root_path, '_preprocessed')

# Target is the 'green_roof' column; the remaining columns are the features.
y = preprocessed_df['green_roof']
X = preprocessed_df.drop('green_roof', axis=1)

In [27]:
# Keep only column 1 of predict_proba, i.e. the probability of the second
# class in model.classes_ (presumably green_roof == 1 -- confirm against the
# loaded model).
y_pred_proba = model.predict_proba(X)[:,1]

In [29]:
# Sanity check: selecting a single column should leave a 1-D array.
y_pred_proba.ndim

1