# Example: Pipelines usage
- TODO: celoten file si posebej kopiraj še od Leona (po predavanjih) ... na koncu je uporabljeno drugačno poimenovanje test setov
    - train
    - valid
    - test
    - raw

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [37]:
# Labelled data used for fitting and validating the models.
train = pd.read_csv('data/house_train.csv')
X_test = pd.read_csv('data/house_test.csv') # used only at the very end; we do not even look at it during modelling

In [38]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column 'SalePrice'.
X = train.drop('SalePrice', axis=1)
y = train["SalePrice"]

# Hold out 30% of the rows for validation; the fixed random_state keeps the split the same between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.3, random_state=1121218)

In [39]:
X_train.describe().T.iloc[:10] # All numerical cols

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1022.0,728.62818,417.491868,1.0,374.5,734.5,1082.0,1459.0
MSSubClass,1022.0,57.030333,42.86121,20.0,20.0,50.0,70.0,190.0
LotFrontage,838.0,70.190931,24.110495,21.0,60.0,70.0,80.0,313.0
LotArea,1022.0,10472.601761,8782.768055,1491.0,7560.0,9571.0,11742.5,164660.0
OverallQual,1022.0,6.071429,1.374094,1.0,5.0,6.0,7.0,10.0
OverallCond,1022.0,5.578278,1.101703,1.0,5.0,5.0,6.0,9.0
YearBuilt,1022.0,1971.221135,29.863975,1875.0,1954.0,1973.0,2000.0,2009.0
YearRemodAdd,1022.0,1984.813112,20.67152,1950.0,1966.0,1994.0,2003.75,2010.0
MasVnrArea,1015.0,101.768473,180.299391,0.0,0.0,0.0,160.0,1600.0
BsmtFinSF1,1022.0,441.294521,438.43075,0.0,0.0,381.0,707.5,2260.0


Želimo narediti model, ki napove ceno stanovanja na okoli 16.000 EUR natančno 

In [40]:
X_train.describe(include="object").T.iloc[:10] # All object cols

Unnamed: 0,count,unique,top,freq
MSZoning,1022,5,RL,809
Street,1022,2,Pave,1017
Alley,67,2,Grvl,37
LotShape,1022,4,Reg,654
LandContour,1022,4,Lvl,920
Utilities,1022,2,AllPub,1021
LotConfig,1022,5,Inside,733
LandSlope,1022,3,Gtl,966
Neighborhood,1022,25,NAmes,156
Condition1,1022,9,Norm,881


In [41]:
above_0_missing = X_train.isnull().sum() > 0

In [42]:
X_train.isnull().sum()[above_0_missing]

LotFrontage      184
Alley            955
MasVnrType         7
MasVnrArea         7
BsmtQual          30
BsmtCond          30
BsmtExposure      31
BsmtFinType1      30
BsmtFinType2      31
Electrical         1
FireplaceQu      480
GarageType        58
GarageYrBlt       58
GarageFinish      58
GarageQual        58
GarageCond        58
PoolQC          1018
Fence            821
MiscFeature      988
dtype: int64

In [43]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

In [44]:
print(f'There are {len(numerical_features)} numerical features:', '\n')

There are 37 numerical features: 



In [45]:
print(numerical_features)

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [46]:
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [47]:
print(f'There are {len(categorical_features)} categorical features:', '\n')

There are 43 categorical features: 



In [48]:
print(categorical_features)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


Pri procesiranju običajno najprej ločimo numerične in kategorične podatke, saj je njihova obdelava različna. Zgoraj smo definirali dva seznama stolpcev: `categorical_features` in `numerical_features`.

In [49]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [50]:
# Numeric columns: fill missing values with the column mean, then rescale with MinMaxScaler.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
# NOTE(review): the null counts printed earlier show some columns almost entirely missing
# (e.g. PoolQC, MiscFeature, Alley); 'most_frequent' fills those with a rarely observed value.
# A constant "missing" category may be more appropriate - confirm against the data description.
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    # handle_unknown='ignore' -> if a category that was not present in the train set shows up in the test set, it is ignored and no error is raised
])

In [51]:
from sklearn.compose import ColumnTransformer

Column transformer definira vzporedne korake, ki jih želimo izvesti. Po različnih stolpcih filtriramo, kateri korak želimo izvesti na katerem.

In [52]:
# Route each group of columns to its own sub-pipeline.
# NOTE(review): the printed numerical_features list contains 'Id', presumably a row identifier
# rather than a real predictor - consider excluding it; verify.
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features), # (transformer name, pipeline, list of feature columns)
    ('category', categorical_pipeline, categorical_features)
])

In [53]:
full_processor

In [54]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

In [55]:
# Baseline model: Lasso regression with a small, fixed regularisation strength.
lasso = Lasso(alpha=0.1)

# Chain the preprocessing and the model so both are fitted on the training split only.
lasso_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

lasso_pipeline.fit(X_train, y_train)

# Predictions for the held-out validation rows, evaluated in the following cells.
# NOTE(review): the recorded cell output looks like a truncated solver warning from
# coordinate descent; if it is a convergence warning, a larger max_iter may help - confirm.
preds = lasso_pipeline.predict(X_valid)

  model = cd_fast.enet_coordinate_descent(


In [56]:
lasso_pipeline

In [57]:
mean_absolute_error(y_valid, preds)

19830.527070323817

In [58]:
lasso_pipeline.score(X_valid, y_valid)

0.7079758134752527

In [59]:
def run_training(data, model):
    """Fit `model` behind the preprocessing pipeline and print validation metrics.

    The 'SalePrice' column of `data` is used as the target and every other
    column as a predictor. 30% of the rows are held out for validation with
    a fixed random seed. The function prints the mean absolute error and the
    pipeline's `score` on the validation rows and returns nothing.
    """
    features = data.drop(columns='SalePrice')
    target = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, target, test_size=0.3, random_state=1121218
    )

    # Column groups are derived from the training split's dtypes.
    num_cols = list(X_train.select_dtypes(include='number').columns)
    cat_cols = list(X_train.select_dtypes(exclude='number').columns)

    # Numeric columns: mean imputation followed by min-max scaling.
    impute_and_scale = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler()),
    ])
    # Categorical columns: most-frequent imputation followed by one-hot encoding.
    impute_and_encode = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('number', impute_and_scale, num_cols),
        ('category', impute_and_encode, cat_cols),
    ])

    estimator = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model),
    ])
    estimator.fit(X_train, y_train)

    valid_preds = estimator.predict(X_valid)
    mae = mean_absolute_error(y_valid, valid_preds)
    print(f"Mean absolute error: {mae}")

    valid_score = estimator.score(X_valid, y_valid)
    print(f"Score: {valid_score}")

In [60]:
train = pd.read_csv('data/house_train.csv')
lasso = Lasso(alpha=0.1)
run_training(train, lasso)

Mean absolute error: 19830.527070323817
Score: 0.7079758134752527


  model = cd_fast.enet_coordinate_descent(


Naloga: Dodajte feature selection v pipeline.

In [68]:
# naloga spodaj
from sklearn.feature_selection import SelectPercentile, f_regression

def run_training(data, model):
    """Fit `model` behind preprocessing plus feature selection and print validation metrics.

    The 'SalePrice' column of `data` is used as the target and every other
    column as a predictor. 30% of the rows are held out for validation with
    a fixed random seed. After preprocessing, SelectPercentile keeps the top
    80 percent of features as ranked by f_regression. The function prints the
    mean absolute error and the pipeline's `score` on the validation rows and
    returns nothing.
    """
    features = data.drop(columns='SalePrice')
    target = data["SalePrice"]
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, target, test_size=0.3, random_state=1121218
    )

    # Column groups are derived from the training split's dtypes.
    num_cols = list(X_train.select_dtypes(include='number').columns)
    cat_cols = list(X_train.select_dtypes(exclude='number').columns)

    # Numeric columns: mean imputation followed by min-max scaling.
    impute_and_scale = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler()),
    ])
    # Categorical columns: most-frequent imputation followed by one-hot encoding.
    impute_and_encode = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('number', impute_and_scale, num_cols),
        ('category', impute_and_encode, cat_cols),
    ])

    # Feature selection runs after preprocessing (one-hot encoding has already
    # generated many columns by then) and before the model itself.
    # f_regression is the score function because this is a regression problem.
    selector = SelectPercentile(score_func=f_regression, percentile=80)

    estimator = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('select', selector),
        ('model', model),
    ])
    estimator.fit(X_train, y_train)

    valid_preds = estimator.predict(X_valid)
    mae = mean_absolute_error(y_valid, valid_preds)
    print(f"Mean absolute error: {mae}")

    valid_score = estimator.score(X_valid, y_valid)
    print(f"Score: {valid_score}")

train = pd.read_csv('data/house_train.csv')
lasso = Lasso(alpha=0.1)
run_training(train, lasso)

Mean absolute error: 19329.64015912954
Score: 0.7883839728739452


  model = cd_fast.enet_coordinate_descent(


In [62]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths; 'model__alpha' addresses the alpha of the pipeline step named 'model'.
param_dict = {'model__alpha': np.arange(0.01, 1, 0.05)}

# 10-fold cross-validated grid search over the whole pipeline, scored by negative MAE
# (negated so that a larger score is better).
search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

search.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [69]:
# best_score_ is a negative MAE, so abs() turns it back into an error value.
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

# we see that the optimal alpha lies at the edge of our range. Below we try widening the range.
# NOTE(review): the recorded output shows alpha=86, which is not in the 0.01-1 grid; this cell
# was presumably re-run after the wider search below - confirm by re-running in order.

Best score: 16468.24463894264
Best alpha: {'model__alpha': 86}


In [64]:
# Wider alpha grid: 1, 6, 11, ... up to (but not including) 100.
param_dict = {'model__alpha': np.arange(1, 100, 5)}

# Same search as before, but with 5 folds instead of 10.
search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=5, 
                      scoring='neg_mean_absolute_error')

search.fit(X_train, y_train)

# best_score_ is a negative MAE, so abs() turns it back into an error value.
print('Best score:', abs(search.best_score_))
print('Best alpha:', search.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best score: 16468.24463894264
Best alpha: {'model__alpha': 86}


In [65]:
def run_training_gridcv(data, model, params, *, cv=5, percentile=80,
                        test_size=0.3, random_state=1121218):
    """Grid-search `params` for `model` inside the full preprocessing pipeline.

    The pipeline consists of the column-wise preprocessing, a
    SelectPercentile(f_regression) feature-selection step and `model`.
    The search is cross-validated on the training part of a train/validation
    split of `data` and scored by negative mean absolute error.

    Parameters
    ----------
    data : DataFrame containing the predictors and the 'SalePrice' target.
    model : estimator placed in the pipeline step named 'model'.
    params : parameter grid for GridSearchCV; keys address pipeline steps,
        e.g. 'model__alpha'.
    cv : number of cross-validation folds (default 5).
    percentile : percent of features kept by the 'select' step (default 80).
    test_size : fraction of rows split off and left out of the search
        (default 0.3).
    random_state : seed of the train/validation split (default 1121218).

    Returns
    -------
    The fitted GridSearchCV object. The best score (as a positive MAE) and
    the best parameters are also printed.
    """
    X = data.drop('SalePrice', axis=1)
    y = data["SalePrice"]
    # Only the training part is searched; the validation part of the split is
    # not used here. The defaults reproduce the split used elsewhere in this file.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    numerical_features = X_train.select_dtypes(include='number').columns.tolist()
    categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

    # Numeric columns: mean imputation followed by min-max scaling.
    numeric_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler())
    ])
    # Categorical columns: most-frequent imputation followed by one-hot encoding.
    categorical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    full_processor = ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features)
    ])

    full_pipeline = Pipeline(steps=[
        ('preprocess', full_processor),
        ("select", SelectPercentile(score_func=f_regression, percentile=percentile)),
        ('model', model)
    ])

    search = GridSearchCV(full_pipeline, params,
                          cv=cv,
                          scoring='neg_mean_absolute_error')

    search.fit(X_train, y_train)

    # best_score_ is a negative MAE, so abs() turns it back into an error value.
    print('Best score:', abs(search.best_score_))
    print('Best params:', search.best_params_)
    return search
    
train = pd.read_csv('data/house_train.csv')
param_dict = {'model__alpha': np.arange(1, 300, 10)}
lasso = Lasso(alpha=0.1) # an initial alpha is given; the grid search varies it through 'model__alpha'
run_training_gridcv(train, lasso, param_dict)

# a more generalised result than in the previous example, hence a slightly worse score.

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best score: 16934.904615000498
Best params: {'model__alpha': 151}


In [66]:
lasso = Lasso(alpha=151) # use the optimal alpha found by the grid search

# NOTE(review): alpha=151 was found by run_training_gridcv, whose pipeline contains a
# SelectPercentile 'select' step; the pipeline below has no such step, so the alpha was
# tuned for a different model. Confirm it is still appropriate, or add the same step here.

# the advantage of a pipeline is that its parts - here the preprocessing part - can also be applied to the test data.
final_lasso_pipe = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

final_lasso_pipe.fit(X_train, y_train)
preds = final_lasso_pipe.predict(X_valid)

# in this case y_valid serves as our test part of the data
# for the real data labelled as test we do not know the sale prices
mean_absolute_error(y_valid, preds)

17870.072952257695

In [67]:
# example of test data where the target (sale price) is not known. The model could only be evaluated later, once these properties are actually sold on the market.
preds_final = final_lasso_pipe.predict(X_test)

# Pair every test-row Id with its predicted sale price.
output = pd.DataFrame({'Id': X_test["Id"], 'SalePrice': preds_final})
output.head()

Unnamed: 0,Id,SalePrice
0,1461,108175.322829
1,1462,161472.124919
2,1463,183853.576017
3,1464,194416.087478
4,1465,200090.933363


In [None]:
# as an exercise you can try adding further improvements
# e.g. outlier handling