In [1]:
import pandas as pd

# Raw-CSV location of the Ames housing dataset loaded below.
# The URL is written as two adjacent literals only to keep lines short;
# Python joins them into one string, identical to the single-literal form.
df_url = (
    'https://raw.githubusercontent.com/INRIA/scikit-learn-mooc/'
    'main/datasets/ames_housing_no_missing.csv'
)

# Fetch and parse the CSV into a DataFrame.
ames_housing_df = pd.read_csv(filepath_or_buffer=df_url)

# Show the first five rows as the cell output.
ames_housing_df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000


In [2]:
# Binary classification target: 1 when SalePrice is strictly greater than
# 200_000, 0 otherwise.
target = ames_housing_df['SalePrice'].gt(200_000).astype(int)

# Feature matrix: every column except SalePrice, the column the target was
# derived from. `drop` returns a new frame; ames_housing_df is left untouched.
features_df = ames_housing_df.drop(columns=['SalePrice'])

In [5]:
# Print a per-column summary (non-null count and dtype) of the full frame.
# This is the original frame, so the SalePrice column is still part of it.
ames_housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [6]:
# (rows, columns) of the feature matrix, i.e. the frame without SalePrice.
features_df.shape

(1460, 79)

In [10]:
# Split the features into numerical vs. categorical columns.
#
# The two frames are built as exact complements of each other: everything with
# a numeric dtype goes to `num_features`, everything else to `cat_features`.
# Selecting the categorical side with `include='object'` would miss columns
# held in any other non-numeric dtype (e.g. pandas' dedicated string dtype or
# `category`), and such a column would then belong to neither frame.
num_features = features_df.select_dtypes(include='number')

cat_features = features_df.select_dtypes(exclude='number')

In [11]:
# (rows, columns) of the categorical feature subset.
cat_features.shape

(1460, 43)

In [12]:
# (rows, columns) of the numerical feature subset.
num_features.shape

(1460, 36)

# Using only Numerical Features

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate


# Numerical-only classifier: standardise the inputs, then fit a logistic
# regression whose solver is allowed up to 1000 iterations.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)


# 10-fold cross-validation of the pipeline on the numerical features only.
# error_score='raise' makes a failing fit raise instead of recording a
# placeholder score for that fold.
cv_results = cross_validate(
    estimator=model,
    X=num_features,
    y=target,
    cv=10,
    error_score='raise',
)

# One test score per fold.
scores = cv_results["test_score"]

# Report the mean and standard deviation across folds.
print(
    f"The mean cross-validation accuracy is: {scores.mean():.3f} ± {scores.std():.3f}"
)

The mean cross-validation accuracy is: 0.925 ± 0.012


# Using Both Numeric and Categorical Features

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Per-column-type preprocessing:
#   * numerical columns are standardised;
#   * categorical columns are one-hot encoded into a dense array, and
#     categories that were not seen during fit are ignored at transform time
#     instead of raising.
num_preprocessor = StandardScaler()
cat_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Route each group of columns to its own preprocessor.
pre_proc = ColumnTransformer(
    transformers=[
        # Numerical columns -> standard scaler
        ('standard_scaler', num_preprocessor, num_features.columns),
        # Categorical columns -> one-hot encoder
        ('one-hot-encoder', cat_preprocessor, cat_features.columns),
    ]
)


# Full-feature pipeline: column-wise preprocessing followed by a logistic
# regression whose solver is allowed up to 1000 iterations.
model = make_pipeline(pre_proc, LogisticRegression(max_iter=1000))

# Leave the pipeline as the last expression so the notebook displays it.
model

In [15]:
# 10-fold cross-validation of the pipeline on the complete feature frame
# (numerical and categorical columns together).
# error_score='raise' makes a failing fit raise instead of recording a
# placeholder score for that fold.
cv_results = cross_validate(
    estimator=model,
    X=features_df,
    y=target,
    cv=10,
    error_score='raise',
)

# One test score per fold.
scores = cv_results["test_score"]

# Report the mean and standard deviation across folds.
print(
    f"The mean cross-validation accuracy is: {scores.mean():.3f} ± {scores.std():.3f}"
)

The mean cross-validation accuracy is: 0.923 ± 0.022
