In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

## Data loading

In [2]:
# Read the training set and split it into the target column and the remaining features
y_column = 'LotArea'
df_train = pd.read_csv('/kaggle/input/ml1-2024/train.csv')
y_train = df_train[y_column]
X_train = df_train.drop(columns=y_column)

# Read the test features; the ID column is set aside because only the submission needs it
X_test = pd.read_csv("/kaggle/input/ml1-2024/test.csv")
X_test_id = X_test['ID']
X_test = X_test.drop('ID', axis=1)

# Sanity check: both feature sets should report the same number of columns
print(X_train.shape, X_test.shape)
X_train

(1459, 78) (1460, 78)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,120,RM,,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,,,,0,6,2008,ConLD,Normal
1,20,RL,66.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,,,,0,8,2007,WD,Normal
2,60,RL,74.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,,,,0,10,2006,WD,Normal
3,90,RL,70.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,,,,0,12,2007,WD,Normal
4,30,RM,50.0,Pave,,Reg,Bnk,AllPub,Inside,Gtl,...,0,0,,MnPrv,,0,3,2007,WD,Alloca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,60,RL,82.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,180,0,,,,0,7,2009,WD,Normal
1455,20,RL,60.0,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,,,,0,2,2010,WD,Normal
1456,160,RL,24.0,Pave,,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,,,,0,9,2009,WD,Normal
1457,20,RL,61.0,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,,GdPrv,,0,5,2007,WD,Normal


## Preprocessing

In [3]:
# Baseline example: a linear model on a handful of columns with minimal preprocessing
numeric_features = ['MSSubClass', 'LotFrontage']
categorical_features = ['MSZoning', 'Street']
selected_features = ['MSSubClass', 'MSZoning', 'LotFrontage', 'Street']

# Restrict both feature sets to the selected columns
X_train, X_test = X_train[selected_features], X_test[selected_features]

# Numeric columns: fill missing values with the column mean
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])
# Categorical columns: one-hot encode, ignoring categories not seen during fitting
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Send each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Complete model: preprocessing followed by a linear regression
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

## Fitting and prediction

In [4]:
# Fit the pipeline on the training data, then predict the target for the test features.
# Pipeline.fit returns the fitted pipeline itself, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

## Submission

In [5]:
# Pair every test ID with its predicted LotArea
submission = pd.DataFrame({'ID': X_test_id}).assign(LotArea=y_pred)
# Write the submission file without the DataFrame index
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,ID,LotArea
0,0,7681.224870
1,1,9551.194442
2,2,10495.272353
3,3,16640.017820
4,4,11740.225578
...,...,...
1455,1455,11058.402942
1456,1456,6118.815543
1457,1457,7235.180865
1458,1458,9970.787477
