In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns
# Read the competition's train and test CSV files into DataFrames
# (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# Print the (rows, columns) shape of the training frame, then leave its
# first rows as the cell's displayed value.
print(train_data.shape)
train_data.head()


(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Count the nulls in every column and print only the columns that have any
missing = train_data.isna().sum()
print(missing.loc[missing.gt(0)])

# Summarise each column's non-null count and dtype
train_data.info()


LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   objec

In [7]:
# Numerical columns (float64 / int64): replace nulls with that column's median.
train_data = train_data.assign(**{
    name: train_data[name].fillna(train_data[name].median())
    for name in train_data.select_dtypes(include=['float64', 'int64']).columns
})

# Categorical (object) columns: replace nulls with that column's first mode.
train_data = train_data.assign(**{
    name: train_data[name].fillna(train_data[name].mode()[0])
    for name in train_data.select_dtypes(include=['object']).columns
})

# One-hot encode the categorical columns so every feature is numeric.
train_data = pd.get_dummies(train_data)


In [11]:
# Target: the SalePrice column
y = train_data['SalePrice']
# Features: every other column except Id
X = train_data.drop(columns=['Id', 'SalePrice'])


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Score it on the held-out validation split
preds = model.predict(X_valid)
mse = mean_squared_error(y_true=y_valid, y_pred=preds)
print("Mean Squared Error:", mse)


Mean Squared Error: 873658179.20648


In [15]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the same training split; the seed fixes its randomness
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score it on the validation split with the same metric as the linear model
rf_preds = rf_model.predict(X_valid)
rf_mse = mean_squared_error(y_true=y_valid, y_pred=rf_preds)
print("Random Forest MSE:", rf_mse)


Random Forest MSE: 833851942.1748586


In [16]:
# Prepare the test set the same way as the training set, predict with the
# random forest, and write the submission file.

# Keep the real Ids from test.csv before the encoding/alignment below drops
# the 'Id' column. After this cell has run once, test_data no longer has that
# column, so a re-run reuses the ids captured by the first run.
if 'Id' in test_data.columns:
    test_ids = test_data['Id'].to_numpy()

# Repeat missing value treatment for test data.
# NOTE: the medians/modes used here are computed from the test set itself,
# not taken from the training data.
for column in test_data.select_dtypes(include=['float64', 'int64']).columns:
    test_data[column] = test_data[column].fillna(test_data[column].median())

for column in test_data.select_dtypes(include=['object']).columns:
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

# Convert categorical to numerical
test_data = pd.get_dummies(test_data)

# Give the test matrix exactly the training columns, in the same order:
# dummy columns absent from the test set are added as 0, and columns the model
# was not trained on (including 'Id') are dropped. Only the test frame is
# reindexed, so X_train is left untouched.
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

# Predict
test_preds = rf_model.predict(test_data)

# Prepare submission using the Ids read from the test file, not a
# hard-coded offset added to the row index.
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_preds
})
submission.to_csv('submission.csv', index=False)
