In [22]:
import pandas as pd

file_path = '/mnt/data/data.csv'
df = pd.read_csv("data.csv")

df.info(), df.head(), df.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4597 non-null   float64
 2   bedrooms       4595 non-null   float64
 3   bathrooms      4598 non-null   float64
 4   sqft_living    4598 non-null   float64
 5   sqft_lot       4598 non-null   float64
 6   floors         4598 non-null   float64
 7   waterfront     4595 non-null   float64
 8   view           4598 non-null   float64
 9   condition      4600 non-null   int64  
 10  sqft_above     4597 non-null   float64
 11  sqft_basement  4597 non-null   float64
 12  yr_built       4594 non-null   float64
 13  yr_renovated   4600 non-null   int64  
 14  street         4597 non-null   object 
 15  city           4596 non-null   object 
 16  statezip       4597 non-null   object 
 17  country        4594 non-null   object 
dtypes: float

(None,
             date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
 0  5/2/2014 0:00   313000.0       3.0       1.50       1340.0    7912.0   
 1  5/2/2014 0:00  2384000.0       5.0       2.50       3650.0    9050.0   
 2  5/2/2014 0:00   342000.0       3.0       2.00       1930.0   11947.0   
 3  5/2/2014 0:00   420000.0       3.0       2.25       2000.0    8030.0   
 4  5/2/2014 0:00   550000.0       4.0       2.50       1940.0   10500.0   
 
    floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
 0     1.5         0.0   0.0          3      1340.0            0.0    1955.0   
 1     2.0         0.0   4.0          5      3370.0          280.0    1921.0   
 2     1.0         0.0   0.0          4      1930.0            0.0    1966.0   
 3     1.0         0.0   0.0          4      1000.0         1000.0    1963.0   
 4     1.0         0.0   0.0          4      1140.0          800.0    1976.0   
 
    yr_renovated                    street       city

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load Data
df = pd.read_csv("data.csv")

# Fill Missing Data
def fill_na(df, column, method="mean"):
    if method == "mean":
        df[column] = df[column].fillna(df[column].mean())
    elif method == "mode":
        df[column] = df[column].fillna(df[column].mode()[0])

# Columns with missing values
fill_na(df, 'price', method="mean")
fill_na(df, 'bedrooms', method="mode")
fill_na(df, 'bathrooms', method="mode")
fill_na(df, 'sqft_living', method="mean")
fill_na(df, 'sqft_lot', method="mean")
fill_na(df, 'floors', method="mode")
fill_na(df, 'waterfront', method="mode")
fill_na(df, 'view', method="mode")
fill_na(df, 'yr_built', method="mode")
fill_na(df, 'sqft_above', method="mean")
fill_na(df, 'sqft_basement', method="mean")
fill_na(df, 'city', method="mode")
fill_na(df, 'statezip', method="mode")

# Drop irrelevant columns
df.drop(['street', 'country', 'date'], axis=1, inplace=True)

# Encode Categorical Variables
label_encoder = LabelEncoder()
df['city'] = label_encoder.fit_transform(df['city'])
df['statezip'] = label_encoder.fit_transform(df['statezip'])

# Feature Engineering
df['price_per_sqft'] = df['price'] / df['sqft_living']
df['age'] = 2024 - df['yr_built']

# Feature Scaling
scaler = StandardScaler()
features_to_scale = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'price_per_sqft', 'age']
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Features and Target
X = df.drop(['price'], axis=1)
y = df['price']

# Feature Engineering: Scaling the data
X = df.drop(columns=['price'])
y = df['price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model Selection: Ridge Regression with Hyperparameter Tuning
ridge = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Best model after tuning
best_ridge = grid_search.best_estimator_

# Model Evaluation
y_pred_train = best_ridge.predict(X_train)
y_pred_test = best_ridge.predict(X_test)

# Cross-validation scores
cv_scores = cross_val_score(best_ridge, X_train, y_train, cv=5, scoring='r2')

# Performance Metrics
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)

# Results
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Cross-Validation R² Scores: {cv_scores}")
print(f"Average CV R²: {np.mean(cv_scores):.4f}")

Train R²: 0.8729
Test R²: 0.5531
Mean Absolute Error: 94563.9809
Cross-Validation R² Scores: [0.89175708 0.89983636 0.83738961 0.83380583 0.8901306 ]
Average CV R²: 0.8706
