## House price prediction

In [3]:
import pandas as pd

In [None]:
# Load the raw housing data; "data.csv" is resolved relative to the notebook's working directory.
housing = pd.read_csv("data.csv")

In [None]:
housing.head()  # preview the first five rows as a table

In [None]:
housing.info()  # number of entries plus each column's dtype and non-null count (reveals missing values)

In [None]:
housing.describe()  # per-column summary statistics: count, mean, std, min, quartiles, max

In [None]:
%matplotlib inline #allows displying plot directly in jupyternb instead of a new window

In [None]:
import matplotlib.pyplot as plt


In [None]:
# One histogram per numeric column, 50 bins each, on a 20x15 inch figure.
# The trailing semicolon stops Jupyter from echoing the returned array of Axes objects.
housing.hist(bins=50, figsize=(20, 15));

## Train-Test splitting

In [None]:
from sklearn.model_selection import train_test_split

# Plain random hold-out: 20% of the rows become the test set.
# The fixed random_state makes the split reproducible across runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report how many rows ended up in each side of the random split.
print(
    f"Rows in train set: {len(train_set)} \n"
    f" Rows in test_set: {len(test_set)}"
)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split on the CHAS column so both subsets keep roughly the
# same proportion of each CHAS value as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(housing, housing["CHAS"]))

# split() yields *positional* indices, so rows must be selected with .iloc.
# .loc would treat them as index labels and pick wrong (or missing) rows
# whenever the DataFrame index is not the default 0..n-1 range.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# --- 5. Correlation Analysis ---

# Pairwise correlations between all columns; 'MEDV' (Median Value) is assumed
# to be the target variable.
corr_matrix = housing.corr()

# Rank every column by its correlation with the target, strongest positive first.
print(
    "\nTop features correlated with MEDV:",
    corr_matrix["MEDV"].sort_values(ascending=False),
    sep="\n",
)

In [None]:
# --- 6. Data Preparation (Feature Engineering & Cleaning) ---

# Split each stratified subset into a feature frame (X, everything except
# "MEDV") and an independent copy of the target column (y).

# Training split
housing_train = strat_train_set.drop(columns=["MEDV"])  # X_train
housing_train_labels = strat_train_set.loc[:, "MEDV"].copy()  # y_train

# Test split
housing_test = strat_test_set.drop(columns=["MEDV"])  # X_test
housing_test_labels = strat_test_set.loc[:, "MEDV"].copy()  # y_test

In [None]:
# --- 7. Handling Missing Data (Imputation) & Feature Scaling using a Pipeline ---

# These estimators are used below; import them here so the cell does not raise
# NameError when the notebook is run top to bottom on a fresh kernel.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Use SimpleImputer to fill missing values with the median of each column.
# The median is robust to outliers.
imputer = SimpleImputer(strategy="median")

# Fit the imputer on the TRAINING data only, so no test-set statistics leak in.
imputer.fit(housing_train)

# Impute the training set and wrap the resulting array back into a DataFrame
# that keeps the original column names and row index. (The test set is
# transformed further down, through the fitted pipeline.)
housing_train_prepared = pd.DataFrame(
    imputer.transform(housing_train),
    columns=housing_train.columns,
    index=housing_train.index,
)

# Check if missing values are handled
print(f"\nMissing values check after imputation (should be 0): \n{housing_train_prepared.isnull().sum().sum()}")

# Feature Scaling: StandardScaler rescales each feature to mean 0 and std dev 1.
# This is crucial for gradient-descent based algorithms.
# NOTE(review): this standalone scaler is not referenced again in this cell --
# the pipeline below builds its own StandardScaler. It is kept only in case a
# later cell relies on the name; confirm and remove if unused.
scaler = StandardScaler()

# Final pipeline for all transformations (Imputation -> Scaling).
# Fitting both steps together on the training data keeps the preprocessing
# consistent and prevents data leakage.
full_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Fit the pipeline on the training data and transform it in one step.
housing_train_prepared_scaled = full_pipeline.fit_transform(housing_train)

# Apply the already-fitted pipeline to the test data (transform only, no refit).
housing_test_prepared_scaled = full_pipeline.transform(housing_test)