# Import Libraries

In [None]:
# Built-in libraries
import sys

# Third-party libraries
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Specific imports from packages
from numpy import isnan
from pandas import read_csv
from scipy.cluster.hierarchy import cut_tree, dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.compose import make_column_transformer
from sklearn.decomposition import KernelPCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

# Global display settings
# Print NumPy arrays in full instead of summarising them with "...".
np.set_printoptions(threshold=sys.maxsize)
# Default size (in inches) for every matplotlib figure created below.
plt.rcParams.update({'figure.figsize': [12, 8]})

In [None]:
# Load the raw house-price table from the Kaggle input directory.
csv_path = '/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Count missing values per column, most incomplete columns first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Same missing-value tally expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100).sort_values(ascending=False)

In [None]:
# Replace missing locations with a fixed default value.
df.loc[df['location'].isna(), 'location'] = 'Sarjapur  Road'

In [None]:
# Keep only the leading number of each size string (the text before the first
# space) and store it as a float.
room_count = df['size'].str.split(' ', expand=True)[0].astype(float)
# Missing sizes default to 2. The fill value must be numeric: filling the float
# column with the string '2 BHK' would turn it into a mixed object column and
# break the later `total_sqft / size` arithmetic.
df['size'] = room_count.fillna(2.0)

In [None]:
# Impute missing bathroom counts with the column median.
bath_median = df['bath'].median()
df['bath'] = df['bath'].fillna(bath_median)

In [None]:
# Try to parse every total_sqft entry as a number; failures become NaN.
numeric_values = pd.to_numeric(df['total_sqft'], errors='coerce')

# Collect the raw entries that could not be parsed so they can be inspected.
non_numeric_values = df.loc[numeric_values.isna(), 'total_sqft'].tolist()
print(non_numeric_values)

In [None]:
def convertRange(x):
    """Convert a square-footage entry to a float.

    Args:
        x: A plain number (``"1200"`` or ``1200``) or a two-sided range
            written as ``"low - high"``.

    Returns:
        The numeric value as a float, the midpoint of the range for range
        strings, or ``None`` when the entry cannot be interpreted (for
        example text with units, or a non-string, non-numeric value).
    """
    # Plain numbers first; this also covers values that are already numeric.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can encode a "low - high" range.
    if not isinstance(x, str):
        return None

    bounds = x.split('-')
    if len(bounds) != 2:
        return None
    try:
        low, high = float(bounds[0]), float(bounds[1])
    except ValueError:
        # Either side is not a number (e.g. it carries a unit suffix).
        return None
    return (low + high) / 2

# Parse every square-footage entry to a float (ranges become their midpoint,
# unparseable entries become missing).
parsed_sqft = df['total_sqft'].apply(convertRange)
df['total_sqft'] = parsed_sqft

# Drop every row that still has a missing value in any column.
# NOTE(review): this also discards rows that are only missing values in
# columns removed further down (society, availability, area_type, balcony) --
# confirm that this data loss is intended.
df.dropna(inplace=True)

# Price per square foot; presumably 'price' is in units of 100,000 -- confirm.
df['Price_per_square_feet'] = df['price'].mul(100000).div(df['total_sqft'])

In [None]:
# Number of listings per location.
location_count = df['location'].value_counts()

# Locations that occur fewer than 10 times.
location_count_less_10 = location_count[location_count < 10]

# Collapse those rare locations into a single 'other' bucket.
is_rare = df['location'].isin(location_count_less_10.index)
df['location'] = df['location'].where(~is_rare, 'other')

# Keep only listings with at least 300 sqft per unit of 'size'.
sqft_per_room = df['total_sqft'] / df['size']
df = df[sqft_per_room >= 300]

# Outliers

In [None]:
# IQR-based outlier filter on square feet per unit of 'size'.
sqft_per_room = df['total_sqft'] / df['size']
Q1 = sqft_per_room.quantile(0.25)
Q3 = sqft_per_room.quantile(0.75)
IQR = Q3 - Q1

# Keep rows inside the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences (both inclusive).
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df = df[sqft_per_room.between(lower_fence, upper_fence)]

In [None]:
# IQR-based outlier filter on the price column.
price = df['price']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1

# Keep rows inside the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] fences (both inclusive).
df = df[price.between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['society', 'availability', 'area_type', 'balcony']
df.drop(columns=unused_columns, inplace=True)

# Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Draw the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

# Define Features And Target

In [None]:
# Features: everything except the target and anything derived from it.
# 'Price_per_square_feet' is computed as price * 100000 / total_sqft, so
# leaving it in X would leak the target into the features and inflate every
# evaluation metric.
X = df.drop(columns=['price', 'Price_per_square_feet'])
# Target: the listing price.
y = df['price']

# Model

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Linear regression estimator; used as the final step of the pipeline built below.
LR = LinearRegression()

In [None]:
# One-hot encode the categorical 'location' column and pass every other column
# through unchanged.
# handle_unknown='ignore' encodes a location that was not seen during fit as an
# all-zero row; without it, predict() raises when a location only occurs in the
# test split.
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough'
)
# Standardize all transformed features before the regression.
scaler = StandardScaler()

In [None]:
# Chain the steps: encode 'location', standardize the features, then fit the regression.
pipe_line = make_pipeline(column_trans,scaler,LR)

# Fit The Model

In [None]:
# Fit the encoder, scaler and regressor on the training split only.
pipe_line.fit(X_train,y_train)

# Predictions

In [None]:
# Predict prices for the held-out test rows.
y_pred = pipe_line.predict(X_test)

In [None]:
# Score the fitted pipeline on the test split (for a regressor this is the R^2 coefficient).
pipe_line.score(X_test,y_test)

# Evaluate The Model

In [None]:
# Report the regression metrics on the test split; MSE is computed once and
# reused for the RMSE.
mse = mean_squared_error(y_test, y_pred)
metric_rows = (
    ('R2 score', r2_score(y_test, y_pred)),
    ('MAE', mean_absolute_error(y_test, y_pred)),
    ('MSE', mse),
    ('RMSE', np.sqrt(mse)),
)
for metric_name, metric_value in metric_rows:
    print(metric_name, metric_value)

In [None]:
# Side-by-side table of actual and predicted prices; show the first 10 rows.
comparison = {'Actual Price': y_test.to_numpy(), 'Predicted Price': y_pred}
results = pd.DataFrame(comparison)
print(results.iloc[:10])

In [None]:
# Side-by-side table of actual and predicted prices; show the last 10 rows.
comparison = {'Actual Price': y_test.to_numpy(), 'Predicted Price': y_pred}
results = pd.DataFrame(comparison)
print(results.iloc[-10:])

In [None]:
# Scatter of predicted against actual prices.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Prices")

# Dashed red diagonal marks where predicted equals actual.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--')
plt.show()