Data Processing and Cleaning

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the raw listings into a DataFrame.
file_path = '/content/house.csv'  # Replace with your file path
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows to understand the structure
print(data.head())

# Handle missing values
# Numerical columns: fill gaps with each column's median.
# NOTE(review): if 'price' is numeric on load, the target itself is imputed
# here as well - confirm that is intended before modelling on it.
numerical_cols = data.select_dtypes(include=['number']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# Categorical (object) columns: fill gaps with the most frequent value.
categorical_cols = data.select_dtypes(include=['object']).columns
for column in categorical_cols:
    modes = data[column].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; indexing it with [0] would raise, so such a column is left as-is.
    if not modes.empty:
        data[column] = data[column].fillna(modes.iloc[0])

# Convert data types if necessary
data['price'] = data['price'].astype(float)

# Remove duplicate rows
data = data.drop_duplicates()




   property_id  location_id  \
0       237062         3325   
1       346905         3236   
2       386513          764   
3       656161          340   
4       841645         3226   

                                            page_url property_type     price  \
0  https://www.zameen.com/Property/g_10_g_10_2_gr...          Flat  10000000   
1  https://www.zameen.com/Property/e_11_2_service...          Flat   6900000   
2  https://www.zameen.com/Property/islamabad_g_15...         House  16500000   
3  https://www.zameen.com/Property/islamabad_bani...         House  43500000   
4  https://www.zameen.com/Property/dha_valley_dha...         House   7000000   

      location       city      province_name   latitude  longitude  baths  \
0         G-10  Islamabad  Islamabad Capital  33.679890  73.012640      2   
1         E-11  Islamabad  Islamabad Capital  33.700993  72.971492      3   
2         G-15  Islamabad  Islamabad Capital  33.631486  72.926559      6   
3    Bani Gala  Islamaba

Feature Engineering and Encoding

In [4]:
# Feature Engineering
# Create a new feature: Total Area in square meters if area type is given
def convert_area(row):
    """Return the row's area, converting square feet to square meters.

    The size and unit are looked up under either spelling of the column
    names: ``area_size`` / ``area_type`` or ``Area Size`` / ``Area Type``.
    Looking up only the lower-case names returns None for every row when the
    frame uses the spaced, capitalised headers.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must support ``.get(key)``.

    Returns
    -------
    The area size multiplied by 0.092903 when the unit is "square feet"
    (case-insensitive, surrounding whitespace ignored); the area size
    unchanged for any other unit; ``None`` when the size or the unit is
    missing.
    """
    def _first_present(*keys):
        # Return the first non-null value found under any of the given keys.
        for key in keys:
            value = row.get(key)
            if not pd.isnull(value):
                return value
        return None

    size = _first_present('area_size', 'Area Size')
    unit = _first_present('area_type', 'Area Type')
    if size is None or unit is None:
        return None

    # str() guards against a non-string unit value (e.g. an already-encoded column).
    unit = str(unit).strip().lower()
    if unit == 'square feet':
        return size * 0.092903
    # "square meter" and every unrecognised unit pass through unchanged.
    # NOTE(review): other units are NOT converted to square meters here -
    # add explicit factors once the dataset's unit definitions are confirmed.
    return size

# Derive the TotalArea column by running convert_area over every row.
data['TotalArea'] = data.apply(convert_area, axis=1)

# Integer-encode each categorical column, keeping one fitted encoder per
# column so the codes can be mapped back to the original labels later.
label_encoders = {column: LabelEncoder() for column in categorical_cols}
for column, le in label_encoders.items():
    data[column] = le.fit_transform(data[column])

# Standardize every int64/float64 column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down - confirm this is acceptable for the evaluation.
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler().fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])

# Preview the cleaned and transformed frame.
print(data.head())

   property_id  location_id  page_url  property_type     price  location  \
0    -6.822669    -0.278279 -0.366027      -0.913132 -0.219931 -0.235879   
1    -6.773876    -0.301846 -0.651326      -0.913132 -0.307725 -0.431774   
2    -6.756282    -0.956412  0.307869      -0.128626 -0.035847 -0.222461   
3    -6.636502    -1.068683  0.059711      -0.128626  0.728810 -0.979206   
4    -6.554109    -0.304494 -0.659799      -0.128626 -0.304893 -0.603517   

       city  province_name  latitude  longitude  ...      area   purpose  \
0 -1.107349      -1.515826  1.003286   0.565853  ...  0.478355  0.629362   
1 -1.107349      -1.515826  1.008828   0.552720  ...  0.828134  0.629362   
2 -1.107349      -1.515826  0.990574   0.538378  ...  1.306779  0.629362   
3 -1.107349      -1.515826  1.010556   0.610079  ... -0.129156  0.629362   
4 -1.107349      -1.515826  0.954099   0.658000  ...  1.306779  0.629362   

   bedrooms  date_added    agency     agent  Area Type  Area Size  \
0 -0.598268   -2.

Model Selection & Training

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# The numeric columns were already standardized by the StandardScaler fitted in
# the feature-engineering cell. Fitting a second scaler here would leave the
# values effectively unchanged but overwrite `scaler` with statistics of the
# already-scaled data (mean ~0, scale ~1), losing the original means/scales
# needed to inverse-transform predictions. So no re-scaling is done here.

# Prepare data for modeling: 'price' is the target; the listing URL, the
# listing id and the derived TotalArea column are excluded from the features.
X = data.drop(columns=['price', 'page_url', 'property_id', 'TotalArea'])
y = data['price']


In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares linear regression on the training portion.
model = LinearRegression().fit(X_train, y_train)

# Predict on both portions so train and test error can be compared.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)



Evaluation

In [11]:
# Score the predictions on each portion: mean squared error and R^2.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# Report the four metrics, one per line.
print(
    f"Train MSE: {mse_train}",
    f"Test MSE: {mse_test}",
    f"Train R2: {r2_train}",
    f"Test R2: {r2_test}",
    sep="\n",
)

Train MSE: 0.677700519262822
Test MSE: 0.6125960035614547
Train R2: 0.33400463479676057
Test R2: 0.3410693106299939
