<a href="https://colab.research.google.com/github/luckyyyman/git-workshop/blob/main/RealEstate_EDAV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

# Ask the user to upload the workbook. The code below treats the result as a
# mapping keyed by uploaded file name (it is only iterated for its keys).
uploaded = files.upload()

# Read the file that was actually uploaded rather than assuming its name:
# the expected default is used when it is among the uploads, or when nothing
# was uploaded (so a copy already present in the runtime still loads);
# otherwise the first uploaded file is taken.
default_file_name = "Hyderabad_Real_Estate_Big_Dataset.xlsx"
if not uploaded or default_file_name in uploaded:
    file_name = default_file_name
else:
    file_name = next(iter(uploaded))
df = pd.read_excel(file_name)

# Trim stray whitespace from the headers; str() keeps this from crashing when
# a header cell holds a number or date instead of text.
df.columns = [str(c).strip() for c in df.columns]

def find_col(names, columns=None):
    """Return the first column label that contains one of the given keywords.

    Keywords are tried in the order given, so earlier entries in ``names``
    take priority over later ones; within one keyword, columns are scanned
    in their existing order. Matching is a case-insensitive substring test.

    Args:
        names: Iterable of keyword strings to look for, highest priority first.
        columns: Optional iterable of column labels to search. When omitted,
            the columns of the module-level ``df`` are searched.

    Returns:
        The matching column label exactly as it appears in ``columns``,
        or ``None`` when no keyword matches any column.
    """
    if columns is None:
        columns = df.columns

    # Lowercase every label once up front; str() tolerates non-string labels
    # (e.g. numeric headers) that have no .lower() method.
    lowered_labels = [(label, str(label).lower()) for label in columns]

    for keyword in names:
        needle = keyword.lower()
        for label, lowered in lowered_labels:
            if needle in lowered:
                return label
    return None

# Locate the source columns by keyword; each result is None when no header matches.
price_col = find_col(['price', 'amount', 'sale'])
city_col = find_col(['city', 'location', 'area', 'place'])
type_col = find_col(['type', 'property'])
locality_col = find_col(['locality', 'neighbourhood'])

# Report the detection result first so it is visible even if validation fails.
print("Detected columns:")
print("Price:", price_col, "| City:", city_col, "| Type:", type_col, "| Locality:", locality_col)

# Fail early with a readable message instead of an opaque `KeyError: None`
# from indexing the frame with an undetected column.
undetected = [
    label
    for label, col in (
        ('Price', price_col),
        ('City', city_col),
        ('Type', type_col),
        ('Locality', locality_col),
    )
    if col is None
]
if undetected:
    raise ValueError(
        "Could not detect column(s) for: " + ", ".join(undetected)
        + ". Available columns: " + ", ".join(map(str, df.columns))
    )

# Strip everything except digits and dots (currency symbols, commas, text),
# then parse. Values that still are not valid numbers (empty strings,
# residue such as "1.2.3") become NaN instead of raising.
df['Price'] = pd.to_numeric(
    df[price_col].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
).astype(float)

# Fill missing values BEFORE converting to str: astype(str) turns NaN into
# the literal string 'nan', after which fillna has nothing left to replace.
df['City'] = df[city_col].fillna('Unknown').astype(str)
df['Type'] = df[type_col].fillna('Unknown').astype(str)
df['Locality'] = df[locality_col].fillna('Unknown').astype(str)

# Q1: Calculate average property price with NumPy
# Work on a plain float array and use the NaN-aware reducers so that all three
# statistics ignore missing prices consistently. np.median on data containing
# NaN yields NaN, whereas np.mean/np.std on a Series skip NaN.
price_values = df['Price'].to_numpy(dtype=float)
mean_price = np.nanmean(price_values)
median_price = np.nanmedian(price_values)
std_price = np.nanstd(price_values)  # population std (ddof=0), as np.std defaults to
print("\nQ1 → Average Price (Mean):", round(mean_price, 2))
print("Median:", round(median_price, 2), "| Std Dev:", round(std_price, 2))

# Q2: Filter properties by city and type using Pandas
# The most frequent city and the most frequent property type serve as the
# filter criteria; rows must match both.
top_city = df['City'].mode()[0]
top_type = df['Type'].mode()[0]
selection = df['City'].eq(top_city) & df['Type'].eq(top_type)
filtered = df.loc[selection]
print(f"\nQ2 → Properties in {top_city} of type {top_type}: {len(filtered)} found.")
# Preview the first five matching rows.
display(filtered.iloc[:5])

# Q3: Handle missing property attribute data
print("\nMissing values before:")
print(df.isnull().sum())

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the in-place call acts on an intermediate object, which pandas 2.x
# warns about and which leaves df unchanged under Copy-on-Write.
# Missing prices take the median price; missing labels become 'Unknown'.
df['Price'] = df['Price'].fillna(df['Price'].median())
df['Locality'] = df['Locality'].fillna('Unknown')
df['Type'] = df['Type'].fillna('Unknown')

print("\nMissing values after:")
print(df.isnull().sum())

# Q4: Group properties by city and locality
# One row per (City, Locality) pair with the number of priced listings and
# the mean and median price of that pair.
grouped = (
    df.groupby(['City', 'Locality'])
    .agg(
        Listings=pd.NamedAgg(column='Price', aggfunc='count'),
        Avg_Price=pd.NamedAgg(column='Price', aggfunc='mean'),
        Median_Price=pd.NamedAgg(column='Price', aggfunc='median'),
    )
    .reset_index()
)

print("\nQ4 → Top localities by listings:")
# Show the ten pairs with the most listings.
by_listings = grouped.sort_values(by='Listings', ascending=False)
display(by_listings.head(10))

# Q5: Plot price trends for top cities using Matplotlib
# Bar chart of the mean price per city, highest average first.
plt.figure(figsize=(10, 5))
city_means = df.groupby('City')['Price'].mean()
avg_city_price = city_means.sort_values(ascending=False)
ax = avg_city_price.plot.bar()
ax.set_title("Average Property Price by City")
ax.set_xlabel("City")
ax.set_ylabel("Average Price (INR)")
plt.xticks(rotation=45)
plt.show()

# Box plot of the price distribution for the five cities with the most listings.
plt.figure(figsize=(10, 5))
top_cities = df['City'].value_counts().head(5).index
# Drop missing prices per city: NaN values would otherwise turn the quartile
# computation into NaN and leave that city's box empty.
data = [df.loc[df['City'] == c, 'Price'].dropna() for c in top_cities]
plt.boxplot(data, showfliers=False)
# Label the boxes via xticks (boxes sit at positions 1..N by default) instead
# of boxplot's `labels=` keyword, which Matplotlib 3.9 deprecates in favour of
# `tick_labels=`; this form works on both older and newer releases.
plt.xticks(range(1, len(top_cities) + 1), top_cities)
plt.title("Price Distribution of Top 5 Cities")
plt.ylabel("Price (INR)")
plt.show()

print("✅ All five questions executed successfully!")
