<a href="https://colab.research.google.com/github/martinpius/Practical_1/blob/main/Day_5_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary packages for today's lesson
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Load the California housing dataset with fetch_california_housing.
# The returned object is used below through its `data`, `feature_names`
# and `target` attributes.
california = fetch_california_housing()

In [None]:
# Build the working DataFrame: one column per covariate, named after the
# feature names shipped with the dataset.
dfm = pd.DataFrame(california.data, columns=california.feature_names)

In [None]:
# Observe the first few rows: at this point the DataFrame holds the covariates only
display(dfm.head(4))

In [None]:
# Add the target (the response variable) to the DataFrame above

In [None]:
# Attach the response variable as a new column named "MedHouseVal"
dfm["MedHouseVal"] = california.target

In [None]:
# Observe the first few rows again (the target column is now present)
display(dfm.head(4))

In [None]:
# Change the target into USD by multiplying with 100000.
# The value is computed from the raw `california.target` rather than from the
# column itself, so re-running this cell always gives the same result instead
# of multiplying the column by 100000 a second time.
USD_PER_TARGET_UNIT = 100000
dfm["MedHouseVal"] = california.target * USD_PER_TARGET_UNIT

In [None]:
# Observe the data again to see the effect of the unit conversion
display(dfm.head(5))

In [None]:
# See how many rows and columns we have: .shape is a (rows, columns) tuple
dfm.shape

In [None]:
# Renaming columns for clarity (whenever necessary): this is only to
# demonstrate the rename functionality in Pandas

In [None]:
# Syntax is: dfm.rename(columns = {"oldname":"newname"}) or, equivalently, dfm.rename(mapper = {"oldname":"newname"}, axis = 1)

In [None]:
# Demonstrate column renaming.
# `columns=` is required: a bare `mapper=` defaults to axis 0 and tries to
# rename the row labels, which leaves every column name untouched.
# The renamed frame is stored under its own name so that `dfm` keeps the
# original column names ('MedInc', 'AveRooms', ...) that the rest of this
# notebook refers to.
dfm_renamed = dfm.rename(columns = {
    'MedInc': 'MedianIncome',
    'AveRooms': 'AvgRooms',
    'AveBedrms': 'AvgBedrooms',
    'AveOccup': 'AvgOccupancy'
})

# Show the renamed columns
display(dfm_renamed.head())

In [None]:
# Inspect the first rows of the DataFrame after the rename step
display(dfm.head())

In [None]:
# Investigate any missing values {you can use .isna() or .isnull()}

In [None]:
# Number of missing values in each column, using .isna()
dfm.isna().sum()

In [None]:
# The same per-column count, this time using .isnull()
dfm.isnull().sum()

In [None]:
# Our data has no missing values. Let us simulate some for the "MedInc" column

In [None]:
# Blank out "MedInc" for row labels 10 through 20.
# .loc slices include both end points, so this affects 11 rows.
dfm.loc[10:20, 'MedInc'] = np.nan

In [None]:
# Observe the change: "MedInc" should now report 11 missing values
dfm.isna().sum()

In [None]:
# Fill the missing values (the imputation method depends on the variable, as discussed in class; several approaches are available). Here we fill with the median income

In [None]:
# Median imputation: .median() skips the missing entries, and the resulting
# value replaces every NaN in the column.
median_income = dfm["MedInc"].median()
dfm["MedInc"] = dfm["MedInc"].fillna(median_income)

In [None]:
# Check again: every column should now report zero missing values
dfm.isna().sum()

In [None]:
# Print out the metadata [column names, etc]

In [None]:
# Print the column names, then the dtype and missing-value count per column
column_names = dfm.columns.tolist()
print("Columns:", column_names)
for heading, report in (("\nData types:\n", dfm.dtypes),
                        ("\nMissing values:\n", dfm.isnull().sum())):
    print(heading, report)

In [None]:
# Create categorical income groups from the median income.
# Bins are right-inclusive: (0, 3], (3, 6], (6, 9], (9, inf).
# The last bin is open-ended on purpose: the dataset contains incomes slightly
# above 15, and with a hard upper edge of 15 those rows fall outside every bin
# and end up with a missing (NaN) category.
dfm['IncomeCategory'] = pd.cut(dfm['MedInc'],
                              bins=[0, 3, 6, 9, np.inf],
                              labels=['Low', 'Medium', 'High', 'Very High'])

In [None]:
# See the change: the new "IncomeCategory" column is now part of the DataFrame
display(dfm.head(3))

In [None]:
# Boolean mask for "coastal" properties, defined here as a latitude/longitude
# bounding box (the ranges are given for this exercise).
# Series.between includes both end points, matching >= and <=.
lat_in_range = dfm['Latitude'].between(34, 38)
lon_in_range = dfm['Longitude'].between(-124, -118)
coastal_mask = lat_in_range & lon_in_range

In [None]:
# Check the mask: one True/False flag per row of dfm
coastal_mask.head(4)

In [None]:
# Keep only the rows flagged by the mask. The explicit .copy() gives an
# independent DataFrame, so new columns can be added to it later without
# touching dfm.
coastal_housing = dfm.loc[coastal_mask].copy()

In [None]:
# Print the number of rows selected by the coastal mask
print(f"Found {len(coastal_housing)} coastal properties")

In [None]:
import warnings
warnings.filterwarnings(action = "ignore")

In [None]:
# Group analysis by income category: mean house value, largest average
# bedroom count and total population per group.
# `observed=False` is spelled out because the grouping key is categorical:
# it keeps every category in the result and makes explicit the behaviour that
# newer pandas versions warn about when it is left implicit.
income_stats = dfm.groupby('IncomeCategory', observed=False).agg(
    AvgHouseValue=('MedHouseVal', 'mean'),
    MaxBedrooms=('AveBedrms', 'max'),
    TotalPopulation=('Population', 'sum')
).reset_index()

In [None]:
# Display the per-category summary table
income_stats

In [None]:
dfm.describe() # General summary (descriptive statistics for the numeric variables)

In [None]:
# Feature engineering: house value divided by the average number of rooms.
# The column is added to the coastal subset only.
coastal_housing['PricePerRoom'] = coastal_housing['MedHouseVal'].div(coastal_housing['AveRooms'])

In [None]:
# See the change: "PricePerRoom" now appears in the coastal subset
coastal_housing.head(2)

In [None]:
# Splitting and merging datasets in Pandas.
# The split is by columns, so both subsets keep the same number of rows.
geo_columns = ['Latitude', 'Longitude']
geo_data = dfm[geo_columns + ['MedHouseVal']]   # geospatial columns plus the target
economic_data = dfm.drop(columns=geo_columns)   # everything except the coordinates

In [None]:
# See what you now have: (rows, columns) of the geospatial subset
geo_data.shape

In [None]:
# (rows, columns) of the remaining, non-geospatial subset
economic_data.shape

In [None]:
# Merge the two pieces column-wise to reconstruct the original data.
# Both pieces carry 'MedHouseVal', so a plain concat would contain that column
# twice; only the first occurrence of each column name is kept.
merged_df = pd.concat([geo_data, economic_data], axis = 1)
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [None]:
# Inspect the first rows of the merged DataFrame
merged_df.head(2)

In [None]:
# Export the cleaned dataset for future use. To save into Google Drive, mount
# the drive and change the working directory to the folder of your choice
# first (covered in a previous class).

# Cleaned data -> CSV, without the row index
dfm.to_csv(path_or_buf='merged_df.csv', index=False)

# Income statistics -> Excel, without the row index
# NOTE(review): the file is named 'merged_df.xlsx' although it holds the
# income statistics - confirm the intended file name.
income_stats.to_excel(excel_writer='merged_df.xlsx', index=False)

In [None]:
# Basic descriptive statistics on a subset of the columns
summary_columns = ['MedHouseVal', 'MedInc', 'HouseAge']
print("Global Statistics:")
print(dfm[summary_columns].describe())


In [None]:
# Skewness and kurtosis of house value and income (refer to your basic
# statistics class for their interpretation)
shape_stats = dfm[['MedHouseVal', 'MedInc']].agg(['skew', 'kurtosis'])
print("\nDistribution Shape:")
print(shape_stats)

In [None]:
# Correlation matrix of all numeric columns (kept in full for the heatmap)
corr_matrix = dfm.corr(numeric_only=True)

# Rank the features by their correlation with the house value. The target's
# correlation with itself is always 1.0 and carries no information, so it is
# dropped before taking the top five.
house_value_corr = corr_matrix['MedHouseVal'].drop(labels='MedHouseVal')
print("\nTop Correlations with House Value:")
print(house_value_corr.sort_values(ascending=False).head(5))

In [None]:
# Distribution analysis: two panels side by side
fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: house value distribution (histogram with a KDE overlay)
sns.histplot(dfm['MedHouseVal'], kde=True, bins=30, ax=ax_hist)
ax_hist.set_title('House Value Distribution')

# Right panel: income vs. house value
# Change alpha for more resolution
sns.scatterplot(data=dfm, x='MedInc', y='MedHouseVal', alpha=0.3, ax=ax_scatter)
ax_scatter.set_title('Income vs House Value')

fig.tight_layout();

In [None]:
# Anomaly detection: flag house values lying more than 1.5 * IQR below the
# first quartile or above the third quartile.
house_values = dfm['MedHouseVal']
Q1, Q3 = house_values.quantile(0.25), house_values.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
is_outlier = (house_values < lower_fence) | (house_values > upper_fence)
outliers = dfm[is_outlier]

print(f"Found {len(outliers)} price outliers ({len(outliers)/len(dfm):.1%} of data)")
print("Outlier characteristics:")
print(outliers[['MedInc', 'HouseAge', 'AveRooms']].describe())

In [None]:
# For spatial analytics, bin latitude and longitude into quantile-based
# clusters. q=5 requests five bins; labels=False stores integer bin codes
# instead of interval labels.
for coord_col, cluster_col in (('Latitude', 'LatCluster'),
                               ('Longitude', 'LonCluster')):
    dfm[cluster_col] = pd.qcut(dfm[coord_col], q=5, labels=False)

In [None]:
# See the change: the "LatCluster" and "LonCluster" columns are now present
display(dfm.head())

In [None]:
# See the distinct values taken by a categorical variable
dfm.LatCluster.unique()

In [None]:
# Analyse spatial patterns: each point is one row of dfm placed at its
# coordinates and coloured by house value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=dfm, x='Longitude', y='Latitude', hue='MedHouseVal',
                palette='viridis', alpha=0.8, ax=ax)
ax.set_title('Geographic Price Distribution');

In [None]:
# TODO: Interpret the above plot as we discussed earlier in class

In [None]:
# Pair plot of the key features to demonstrate multivariate analysis (MVA)
pair_columns = ['MedHouseVal', 'MedInc', 'HouseAge', 'AveRooms']
sns.pairplot(dfm[pair_columns], plot_kws=dict(alpha=0.5))
# y > 1 places the title above the grid of panels
plt.suptitle('Multivariate Relationships', y=1.04);

In [None]:
# Correlation heatmap to identify multicollinearity.
# The mask hides the upper triangle (diagonal included), so each pair of
# features is shown only once.
fig, ax = plt.subplots(figsize=(10, 6))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix');

In [None]:
# TODO: Interpret the above correlation matrix as we did in class

In [None]:
# Creating groups for t-test

In [None]:
# House values of the two groups compared by the t-test, selected by
# latitude cluster.
# NOTE(review): the groups are latitude quintiles (2-3 vs 0 and 4) - confirm
# that these really correspond to coastal vs inland areas.
coastal = dfm.loc[dfm['LatCluster'].isin([2, 3]), 'MedHouseVal']  # Coastal clusters
inland = dfm.loc[dfm['LatCluster'].isin([0, 4]), 'MedHouseVal']   # Inland clusters

In [None]:
# Two-sample t-test on the two groups; equal_var=False selects the variant
# that does not assume equal variances.
ttest_result = stats.ttest_ind(coastal, inland, equal_var=False)
t_stat, p_val = ttest_result
print(f"T-statistic: {t_stat:.2f}, p-value: {p_val:.4f}")

# Decision at the 5% significance level
verdict = (
    "Reject H₀: Significant price difference between coastal and inland properties"
    if p_val < 0.05
    else "No significant difference detected"
)
print(verdict)

In [None]:
# TODO: Recall your statistical inference to interpret the above outputs

In [None]:
# Prepare the data groups for the one-way ANOVA: one Series of house values
# per income category, in the listed order.
income_groups = []
for cat in ['Low', 'Medium', 'High', 'Very High']:
    in_category = dfm['IncomeCategory'] == cat
    income_groups.append(dfm.loc[in_category, 'MedHouseVal'])


In [None]:
# One-way ANOVA across the income groups (each group is passed as a separate
# positional argument)
anova_result = stats.f_oneway(*income_groups)
f_stat, p_val = anova_result
print(f"F-statistic: {f_stat:.1f}, p-value: {p_val:.4f}")

In [None]:
# TODO: Recall your statistical inference to interpret the above outputs

In [None]:
# Exercise: use a dataset of your own, or the Boston housing data from my GitHub repo.
# Open a new notebook and perform EDA on it.

In [None]:
#===============================================================================