In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV into `data`, the DataFrame every later cell works on.
# Replace with your cleaned dataset path
data = pd.read_csv(filepath_or_buffer="data.csv")



In [None]:

# Display basic info about the dataset.
print("Dataset Overview:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
data.info()
print("\nMissing Values:")
print(data.isnull().sum())
print("\nSummary Statistics:")
print(data.describe())

# Convert price/rent columns to float by stripping "$" and "," characters.
# NOTE(review): 'Median House Price (2020)' is plotted and subtracted in later
# cells but is not cleaned here -- confirm it is already numeric.
columns_to_clean = [
    'Median House Price (2021)',
    'Median Apartment Price (2020)',
    'Median House Rent (per week)',
    'Median Apartment Rent (per week)'
]
for column in columns_to_clean:
    # Raw string: "\$" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    data[column] = data[column].replace(r'[\$,]', '', regex=True).astype(float)

# Columns whose distributions are plotted in the univariate-analysis cell.
numeric_columns = [
    'Median House Price (2020)',
    'Median House Price (2021)',
    'Median House Rent (per week)',
    'Median Apartment Rent (per week)',
    'Affordability Index',
    'Price Change (%)'
]


In [None]:
# Python Exploratory Data Analysis

In [None]:

# Univariate analysis: one histogram with a KDE overlay for every column
# listed in `numeric_columns`.
print("\nUnivariate Distributions:")
for column_name in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data[column_name], kde=True, bins=20, color="blue", ax=ax)
    ax.set_title(f"Distribution of {column_name}")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Frequency")
    plt.show()


In [None]:

# Bivariate analysis: correlation heatmap for the numeric variables.
# Keep only numeric dtypes before calling corr(): the frame also carries
# columns such as 'Name' and 'Region' (presumably text), and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(12, 8))
correlation_matrix = data.select_dtypes(include="number").corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()


In [None]:

# Affordability analysis by region: one box per 'Region' showing the spread
# of the 'Affordability Index' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Region', y='Affordability Index', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Affordability Index by Region")
plt.show()


In [None]:

# Pairwise scatter plots (KDE on the diagonal) for four key columns.
pairplot_columns = [
    'Median House Price (2021)',
    'Median House Rent (per week)',
    'Affordability Index',
    'Safety',
]
pairplot_frame = data[pairplot_columns]
sns.pairplot(pairplot_frame, diag_kind="kde", plot_kws={'alpha': 0.6})
# The title is attached to the figure that pairplot just created.
plt.suptitle("Pairwise Relationships of Key Variables", y=1.02)
plt.show()


In [None]:

# Time trend: absolute change in median house price from 2020 to 2021,
# stored on `data` as 'Price Difference' and shown as a histogram.
data['Price Difference'] = data['Median House Price (2021)'].sub(data['Median House Price (2020)'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Price Difference'], kde=True, bins=20, color="green", ax=ax)
ax.set(
    title="Distribution of Price Changes (2020-2021)",
    xlabel="Price Difference",
    ylabel="Frequency",
)
plt.show()


In [None]:

# Ten highest and ten lowest suburbs by 'Affordability Index'.
suburb_affordability = data[['Name', 'Affordability Index']]
top_affordable = suburb_affordability.nlargest(10, 'Affordability Index')
bottom_affordable = suburb_affordability.nsmallest(10, 'Affordability Index')
for heading, table in (
    ("\nTop 10 Most Affordable Suburbs:", top_affordable),
    ("\nTop 10 Least Affordable Suburbs:", bottom_affordable),
):
    print(heading)
    print(table)

# Safety vs affordability, coloured by region; legend placed outside the axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Safety', y='Affordability Index', hue='Region', palette="viridis", ax=ax)
ax.set_title("Safety vs Affordability Index")
ax.set_xlabel("Safety")
ax.set_ylabel("Affordability Index")
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Suburbs with highest and lowest affordability (with their region).
most_affordable = data.nlargest(10, 'Affordability Index')[['Name', 'Region', 'Affordability Index']]
least_affordable = data.nsmallest(10, 'Affordability Index')[['Name', 'Region', 'Affordability Index']]

print("\nTop 10 Most Affordable Suburbs:")
print(most_affordable)

print("\nTop 10 Least Affordable Suburbs:")
print(least_affordable)

# Visualize affordability across regions.
# `region_summary` is not defined by any earlier cell, so the barplot raised a
# NameError on a fresh kernel. Build the per-region mean here, as a flat frame
# with plain 'Region' and 'Affordability Index' columns.
region_summary = data.groupby('Region', as_index=False)['Affordability Index'].mean()

plt.figure(figsize=(14, 8))
sns.barplot(data=region_summary, x='Region', y='Affordability Index', palette='coolwarm')
plt.title("Average Affordability Index by Region")
plt.xticks(rotation=45)
plt.ylabel("Average Affordability Index")
plt.show()


In [None]:
# Single-variable boxplots used to eyeball outliers: first the 2021 median
# house price, then the percentage price change.
for value_column, chart_title in (
    ('Median House Price (2021)', "Boxplot for Median House Prices (2021)"),
    ('Price Change (%)', "Boxplot for Price Change (%)"),
):
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=data, y=value_column)
    plt.title(chart_title)
    plt.show()


In [None]:
# Horizontal count plot of suburbs per 'Region', most frequent first.
region_order = data['Region'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, y='Region', order=region_order, palette='Spectral', ax=ax)
ax.set(title="Number of Suburbs by Region", xlabel="Count", ylabel="Region")
plt.show()

# Same chart for the 'Ideal for' column.
ideal_for_order = data['Ideal for'].value_counts().index
fig, ax = plt.subplots(figsize=(14, 8))
sns.countplot(data=data, y='Ideal for', order=ideal_for_order, palette='Set2', ax=ax)
ax.set(title="Distribution of Suburbs by 'Ideal for'", xlabel="Count", ylabel="Ideal for")
plt.show()


In [None]:
# Analyze time to CBD (Driving vs Public Transport).
# The two travel-time columns are only converted to numbers by a later cell,
# so on a fresh top-to-bottom run they may still hold text here. Parse the
# first run of digits into a float on a local frame (leaving `data` untouched)
# so both scatter axes are numeric.
# NOTE(review): assumes the first number in each value is the travel time -- confirm.
pt_time_col = 'Time to CBD (Public Transport) [Town Hall St]'
drive_time_col = 'Time to CBD (Driving) [Town Hall St]'
cbd_times = pd.DataFrame({
    time_col: data[time_col].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    for time_col in (pt_time_col, drive_time_col)
})

plt.figure(figsize=(8, 6))
sns.scatterplot(data=cbd_times, x=pt_time_col, y=drive_time_col)
plt.title("Public Transport vs Driving Time to CBD")
plt.xlabel("Time to CBD (Public Transport)")
plt.ylabel("Time to CBD (Driving)")
plt.show()

# Average public transport rating by region, highest first.
avg_transport_region = data.groupby('Region')['Public Transport'].mean().sort_values(ascending=False)
print("\nAverage Public Transport Rating by Region:")
print(avg_transport_region)

plt.figure(figsize=(12, 6))
avg_transport_region.plot(kind='bar', color='steelblue')
plt.title("Average Public Transport Rating by Region")
plt.ylabel("Average Public Transport Rating")
plt.xlabel("Region")
plt.show()


In [None]:
# Spread of the 'Safety' score within each region.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=data, x='Region', y='Safety', ax=ax)
ax.set_title("Safety Scores by Region")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Safety Score")
plt.show()

# 'Family-Friendliness' against 'Safety', one colour per region.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Family-Friendliness', y='Safety', hue='Region', palette='tab10', ax=ax)
ax.set(
    title="Family-Friendliness vs Safety by Region",
    xlabel="Family-Friendliness",
    ylabel="Safety",
)
plt.show()


In [None]:
# Per-region means of the key metrics. The mapping gives, for each source
# column, the clearer "Avg ..." name it carries in `region_trends`.
metric_labels = {
    'Median House Price (2021)': 'Avg House Price (2021)',
    'Price Change (%)': 'Avg Price Change (%)',
    'Affordability Index': 'Avg Affordability Index',
    'Public Transport': 'Avg Public Transport Rating',
    'Safety': 'Avg Safety Rating',
    'Family-Friendliness': 'Avg Family-Friendliness',
}

region_trends = (
    data.groupby('Region')[list(metric_labels)]
    .mean()
    .rename(columns=metric_labels)
    .reset_index()
)

print("Regional Trends Summary:")
print(region_trends)


In [None]:
# Bar plot: average house price by region.
plt.figure(figsize=(14, 8))
sns.barplot(data=region_trends, x='Region', y='Avg House Price (2021)', palette='coolwarm')
plt.title("Average House Price (2021) by Region")
plt.ylabel("Average House Price (2021)")
plt.xticks(rotation=45)
plt.show()

# Line plot: price change and affordability index by region on twin y-axes.
fig, ax1 = plt.subplots(figsize=(14, 8))

# legend=False on both lines: seaborn would otherwise draw one legend per
# axes; a single combined legend is built further down instead.
sns.lineplot(data=region_trends, x='Region', y='Avg Price Change (%)', label='Avg Price Change (%)', marker='o', legend=False, ax=ax1)
ax1.set_ylabel("Average Price Change (%)", color="blue")
ax1.tick_params(axis='y', labelcolor="blue")
# Rotate the existing tick labels rather than calling set_xticklabels(), which
# warns when the tick positions have not been fixed first.
ax1.tick_params(axis='x', rotation=45)

ax2 = ax1.twinx()  # Create a twin y-axis
sns.lineplot(data=region_trends, x='Region', y='Avg Affordability Index', label='Avg Affordability Index', color='red', marker='o', legend=False, ax=ax2)
ax2.set_ylabel("Average Affordability Index", color="red")
ax2.tick_params(axis='y', labelcolor="red")

plt.title("Price Change (%) and Affordability Index by Region")
# Merge the handles of both axes so the legend lists both lines.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc='upper left')
plt.show()

# Bar plot: public transport and safety ratings by region (shared x-axis).
fig, axes = plt.subplots(2, 1, figsize=(14, 12), sharex=True)

sns.barplot(data=region_trends, x='Region', y='Avg Public Transport Rating', ax=axes[0], palette='viridis')
axes[0].set_title("Average Public Transport Rating by Region")
axes[0].set_ylabel("Avg Public Transport Rating")

sns.barplot(data=region_trends, x='Region', y='Avg Safety Rating', ax=axes[1], palette='magma')
axes[1].set_title("Average Safety Rating by Region")
axes[1].set_ylabel("Avg Safety Rating")
axes[1].set_xlabel("Region")

# Only the bottom panel is rotated; it is the one targeted explicitly here.
axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Spread of the 2021 median house price within each region.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=data, x='Region', y='Median House Price (2021)', palette='Set3', ax=ax)
ax.set_title("Distribution of Median House Prices (2021) by Region")
ax.set_ylabel("Median House Price (2021)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Three highest and three lowest regions by average affordability index,
# taken from the `region_trends` summary.
region_affordability = region_trends[['Region', 'Avg Affordability Index']]
best_affordability = region_affordability.nlargest(3, 'Avg Affordability Index')
worst_affordability = region_affordability.nsmallest(3, 'Avg Affordability Index')

print("Top 3 Most Affordable Regions:")
print(best_affordability)

print("\nTop 3 Least Affordable Regions:")
print(worst_affordability)

# Same ranking for the average safety rating.
region_safety = region_trends[['Region', 'Avg Safety Rating']]
best_safety = region_safety.nlargest(3, 'Avg Safety Rating')
worst_safety = region_safety.nsmallest(3, 'Avg Safety Rating')

print("Top 3 Safest Regions:")
print(best_safety)

print("\nTop 3 Least Safe Regions:")
print(worst_safety)


In [None]:
# 10 most and 10 least affordable suburbs, each drawn as a horizontal bar chart.
suburb_scores = data[['Name', 'Affordability Index']]
top_affordable = suburb_scores.nlargest(10, 'Affordability Index')
bottom_affordable = suburb_scores.nsmallest(10, 'Affordability Index')

for ranked_suburbs, bar_palette, chart_title in (
    (top_affordable, 'Blues_d', "Top 10 Most Affordable Suburbs"),
    (bottom_affordable, 'Reds_d', "Top 10 Least Affordable Suburbs"),
):
    plt.figure(figsize=(14, 8))
    sns.barplot(x='Affordability Index', y='Name', data=ranked_suburbs, palette=bar_palette)
    plt.title(chart_title)
    plt.xlabel("Affordability Index")
    plt.ylabel("Suburb")
    plt.show()


In [None]:
# Affordability index by region: boxplot with the 'coolwarm' palette.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=data, x='Region', y='Affordability Index', palette='coolwarm', ax=ax)
ax.set_title("Affordability Index by Region")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Affordability Index")
plt.show()


In [None]:
# Price change from 2020 to 2021: (re)compute the 'Price Difference' column
# on `data` and plot its distribution.
price_2021 = data['Median House Price (2021)']
price_2020 = data['Median House Price (2020)']
data['Price Difference'] = price_2021 - price_2020

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Price Difference'], kde=True, bins=20, color="green", ax=ax)
ax.set_title("Distribution of Price Changes (2020-2021)")
ax.set_xlabel("Price Difference")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Safety vs affordability, coloured by region, legend outside the plot area.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Safety', y='Affordability Index', hue='Region', palette="viridis", ax=ax)
ax.set(
    title="Safety vs Affordability Index",
    xlabel="Safety",
    ylabel="Affordability Index",
)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Transport time to the central business district vs affordability index.
# The travel-time column is only converted to numbers by a later cell, so on a
# fresh top-to-bottom run it may still hold text here. Parse the first run of
# digits into a float on a local frame (leaving `data` untouched) so the
# x-axis is numeric.
# NOTE(review): assumes the first number in each value is the travel time -- confirm.
cbd_pt_col = 'Time to CBD (Public Transport) [Town Hall St]'
cbd_plot_frame = data[['Affordability Index', 'Region']].assign(**{
    cbd_pt_col: data[cbd_pt_col].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
})

plt.figure(figsize=(10, 6))
sns.scatterplot(x=cbd_pt_col, y='Affordability Index', data=cbd_plot_frame, hue='Region', palette="coolwarm")
plt.title("Time to CBD (Public Transport) vs Affordability Index")
plt.xlabel("Time to CBD (Public Transport)")
plt.ylabel("Affordability Index")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# In-Depth Analysis 

In [None]:
### New libraries for in-depth analysis

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error




In [None]:
# Convert the two "time to CBD" columns to floats, in place on `data`.
cbd_time_columns = [
    'Time to CBD (Public Transport) [Town Hall St]',
    'Time to CBD (Driving) [Town Hall St]',
]

# Step 1: Inspect the problematic columns
for time_column in cbd_time_columns:
    print(data[time_column].head())

# Step 2: Convert columns to string, then extract numeric parts
for time_column in cbd_time_columns:
    data[time_column] = (
        data[time_column]
        .astype(str)  # Uniform string form so the .str accessor works on every value
        # Raw string avoids the invalid "\d" escape; expand=False returns a
        # Series rather than a one-column DataFrame. Only the first run of
        # digits in each value is kept; values without digits become NaN.
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    )

# Step 3: Verify conversion
print(data[cbd_time_columns].head())


In [None]:

# Feature Importance using Linear Regression
# `key_features` was used here without being defined in any earlier cell
# (NameError on a fresh kernel).
# NOTE(review): the list below is a best guess assembled from the columns
# analysed elsewhere in this notebook -- confirm the intended feature set.
key_features = [
    'Median House Price (2021)',
    'Median House Rent (per week)',
    'Safety',
    'Family-Friendliness',
    'Time to CBD (Public Transport) [Town Hall St]',
]

# LinearRegression rejects NaN inputs, so keep only rows where every feature
# and the target are present.
model_data = data[key_features + ['Affordability Index']].dropna()
X = model_data[key_features]  # Independent variables
y = model_data['Affordability Index']  # Target variable

# Split data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:

# Extract feature importance (coefficients).
# Feature names are read from the training frame instead of `key_features`,
# which is not defined by any earlier cell; this also guarantees that names
# and coefficients stay aligned with what the model was fitted on.
# NOTE(review): the features are not scaled before fitting, so coefficient
# sizes depend on each column's units -- interpret the ranking with care.
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': model.coef_})
    .sort_values(by='Importance', ascending=False)
)
print("Feature Importance:\n", feature_importance)

# Visualize Feature Importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance, palette='Blues_r')
plt.title("Feature Importance for Affordability Index")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


In [None]:

# One scatter plot per selected feature against 'Affordability Index',
# coloured by region.
scatter_features = [
    'Safety',
    'Median House Price (2021)',
    'Time to CBD (Public Transport) [Town Hall St]',
]
for feature in scatter_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=data, x=feature, y='Affordability Index', hue='Region', palette='coolwarm', ax=ax)
    ax.set_title(f"{feature} vs. Affordability Index")
    ax.set_xlabel(feature)
    ax.set_ylabel("Affordability Index")
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

# Score the fitted regression model on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Model Mean Squared Error: {mse:.2f}")


In [None]:
# Suburbs profiling 

In [None]:
### New libraries for profiling 

In [None]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:

# Columns used for clustering; rows with a missing value in any of them are dropped.
features = [
    'Affordability Index',
    'Median House Price (2021)',
    'Safety',
    'Family-Friendliness',
]
clustering_data = data[features].dropna()

# Step 1: standardise the features with StandardScaler before K-Means.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(clustering_data)

# Step 2: elbow method -- fit K-Means for k = 2..10 and record the inertia.
range_clusters = range(2, 11)
inertia = []
for cluster_count in range_clusters:
    kmeans = KMeans(n_clusters=cluster_count, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Inertia against number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_clusters, inertia, marker='o')
ax.set(
    title="Elbow Method for Optimal Clusters",
    xlabel="Number of Clusters",
    ylabel="Inertia",
)
plt.show()



In [None]:
# Step 3: fit K-Means with the chosen cluster count and store each row's
# label in a 'Cluster' column of `clustering_data`.
optimal_clusters = 4  # Replace with the number determined from the Elbow Method
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_features)
clustering_data['Cluster'] = cluster_labels

# Step 4: per-cluster mean of every column.
cluster_summary = clustering_data.groupby('Cluster').mean()
print("Cluster Summary:\n", cluster_summary)


In [None]:

# Step 5: scatter of 'Safety' vs 'Affordability Index', coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=clustering_data, x='Safety', y='Affordability Index', hue='Cluster', palette='viridis', ax=ax)
ax.set_title("Clusters of Suburbs (Safety vs. Affordability Index)")
ax.set_xlabel("Safety")
ax.set_ylabel("Affordability Index")
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Scatter of 'Safety' vs 'Affordability Index' from `clustering_data`,
# one colour per cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=clustering_data,
    x='Safety',
    y='Affordability Index',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set(
    title="Clusters of Suburbs (Safety vs. Affordability Index)",
    xlabel="Safety",
    ylabel="Affordability Index",
)
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
from math import pi

# Radar chart data: per-cluster column means, one chart per cluster.
# NOTE(review): the means are plotted in their original units, so columns with
# large magnitudes will dominate the radial axis -- confirm this is intended.
cluster_summary = clustering_data.groupby('Cluster').mean()
categories = cluster_summary.columns
num_clusters = cluster_summary.shape[0]

# The spoke angles depend only on the number of categories, so they are
# computed once; the first angle is repeated to close the polygon.
axis_count = len(categories)
angles = [n / float(axis_count) * 2 * pi for n in range(axis_count)]
angles += angles[:1]

for cluster in range(num_clusters):
    values = cluster_summary.iloc[cluster].tolist()
    values += values[:1]  # repeat the first value to close the loop

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, polar=True)
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=f"Cluster {cluster}")
    ax.fill(angles, values, alpha=0.25)
    plt.xticks(angles[:-1], categories, color='grey', size=8)
    plt.title(f"Cluster {cluster} Characteristics", size=12, y=1.1)
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preparation: ten highest and ten lowest suburbs by affordability index.
top_affordable = data.nlargest(10, 'Affordability Index')[['Name', 'Affordability Index']]
least_affordable = data.nsmallest(10, 'Affordability Index')[['Name', 'Affordability Index']]

# Plotting
# sharey=False: the two panels list different suburbs on their y-axes, so a
# shared categorical axis would label one panel's bars with names that do not
# belong to them. Each panel keeps its own y tick labels.
fig, ax = plt.subplots(1, 2, figsize=(16, 6), sharey=False)

sns.barplot(x='Affordability Index', y='Name', data=top_affordable, ax=ax[0], palette='viridis')
ax[0].set_title('Top 10 Most Affordable Suburbs')
ax[0].set_xlabel('Affordability Index')
ax[0].set_ylabel('Suburb')

sns.barplot(x='Affordability Index', y='Name', data=least_affordable, ax=ax[1], palette='magma')
ax[1].set_title('Top 10 Least Affordable Suburbs')
ax[1].set_xlabel('Affordability Index')
ax[1].set_ylabel('')

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between five selected columns.
driver_columns = [
    'Median House Price (2021)',
    'Median House Rent (per week)',
    'Safety',
    'Family-Friendliness',
    'Affordability Index',
]
correlation_matrix = data[driver_columns].corr()

# Annotated heatmap of that matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap of Affordability Drivers")
plt.show()


In [None]:
# Feature importance data: the values below are hard-coded in this cell, not
# computed from `data` or from the fitted model.
_importance_pairs = [
    ('Median House Price (2021)', -0.8),
    ('Median House Rent (per week)', -0.7),
    ('Safety', 0.3),
    ('Family-Friendliness', 0.2),
    ('Proximity to CBD', -0.1),
]
feature_importance = {
    'Feature': [feature_name for feature_name, _ in _importance_pairs],
    'Importance': [score for _, score in _importance_pairs],
}
importance_df = pd.DataFrame(feature_importance)

# Horizontal bars with a dashed reference line at zero.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', palette='Blues_r', ax=ax)
ax.set_title('Feature Importance in Affordability Analysis')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Feature')
ax.axvline(0, color='gray', linestyle='--')
plt.show()


In [None]:
# Mean affordability index per region, sorted ascending.
regional_data = data.groupby('Region')['Affordability Index'].mean().sort_values()

# Horizontal bar chart: regions on the y-axis, mean index on the x-axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=regional_data.values, y=regional_data.index, palette='Spectral', ax=ax)
ax.set(
    title='Average Affordability Index by Region',
    xlabel='Average Affordability Index',
    ylabel='Region',
)
plt.show()


In [None]:
# Affordability index against public-transport travel time to the CBD.
# The original x column, 'Proximity to CBD (Public Transport)', appears
# nowhere else in the notebook; every other cell reads the travel time from
# 'Time to CBD (Public Transport) [Town Hall St]', so that column is used here.
# NOTE(review): confirm the dataset has no separate "Proximity" column.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Time to CBD (Public Transport) [Town Hall St]', y='Affordability Index', data=data, hue='Region', palette='viridis', s=100)
plt.title('Affordability Index vs. Proximity to CBD (Public Transport)')
plt.xlabel('Proximity to CBD (mins)')
plt.ylabel('Affordability Index')
# Dashed reference line at the overall mean affordability index.
plt.axhline(data['Affordability Index'].mean(), color='red', linestyle='--', label='Average Affordability Index')
plt.legend(title='Region')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Select relevant columns for clustering; rows with missing values are dropped.
features = ['Affordability Index', 'Safety', 'Family-Friendliness', 'Median House Price (2021)']
clustering_data = data[features].dropna()

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(clustering_data)

# Perform K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Add cluster labels to a copy of the dataset.
# `clusters` has one entry per row that survived dropna(), so the copy is
# restricted to exactly those rows. Assigning the labels to a full copy of
# `data` raises a length-mismatch error as soon as any row was dropped.
clustered_data = data.loc[clustering_data.index].copy()
clustered_data['Cluster'] = clusters

# Filter data for highlighting Cluster 1
# NOTE(review): K-Means label numbers are arbitrary; confirm label 1 is the
# group the legend text below describes.
cluster_1_data = clustered_data[clustered_data['Cluster'] == 1]

# Scatterplot: all clusters, with cluster 1 drawn again on top in red.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Safety', y='Affordability Index', data=clustered_data, hue='Cluster', palette='Set2', s=100, alpha=0.7)
sns.scatterplot(x='Safety', y='Affordability Index', data=cluster_1_data, color='red', s=150, label='Cluster 1 (Affordable & Family-Friendly)', edgecolor='black')

# Customize plot: dashed lines mark the mean of each axis variable.
plt.title('Safety vs. Affordability Index with Highlighted Cluster 1 Suburbs')
plt.xlabel('Safety')
plt.ylabel('Affordability Index')
plt.axhline(clustered_data['Affordability Index'].mean(), color='gray', linestyle='--', label='Average Affordability Index')
plt.axvline(clustered_data['Safety'].mean(), color='gray', linestyle='--', label='Average Safety')
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(alpha=0.3)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows of `clustered_data` carrying cluster label 3.
cluster_3_data = clustered_data.loc[clustered_data['Cluster'] == 3]

# Scatter of every cluster, with the cluster-3 rows drawn again on top in orange.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=clustered_data, x='Safety', y='Affordability Index', hue='Cluster', palette='Set2', s=100, alpha=0.7, ax=ax)
sns.scatterplot(data=cluster_3_data, x='Safety', y='Affordability Index', color='orange', s=150, label='Cluster 3 (Emerging Hotspots)', edgecolor='black', ax=ax)

# Titles, mean reference lines, legend outside the axes, light grid.
ax.set_title('Safety vs. Affordability Index with Highlighted Cluster 3 Suburbs')
ax.set_xlabel('Safety')
ax.set_ylabel('Affordability Index')
mean_affordability = clustered_data['Affordability Index'].mean()
mean_safety = clustered_data['Safety'].mean()
ax.axhline(mean_affordability, color='gray', linestyle='--', label='Average Affordability Index')
ax.axvline(mean_safety, color='gray', linestyle='--', label='Average Safety')
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.3)

plt.show()


In [None]:
import matplotlib.pyplot as plt
from math import pi

# Radar chart of mean metrics for one cluster.
# K-Means was fitted with n_clusters=4, which yields labels 0..3. The original
# filter `Cluster == 4` therefore matched no rows and the chart was drawn from
# all-NaN means. Labels 1, 2 and 3 are profiled in other cells, leaving 0.
# NOTE(review): confirm label 0 is the group the write-up calls "Cluster 4".
cluster_label = 0
cluster_4_data = clustered_data[clustered_data['Cluster'] == cluster_label]

# Calculate the mean values for relevant metrics in the selected cluster.
cluster_4_means = cluster_4_data[['Affordability Index', 'Safety', 'Family-Friendliness', 'Median House Price (2021)']].mean()

# Preparing data for radar chart
categories = list(cluster_4_means.index)
num_vars = len(categories)

# One evenly spaced angle per metric; the first is repeated to close the polygon.
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles += angles[:1]

# Values for radar chart
values = cluster_4_means.values.flatten().tolist()
values += values[:1]  # Close the circle

# Create radar chart
plt.figure(figsize=(8, 8))
ax = plt.subplot(111, polar=True)

# Plot the data
ax.plot(angles, values, linewidth=2, linestyle='solid', label=f'Cluster {cluster_label}')
ax.fill(angles, values, alpha=0.25)

# Add category labels
plt.xticks(angles[:-1], categories, color='gray', size=12)

# Add title and legend; both show the label actually plotted.
plt.title(f'Trade-offs in Affordability and Safety for Cluster {cluster_label}', size=15, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Display the chart
plt.show()


In [None]:
import matplotlib.pyplot as plt
from math import pi

# Rows of `clustered_data` carrying cluster label 2, and the mean of each metric.
metric_columns = ['Affordability Index', 'Safety', 'Family-Friendliness', 'Median House Price (2021)']
cluster_2_data = clustered_data.loc[clustered_data['Cluster'] == 2]
cluster_2_means = cluster_2_data[metric_columns].mean()

# One spoke per metric.
categories = cluster_2_means.index.tolist()
num_vars = len(categories)

# Evenly spaced spoke angles; the first is repeated so the outline closes.
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles.append(angles[0])

# Metric means in spoke order, closed the same way.
values = cluster_2_means.tolist()
values.append(values[0])

# Polar axes for the radar chart.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, polar=True)

# Outline and translucent blue fill.
ax.plot(angles, values, linewidth=2, linestyle='solid', label='Cluster 2')
ax.fill(angles, values, alpha=0.25, color='blue')

# Metric names at each spoke.
plt.xticks(angles[:-1], categories, color='gray', size=12)

# Title and legend.
plt.title('Trade-offs in Affordability and Safety for Cluster 2 (Luxury Suburbs)', size=15, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

plt.show()


In [None]:
# Boxplot of the 2021 median house price per region, with a light grid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, x='Region', y='Median House Price (2021)', palette='coolwarm', ax=ax)
ax.set(
    title='Distribution of Median House Prices by Region',
    xlabel='Region',
    ylabel='Median House Price (2021)',
)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Median house price against weekly house rent, coloured by region, with
# dashed lines marking the mean of each variable.
mean_rent = data['Median House Rent (per week)'].mean()
mean_price = data['Median House Price (2021)'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='Median House Price (2021)', y='Median House Rent (per week)', hue='Region', palette='viridis', s=100, ax=ax)
ax.set_title('Median House Price vs. Median Rent Across Suburbs')
ax.set_xlabel('Median House Price (2021)')
ax.set_ylabel('Median House Rent (per week)')
ax.axhline(mean_rent, color='red', linestyle='--', label='Average Rent')
ax.axvline(mean_price, color='blue', linestyle='--', label='Average House Price')
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Hand-entered priority ranks per named cluster (not derived from `data`),
# indexed by the cluster name.
policy_priorities = pd.DataFrame(
    {
        'Safety': [2, 1, 3, 4],
        'Infrastructure': [1, 2, 3, 4],
        'Affordability': [3, 4, 2, 1],
    },
    index=pd.Index(
        ['Affordable & Family-Friendly', 'Luxury Suburbs', 'Emerging Hotspots', 'Affordable but Less Safe'],
        name='Cluster',
    ),
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(policy_priorities, annot=True, cmap='coolwarm', linewidths=0.5, fmt='d', ax=ax)
ax.set(
    title='Policy and Development Priorities by Cluster',
    xlabel='Priority Area',
    ylabel='Cluster',
)
plt.show()


In [None]:
# Year-over-year trend: mean of the 2020 and 2021 median house price per
# region, transposed so the two years become the rows (x-axis) and each region
# becomes a line.
affordability_trends = data.groupby('Region')[['Median House Price (2020)', 'Median House Price (2021)']].mean().T
# Rows are in the order the columns were selected above: 2020, then 2021.
affordability_trends.index = ['2020', '2021']

affordability_trends.plot(kind='line', figsize=(10, 6), marker='o')
# The plotted quantity is the median house price, not the affordability index,
# so the title names it to match the y-axis label.
plt.title('Median House Price Trends Over Time by Region')
plt.xlabel('Year')
plt.ylabel('Median House Price')
plt.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(alpha=0.3)
plt.show()


In [None]:
# Pivot data for heatmap: one row per region, one column holding the mean
# 2021 median house price.
heatmap_data = data.pivot_table(index='Region', values='Median House Price (2021)', aggfunc='mean')

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt='.0f', cmap='coolwarm', linewidths=0.5)
plt.title('Average Median House Prices by Region')
# Regions run along the y-axis (the pivot index); the x-axis carries only the
# single value column, whose name already appears as its tick label.
plt.xlabel('')
plt.ylabel('Region')
plt.show()


In [None]:
# Four columns to compare pairwise.
pairplot_fields = [
    'Affordability Index',
    'Safety',
    'Family-Friendliness',
    'Median House Price (2021)',
]
pairplot_data = data[pairplot_fields]

# Lower-triangle pair plot with KDE curves on the diagonal.
sns.pairplot(pairplot_data, diag_kind='kde', corner=True)
plt.suptitle('Pairwise Relationships Between Key Variables', y=1.02)
plt.show()


In [None]:
# 'Family-Friendliness' against 'Affordability Index' for the clustered
# suburbs, one colour per cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=clustered_data, x='Family-Friendliness', y='Affordability Index', hue='Cluster', palette='Set2', s=100, alpha=0.8, ax=ax)
ax.set(
    title='Cluster Distribution: Family-Friendliness vs. Affordability Index',
    xlabel='Family-Friendliness',
    ylabel='Affordability Index',
)
ax.grid(alpha=0.3)
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Sample policy-priority ranks for placeholder regions (hand-entered, not
# derived from `data`), indexed by region name.
policy_priorities = pd.DataFrame(
    {
        'Safety Improvements': [3, 2, 4, 1],
        'Infrastructure Development': [2, 4, 3, 1],
        'Affordability Interventions': [1, 3, 2, 4],
    },
    index=pd.Index(['Region A', 'Region B', 'Region C', 'Region D'], name='Region'),
)

# Annotated heatmap of the ranks.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(policy_priorities, annot=True, cmap='coolwarm', linewidths=0.5, fmt='d', ax=ax)
ax.set_title('Regional Policy Needs Heatmap')
ax.set_xlabel('Policy Priority Areas')
ax.set_ylabel('Region')
plt.show()


In [None]:
# Bubble chart: affordability against safety, marker size taken from the
# 'Population (rounded)*' column and colour from 'Region'.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=data,
    x='Affordability Index',
    y='Safety',
    hue='Region',
    size='Population (rounded)*',
    sizes=(50, 1000),
    alpha=0.6,
    palette='cool',
    ax=ax,
)
ax.set_title('Affordability vs. Safety by Suburb Population')
ax.set_xlabel('Affordability Index')
ax.set_ylabel('Safety')
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Final cell: print a closing message once the notebook has run to the end.
print('Case Study Complete: Please Check  out the linkedin article. ')