## Run this to save the notebook to DagsHub 👇

In [1]:
# Install the DagsHub Python client (-q keeps pip's output quiet)
!pip install -q dagshub


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.3/236.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:0

In [None]:
# Helper from the DagsHub client used to save this notebook to DagsHub.
from dagshub.notebook import save_notebook

# Save the notebook to the given DagsHub repository, under the given path.
# NOTE(review): presumably this needs DagsHub credentials/authorization for
# the target repo at call time — confirm against the dagshub client docs.
save_notebook(repo="Omdena/ParisFranceChapter_HousingAffordabilityAnalysis", path="src/tasks/task-2a-data-analysis")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the dataset
# NOTE(review): the path is absolute at the filesystem root — presumably where
# the workbook was uploaded in the hosted notebook; confirm before running elsewhere.
file_path = '/Population in Municipal Arrondissements.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Check the shape of the dataset
print(f"\nDataset contains {data.shape[0]} rows and {data.shape[1]} columns.")

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nBasic information about the dataset:")
data.info()

# Check for missing values (count of nulls per column)
missing_values = data.isnull().sum()
print(f"\nMissing values in each column:\n{missing_values}")

# Summary statistics
print("\nSummary statistics:")
print(data.describe())

# Impute missing values in the numerical columns only: each gap is replaced
# by the mean of its own column. Non-numerical columns are left untouched.
data_numeric = data.select_dtypes(include=[np.number])
data_numeric = data_numeric.fillna(data_numeric.mean())

# Write every imputed numerical column back into the original frame
for name, filled in data_numeric.items():
    data[name] = filled

# Report the per-column null counts that remain after imputation
remaining_missing = data.isnull().sum()
print(f"\nMissing values after handling:\n{remaining_missing}")

# Outlier detection on the numerical columns via z-scores: a row is flagged
# when the absolute z-score of at least one of its numerical values exceeds 3.
z_scores = np.abs(stats.zscore(data_numeric))
exceeds_threshold = z_scores > 3
outliers = exceeds_threshold.any(axis=1)

# Keep only the rows that were not flagged
keep_rows = ~outliers
data_cleaned = data[keep_rows]
print(f"\nNumber of rows removed due to outliers: {outliers.sum()}")

# Summary statistics recomputed on the imputed, outlier-free data
print("\nUpdated summary statistics after handling outliers:")
print(data_cleaned.describe())

# Numerical columns of the cleaned data, selected once and shared by all plots below
numeric_cleaned = data_cleaned.select_dtypes(include=[np.number])

# Histograms for numerical features (30 bins per column)
print("\nHistograms for numerical features:")
numeric_cleaned.hist(bins=30, figsize=(15, 10))
plt.show()

# Box plots: one separate figure per numerical column
for column, values in numeric_cleaned.items():
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=values)
    plt.title(f'Box plot of {column}')
    plt.show()

# Correlation heatmap of the numerical columns, annotated with the coefficients
plt.figure(figsize=(12, 8))
correlation_matrix = numeric_cleaned.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Persist the cleaned table as CSV; index=False keeps the row index out of the file
cleaned_file_path = '/cleaned_population_data.csv'
data_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)

# Confirm where the file was written
print(f"\nCleaned data saved to {cleaned_file_path}")
