In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the two Excel workbooks: the raw dataset and its variable definitions.
raw_data_path = "../data/raw/data.xlsx"
definitions_path = "../data/raw/Xente_Variable_Definitions.xlsx"
data, variable_definitions = (
    pd.read_excel(raw_data_path),
    pd.read_excel(definitions_path),
)

In [None]:
# Dataset overview: column dtypes / non-null counts, then the first rows.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Dataset Overview:\n")
data.info()
print("\nFirst Few Rows of the Dataset:\n")
print(data.head())


In [None]:
# Descriptive statistics across every column (include='all' covers
# non-numeric columns as well as numeric ones).
print("\nSummary Statistics:\n")
summary_stats = data.describe(include='all')
print(summary_stats)

# Heatmap of the boolean null mask: one cell per (row, column) entry,
# coloured by whether that entry is missing.
missing_mask = data.isnull()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()


In [None]:
# One histogram (30 bins, KDE overlay) per int64/float64 column.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_features:
    # Draw on an explicit Axes rather than relying on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# One horizontal count plot per object-dtype column.
categorical_features = data.select_dtypes(include=['object']).columns

for col in categorical_features:
    # Bars follow the order returned by value_counts() for this column.
    category_order = data[col].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(y=data[col], order=category_order, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel("Count")
    ax.set_ylabel(col)
    plt.show()

# Pairwise correlation of the numerical columns, rendered as an annotated
# heatmap with coefficients shown to two decimal places.
correlation_matrix = data[numerical_features].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Persist the DataFrame as CSV for further processing.
# NOTE(review): no cleaning step is applied to `data` in the cells visible
# here, so the file written appears to be the data as loaded — confirm that
# the "cleaned" name is intended.
data_cleaned_path = "../data/processed/cleaned_data.csv"

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so the cell also runs on a fresh checkout.
Path(data_cleaned_path).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(data_cleaned_path, index=False)

print(f"Cleaned data saved to {data_cleaned_path}")
