In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# from config.config import CLEANED_DATA_CSV

# --- Setup: plotting theme, output folder, data load, column typing ---
# Apply seaborn's "whitegrid" theme to every figure created below.
sns.set(style="whitegrid")

# Create visualization directory (no error if it already exists)
os.makedirs('visualizations', exist_ok=True)

df = pd.read_csv("data/cleaned_data.csv")

# Display basic info
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nSample Data:\n", df.head())

# Separate numerical and categorical columns by dtype
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Identifier-like columns are numeric by dtype but are not features, so they
# must not be plotted or correlated:
#   - 'Item_id'    : listing identifier
#   - 'Unnamed: 0' : the name pandas assigns to an unnamed leading CSV column,
#                    which here looks like a row index saved alongside the data
identifier_cols = {'Item_id', 'Unnamed: 0'}
numerical_cols = [col for col in numerical_cols if col not in identifier_cols]

print("\nNumerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

# Drop rows with missing values for visualization purposes.
# .copy() makes df_clean independent so the encoded columns added below
# do not touch df.
df_clean = df.dropna().copy()

# Encode categorical columns for grouping (optional for visualization).
# Each fitted encoder is kept in label_encoders, keyed by the source column,
# so the integer codes in '<col>_encoded' can be mapped back to labels.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col + '_encoded'] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# 1. Histograms for numerical columns
# One histogram (with KDE overlay) per numerical column, arranged in a grid
# three columns wide and saved as a single image. Nothing is written when
# there are no numerical columns, instead of saving a blank figure.
if numerical_cols:
    n_hist_rows = (len(numerical_cols) + 2) // 3  # ceil(len / 3)
    fig, axes = plt.subplots(n_hist_rows, 3, figsize=(15, 10), squeeze=False)
    hist_axes = axes.ravel()
    for ax, col in zip(hist_axes, numerical_cols):
        sns.histplot(df_clean[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
    # Hide the unused trailing slots of the last grid row.
    for ax in hist_axes[len(numerical_cols):]:
        ax.set_visible(False)
    # Lay the grid out once, after every subplot exists, rather than
    # recomputing the layout on each loop iteration.
    fig.tight_layout()
    fig.savefig(os.path.join('visualizations', 'histograms.png'))
    plt.close(fig)

# 2. Boxplots: one figure per numerical column, split by the first
#    categorical column. Skipped entirely when there is no categorical column.
if categorical_cols:
    group_col = categorical_cols[0]
    for col in numerical_cols:
        boxplot_path = os.path.join(
            'visualizations', f'boxplot_{col}_by_{group_col}.png'
        )
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=df_clean, x=group_col, y=col)
        plt.title(f'{col} by {group_col}')
        # Slant the group labels so long category names stay legible.
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(boxplot_path)
        plt.close()

# 3. Correlation Heatmap
# Pairwise correlation of the numerical columns, annotated to two decimals.
# A correlation matrix needs at least two columns to be meaningful.
if len(numerical_cols) >= 2:
    corr = df_clean[numerical_cols].corr()
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
    heatmap_ax.set_title('Correlation Heatmap')
    heatmap_fig.tight_layout()
    heatmap_fig.savefig(os.path.join('visualizations', 'correlation_heatmap.png'))
    plt.close(heatmap_fig)

# 4. Pairplot (sample if too large)
# Pairplots scale poorly with row count, so at most 1000 rows are drawn.
# The sample is seeded so the saved figure is identical from run to run.
sample_size = min(1000, len(df_clean))
if len(df_clean) > sample_size:
    pairplot_data = df_clean.sample(sample_size, random_state=42)
else:
    pairplot_data = df_clean
if len(numerical_cols) >= 2:
    sns.pairplot(pairplot_data[numerical_cols])
    # y=1.02 places the title just above the figure canvas ...
    plt.suptitle('Pairplot of Numerical Features', y=1.02)
    # ... so the saved bounding box must be expanded to include it;
    # the default bounding box would clip the title off the image.
    plt.savefig(os.path.join('visualizations', 'pairplot.png'), bbox_inches='tight')
    plt.close()

# 5. Count plots for categorical columns
# One horizontal count plot per categorical column, most frequent value first.
# High-cardinality columns (e.g. free-text or list-like values) would produce
# thousands of bars in a fixed-size figure, so only the most frequent values
# are drawn; the title states when a column has been truncated.
max_categories = 20
for col in categorical_cols:
    value_counts = df_clean[col].value_counts()
    top_values = value_counts.index[:max_categories]
    count_title = f'Count Plot of {col}'
    if len(value_counts) > max_categories:
        count_title += f' (top {max_categories} of {len(value_counts)})'
    # Restrict the data itself, not just the bar order, so no bars are drawn
    # for values outside the top set.
    top_rows = df_clean[df_clean[col].isin(top_values)]

    plt.figure(figsize=(10, 6))
    sns.countplot(y=col, data=top_rows, order=top_values)
    plt.title(count_title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join('visualizations', f'countplot_{col}.png'))
    plt.close()

# 6. Additional Insightful Plot: Average Amount by Category
# Horizontal bar chart of the mean 'Amount' per 'Category', sorted ascending.
# Only drawn when both columns exist with the expected dtypes.
has_category = 'Category' in categorical_cols
has_amount = 'Amount' in numerical_cols
if has_category and has_amount:
    avg_price_by_category = (
        df_clean.groupby('Category')['Amount']
        .mean()
        .sort_values()
    )
    avg_fig, avg_ax = plt.subplots(figsize=(12, 6))
    avg_price_by_category.plot.barh(ax=avg_ax, color='skyblue')
    avg_ax.set_title('Average Amount by Property Category')
    avg_ax.set_xlabel('Average Amount (AZN)')
    avg_ax.set_ylabel('Category')
    avg_fig.tight_layout()
    avg_fig.savefig(os.path.join('visualizations', 'avg_amount_by_category.png'))
    plt.close(avg_fig)

print("Visualizations saved to 'visualizations' folder.")

Shape: (7359, 14)

Columns: ['Unnamed: 0', 'Item_id', 'Amount', 'Currency', 'Location', 'Category', 'Area', 'Number_of_rooms', 'Frontage', 'Mortgage', 'Condition', 'Land_area', 'Current_floor', 'Total_floors']

Sample Data:
    Unnamed: 0  Item_id  Amount Currency  \
0           0  5165411  408000      AZN   
1           1  5150136  234000      AZN   
2           2  5117823  329000      AZN   
3           3  5193713  143000      AZN   
4           4  5193372     500      AZN   

                                            Location     Category    Area  \
0  ['Grand Hayat Residence', '8 Noyabr m.', 'Yasa...  Yeni tikili  178 m²   
1  ['Gənclik m.', 'Respublika stadionu', 'Nəriman...  Yeni tikili   65 m²   
2                        ['Xəzər r.', 'Mərdəkan q.']       Torpaq       0   
3  ['Ukrayna dairəsi', 'Əhmədli m.', 'Həzi Aslano...  Yeni tikili   75 m²   
4                     ['20 Yanvar m.', 'Yasamal r.']  Yeni tikili   35 m²   

   Number_of_rooms  Frontage  Mortgage  Condition Lan

  plt.tight_layout()


Visualizations saved to 'visualizations' folder.
