In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from config.config import CLEANED_CSV_FILE

# Set style
sns.set(style="whitegrid")

# Create output directory
os.makedirs('visualizations', exist_ok=True)

# Load data
df = pd.read_csv(CLEANED_CSV_FILE)
print("Data loaded successfully.\n")

# Display basic info
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nSample Data:\n", df.head())

# Separate numerical and categorical columns
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

print("\nNumerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

# Set up color palette
palette = sns.color_palette("viridis")

# Histograms for numerical columns
for col in numerical_cols:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True, bins=30, color=palette[0])
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(os.path.join('visualizations', f'hist_{col}.png'))
    plt.close()

# Boxplots for numerical columns grouped by categorical variables
if categorical_cols:
    for num_col in numerical_cols:
        for cat_col in categorical_cols:
            try:
                plt.figure(figsize=(12, 6))
                sns.boxplot(x=cat_col, y=num_col, data=df, palette=palette)
                plt.title(f'{num_col} by {cat_col}')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig(os.path.join('visualizations', f'box_{num_col}_by_{cat_col}.png'))
                plt.close()
            except Exception as e:
                print(f"Skipping boxplot for {num_col} by {cat_col}: {str(e)}")

# Correlation heatmap
plt.figure(figsize=(10, 8))
corr = df[numerical_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features')
plt.tight_layout()
plt.savefig(os.path.join('visualizations', 'heatmap_correlation.png'))
plt.close()

# Pairplot of numerical features (sample if large)
if len(df) > 1000:
    sample_df = df[numerical_cols].sample(n=1000, random_state=42)
else:
    sample_df = df[numerical_cols]

sns.pairplot(sample_df, height=1.5)
plt.suptitle('Pairplot of Numerical Features (Sample)', y=1.02)
plt.savefig(os.path.join('visualizations', 'pairplot_numerical.png'))
plt.close()

# Count plots for categorical columns
for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    top_categories = df[col].value_counts().nlargest(10).index
    subset_df = df[df[col].isin(top_categories)]
    sns.countplot(data=subset_df, x=col, order=top_categories, palette=palette)
    plt.title(f'Count of Top Categories in {col}')
    plt.xticks(rotation=45)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join('visualizations', f'countplot_{col}.png'))
    plt.close()

# Additional Insightful Plot: Price vs Year
if 'year' in numerical_cols and 'amount' in numerical_cols:
    plt.figure(figsize=(12, 6))
    sns.lineplot(x='year', y='amount', data=df, estimator='mean', ci=None, marker='o', color=palette[3])
    plt.title('Average Car Price Over Years')
    plt.xlabel('Year')
    plt.ylabel('Average Price (amount)')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join('visualizations', 'line_price_over_years.png'))
    plt.close()

print("\nAll visualizations have been saved to the 'visualizations' folder.")

Data loaded successfully.

Shape: (2906, 12)

Columns: ['Unnamed: 0', 'amount', 'city', 'marka', 'model', 'year', 'type', 'color', 'distance', 'motor_volume', 'horse_power', 'motor_type']

Sample Data:
    Unnamed: 0   amount   city      marka                   model  year  \
0           0  17500.0   Bakı    Hyundai                  Sonata  2010   
1           1  27300.0   Bakı     Toyota                 Corolla  2021   
2           2  32123.0   Bakı     Toyota                 Corolla  2024   
3           3  17500.0  Qazax  Chevrolet                    Trax  2018   
4           4  24900.0   Bakı       Ford  Fusion (North America)  2016   

                      type   color  distance  motor_volume  horse_power  \
0                    Sedan  Gümüşü    222000           2.4          178   
1                    Sedan    Qara     73000           1.6          122   
2                    Sedan      Ağ         0           1.8           98   
3  Offroader / SUV, 5 qapı    Qara    205000        


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=cat_col, y=num_col, data=df, palette=palette)
The palette list has fewer values (6) than needed (55) and will cycle, which may produce an uninterpretable plot.
  sns.boxplot(x=cat_col, y=num_col, data=df, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=cat_col, y=num_col, data=df, palette=palette)
The palette list has fewer values (6) than needed (96) and will cycle, which may produce an uninterpretable plot.
  sns.boxplot(x=cat_col, y=num_col, data=df, palette=palette)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=cat_col


All visualizations have been saved to the 'visualizations' folder.
