In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:


# Read the raw CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell works on.
df = pd.read_csv('../data/data.csv')

# Preview the first 5 rows as plain text.
print(df.head(5))


🧱 Understand the Structure
Check the number of rows, the column names, and the data types:

In [None]:
# Shape of the dataset as a (rows, columns) tuple
print("Number of rows and columns:", df.shape)

# Column names and data types
print("\nData types:")
print(df.dtypes)

# More detailed info (non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("\nFull info:")
df.info()

In [None]:
# Summary of all columns (not just numerical).
# A notebook cell only auto-displays its final expression, so this first
# summary has to be printed explicitly or it is silently discarded.
print(df.describe(include='all'))

# Summary statistics for numerical columns (shown as the cell's output)
df.describe()


Distribution of Numerical Features

In [None]:
# Use seaborn's white-grid look for every figure drawn from here on.
sns.set_theme(style="whitegrid")

# Collect the names of all int64 / float64 columns.
num_features = list(df.select_dtypes(include=['int64', 'float64']).columns)

print("Numerical features:", num_features)

# One histogram (100 bins, with a KDE overlay) per numerical column.
for col in num_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], bins=100, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# One horizontal boxplot per numerical column; points beyond the whiskers
# give a quick visual read on potential outliers.
for col in num_features:
    fig, ax = plt.subplots(figsize=(8, 2))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

Plot With Log Transform
Apply log transform to reduce skew and better visualize data spread:

In [None]:
# Signed log transform: sign(x) * log1p(|x|).
# Plain np.log1p(x) is only defined for x > -1 (it returns -inf at -1 and NaN
# below that), so it does NOT handle negative values on its own. The signed
# form gives exactly log1p(x) for x >= 0 and stays finite for negative inputs,
# which are mirrored onto the negative axis. Missing values remain NaN.
for source_col in ['Amount', 'Value']:
    values = df[source_col]
    df[f'log_{source_col}'] = np.sign(values) * np.log1p(values.abs())

# Plot log-transformed distributions
for col in ['log_Amount', 'log_Value']:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[col], bins=100, kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

Distribution of Categorical Features

In [None]:
# Get object (categorical) columns
cat_features = df.select_dtypes(include='object').columns.tolist()

print("Categorical features:", cat_features)

for col in cat_features:
    plt.figure(figsize=(10, 4))
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.tight_layout()
    plt.show()

Correlation Analysis (Numerical Features)

In [None]:
# Re-collect the int64 / float64 column names from the frame as it is now,
# so any numeric columns added to df since the list was first built are included.
num_features = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Pairwise correlation between the numerical columns (DataFrame.corr default method).
corr_matrix = df[num_features].corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Matrix (Numerical Features)")
fig.tight_layout()
plt.show()

Identifying Missing Values

In [None]:
# Number of nulls per column, restricted to columns that actually have
# missing data and ordered from most to fewest.
missing_counts = (
    df.isnull()
    .sum()
    .loc[lambda totals: totals > 0]
    .sort_values(ascending=False)
)

# Report the result
print("Missing Value Counts:")
print(missing_counts)


In [None]:
# Share of nulls per column as a percentage (mean of the boolean mask x 100),
# restricted to affected columns and ordered from highest to lowest.
missing_percentage = (
    df.isnull()
    .mean()
    .mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

# Report the result
print("\nMissing Value Percentage:")
print(missing_percentage)


Outlier Detection using IQR

In [None]:
def detect_outliers_iqr(column, data=None):
    """Count and report the IQR-rule outliers of one column.

    A value is an outlier when it lies strictly below Q1 - 1.5 * IQR or
    strictly above Q3 + 1.5 * IQR, where Q1 / Q3 are the column's 25th and
    75th percentiles and IQR = Q3 - Q1. Missing values are never counted.

    Parameters
    ----------
    column : str
        Name of the column to analyse.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``df``, so
        existing ``detect_outliers_iqr(col)`` calls behave as before.

    Returns
    -------
    int
        Number of outlying values; the same figure is printed along with
        the bounds.
    """
    frame = df if data is None else data
    values = frame[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # NaN compares False on both sides, so missing values fall out of the mask.
    outlier_mask = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(outlier_mask.sum())

    print(f"{column}: {outlier_count} outliers (outside {lower_bound:.2f} to {upper_bound:.2f})")
    return outlier_count

# Report IQR outliers for each column currently listed in num_features
for feature in num_features:
    detect_outliers_iqr(feature)
