In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('credit_card_data.csv')

# --- 1. Data Cleaning and Initial Inspection ---

# Filter out rows with mostly missing data (where SK_ID_CURR is NaN)
df_cleaned = df.dropna(subset=['SK_ID_CURR'])

# Convert appropriate float columns to int
int_cols = ['SK_ID_CURR', 'CNT_CHILDREN', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
            'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
            'HOUR_APPR_PROCESS_START']
for col in int_cols:
    if col in df_cleaned.columns:
        # Fill NaN with 0 before conversion to int.
        # SK_ID_CURR is a float because of the trailing NaNs in the original file
        df_cleaned[col] = df_cleaned[col].fillna(0).astype(int)

# Calculate missing value percentages on the cleaned data
missing_pct = df_cleaned.isnull().sum() * 100 / len(df_cleaned)
missing_df = pd.DataFrame({'Missing Percentage': missing_pct[missing_pct > 0].sort_values(ascending=False)})

print("Shape of the cleaned data:", df_cleaned.shape)
print("\nPercentage of Missing Values (only showing columns with missing data):")
print(missing_df)

# Descriptive statistics for numerical columns
print("\nDescriptive Statistics for Numerical Columns:")
print(df_cleaned.describe())

# --- 2. Value Counts for Categorical Columns ---
categorical_cols = df_cleaned.select_dtypes(include='object').columns.tolist()
print("\nValue Counts for Categorical Columns:")
for col in categorical_cols:
    print(f"\n--- {col} ---")
    print(df_cleaned[col].value_counts(dropna=False))


# --- 3. Visualization ---
sns.set_style("whitegrid")
df_cleaned['AGE_YEARS'] = np.abs(df_cleaned['DAYS_BIRTH'] / 365.25)

# 3.1. Distribution of AMT_CREDIT and AMT_INCOME_TOTAL (Numerical)
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
sns.histplot(df_cleaned['AMT_CREDIT'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Loan Amount (AMT_CREDIT)')
plt.xlabel('Loan Amount')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
sns.histplot(df_cleaned['AMT_INCOME_TOTAL'], bins=30, kde=True, color='salmon')
plt.title('Distribution of Total Income (AMT_INCOME_TOTAL)')
plt.xlabel('Total Income')
plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig('numerical_distributions.png')
plt.close()

# 3.2. Distribution of Age (Derived from DAYS_BIRTH)
plt.figure(figsize=(7, 5))
sns.histplot(df_cleaned['AGE_YEARS'], bins=20, kde=True, color='darkgreen')
plt.title('Distribution of Client Age')
plt.xlabel('Age (Years)')
plt.ylabel('Frequency')
plt.savefig('age_distribution.png')
plt.close()

# 3.3. Bar charts for Categorical Variables
plt.figure(figsize=(18, 5))

# CODE_GENDER
plt.subplot(1, 3, 1)
df_cleaned['CODE_GENDER'].value_counts().plot(kind='bar', rot=0, color=['teal', 'tomato', 'gold'])
plt.title('Client Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')

# NAME_CONTRACT_TYPE
plt.subplot(1, 3, 2)
df_cleaned['NAME_CONTRACT_TYPE'].value_counts().plot(kind='bar', rot=0, color=['purple', 'orange'])
plt.title('Contract Type Distribution')
plt.xlabel('Contract Type')
plt.ylabel('Count')

# FLAG_OWN_CAR
plt.subplot(1, 3, 3)
df_cleaned['FLAG_OWN_CAR'].value_counts().plot(kind='bar', rot=0, color=['blue', 'red'])
plt.title('Car Ownership Distribution')
plt.xlabel('Own Car (Y/N)')
plt.ylabel('Count')

plt.tight_layout()
plt.savefig('categorical_distributions_1.png')
plt.close()

# 3.4. OCCUPATION_TYPE (Top 10 + Missing)
plt.figure(figsize=(10, 6))
# Using fillna('Missing') to include missing values in the plot
(df_cleaned['OCCUPATION_TYPE'].fillna('Missing').value_counts().nlargest(10)).sort_values(ascending=True).plot(kind='barh')
plt.title('Top 10 Occupation Types (Including Missing)')
plt.xlabel('Count')
plt.ylabel('Occupation Type')
plt.tight_layout()
plt.savefig('occupation_type_distribution.png')
plt.close()

# Save cleaned data to CSV for the user to download
df_cleaned.to_csv('credit_card_data_cleaned.csv', index=False)

  df = pd.read_csv('credit_card_data.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].fillna(0).astype

Shape of the cleaned data: (5000, 28)

Percentage of Missing Values (only showing columns with missing data):
                 Missing Percentage
OWN_CAR_AGE                   66.16
OCCUPATION_TYPE               30.90
AMT_GOODS_PRICE                0.02

Descriptive Statistics for Numerical Columns:
          SK_ID_CURR  CNT_CHILDREN  AMT_INCOME_TOTAL    AMT_CREDIT  \
count    5000.000000   5000.000000      5.000000e+03  5.000000e+03   
mean   102920.935000      0.406200      1.680364e+05  5.976345e+05   
std      1688.918602      0.710002      9.777104e+04  4.010417e+05   
min    100002.000000      0.000000      2.565000e+04  4.500000e+04   
25%    101475.750000      0.000000      1.125000e+05  2.700000e+05   
50%    102915.500000      0.000000      1.440000e+05  5.084955e+05   
75%    104375.500000      1.000000      2.025000e+05  8.086500e+05   
max    105849.000000      7.000000      1.935000e+06  2.517300e+06   

         AMT_ANNUITY  AMT_GOODS_PRICE  REGION_POPULATION_RELATIVE  \

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['AGE_YEARS'] = np.abs(df_cleaned['DAYS_BIRTH'] / 365.25)
