Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Pin NumPy's global random state so NumPy-driven randomness is repeatable
np.random.seed(seed=42)

# Read the raw credit-card dataset into memory
print("Loading data...")
df = pd.read_csv(filepath_or_buffer="data/UCI_Credit_Card.csv")

# Overview of what was loaded: header line followed by the (rows, columns) shape
print("\nBasic Information:", f"Shape of dataset: {df.shape}", sep="\n")

Loading data...

Basic Information:
Shape of dataset: (30000, 25)


In [2]:
# Check for missing values: the per-column "Non-Null Count" reported by info()
# can be compared against the total number of entries to spot gaps.
print("\nMissing values in each column:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() -- that would emit a stray "None" line.
df.info()


Missing values in each column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13 

In [3]:
# Build the modeling frame: same data as `df`, minus the ID column
# (a row identifier is not useful for modeling)
credit_data = df.drop(columns=['ID'])

# Inspect which category codes occur before any recoding
education_before = credit_data['EDUCATION'].unique()
marriage_before = credit_data['MARRIAGE'].unique()
print("\nUnique values in EDUCATION:", education_before)
print("Unique values in MARRIAGE:", marriage_before)

# Recode the unusual category codes:
#   EDUCATION: 0, 5 and 6 are folded into 4
#   MARRIAGE:  0 is folded into 3
education_recode = {0: 4, 5: 4, 6: 4}
marriage_recode = {0: 3}
credit_data['EDUCATION'] = credit_data['EDUCATION'].replace(education_recode)
credit_data['MARRIAGE'] = credit_data['MARRIAGE'].replace(marriage_recode)

# Show the codes again to confirm the recoding took effect
print("\nAfter cleaning:")
education_after = credit_data['EDUCATION'].unique()
marriage_after = credit_data['MARRIAGE'].unique()
print("Unique values in EDUCATION:", education_after)
print("Unique values in MARRIAGE:", marriage_after)


Unique values in EDUCATION: [2 1 3 5 4 6 0]
Unique values in MARRIAGE: [1 2 3 0]

After cleaning:
Unique values in EDUCATION: [2 1 3 4]
Unique values in MARRIAGE: [1 2 3]


In [4]:
# Separate the label column (y) from the predictor columns (X)
target_col = 'default.payment.next.month'
y = credit_data[target_col]
X = credit_data.drop(columns=[target_col])

# Two-stage split into 60% train / 20% validation / 20% test:
# first hold out 40% of the rows, then cut that holdout in half.
# Both stages stratify on the label and use a fixed random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

# Columns to standardize: credit limit, age, and every bill / payment amount column
amount_cols = [col for col in X.columns if col.startswith(('BILL_', 'PAY_AMT'))]
numerical_cols = ['LIMIT_BAL', 'AGE'] + amount_cols

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to all three splits. The fit must happen before the loop
# because the loop overwrites the training columns in place.
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])
for split in (X_train, X_val, X_test):
    split[numerical_cols] = scaler.transform(split[numerical_cols])

Training set shape: (18000, 23)
Validation set shape: (6000, 23)
Test set shape: (6000, 23)


In [5]:
# Write each split to its own CSV file; the row index is not written out
print("Saving preprocessed data to CSV files...")
csv_outputs = {
    "data/X_train.csv": X_train,
    "data/X_val.csv": X_val,
    "data/X_test.csv": X_test,
    "data/y_train.csv": y_train,
    "data/y_val.csv": y_val,
    "data/y_test.csv": y_test,
}
# Dicts preserve insertion order, so files are written in the order listed above
for csv_path, split_data in csv_outputs.items():
    split_data.to_csv(csv_path, index=False)

Saving preprocessed data to CSV files...
