# 1. Loading Dependencies

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# 2. Importing Raw Data

In [2]:
# Resolve the data directory from the CARDORA_DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset the original
# location is used, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("CARDORA_DATA_DIR", "/Users/mayankahuja/Desktop/Cardora/data"))
df = pd.read_csv(DATA_DIR / "UCI_Credit_Card.csv")
print('CSV successfully imported.')

CSV successfully imported.


In [3]:
# Preview the first five rows (head() defaults to n=5) to sanity-check the load.
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


# 3. Standardization of Columns

In [None]:
# Normalise column names: trim surrounding whitespace, lowercase, and replace
# both dots and spaces with underscores (e.g. 'default.payment.next.month'
# becomes 'default_payment_next_month').
df.columns = [
    name.strip().lower().replace('.', '_').replace(' ', '_')
    for name in df.columns
]
print('Columns:', df.columns.tolist())

Columns: ['id', 'limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6', 'default_payment_next_month']


# 4. Cleaning Data

In [5]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

id                            0
limit_bal                     0
sex                           0
education                     0
marriage                      0
age                           0
pay_0                         0
pay_2                         0
pay_3                         0
pay_4                         0
pay_5                         0
pay_6                         0
bill_amt1                     0
bill_amt2                     0
bill_amt3                     0
bill_amt4                     0
bill_amt5                     0
bill_amt6                     0
pay_amt1                      0
pay_amt2                      0
pay_amt3                      0
pay_amt4                      0
pay_amt5                      0
pay_amt6                      0
default_payment_next_month    0
dtype: int64

In [6]:
# Number of rows that exactly repeat an earlier row across all columns.
# NOTE(review): `id` takes part in the comparison; if ids are unique per row this
# can never report a duplicate -- consider checking with `id` excluded.
df.duplicated(keep='first').sum()

np.int64(0)

In [7]:
# Show the dtype pandas inferred for each column before any casting is applied.
df.dtypes

id                              int64
limit_bal                     float64
sex                             int64
education                       int64
marriage                        int64
age                             int64
pay_0                           int64
pay_2                           int64
pay_3                           int64
pay_4                           int64
pay_5                           int64
pay_6                           int64
bill_amt1                     float64
bill_amt2                     float64
bill_amt3                     float64
bill_amt4                     float64
bill_amt5                     float64
bill_amt6                     float64
pay_amt1                      float64
pay_amt2                      float64
pay_amt3                      float64
pay_amt4                      float64
pay_amt5                      float64
pay_amt6                      float64
default_payment_next_month      int64
dtype: object

In [8]:
# Store the integer-coded demographic fields as pandas categoricals instead of
# plain integers.
for col in ('sex', 'education', 'marriage'):
    df[col] = df[col].astype('category')

# 5. Feature Engineering

In [9]:
# Element-wise check that each credit limit is strictly positive; displays the
# resulting boolean Series.
# NOTE(review): this only displays the mask -- an aggregate such as `.all()` would
# give a single yes/no answer if that is the intent.
df['limit_bal'].gt(0)

0        True
1        True
2        True
3        True
4        True
         ... 
29995    True
29996    True
29997    True
29998    True
29999    True
Name: limit_bal, Length: 30000, dtype: bool

In [10]:
# Column groups covering the six bill and six payment amount fields.
bill_cols = [f'bill_amt{m}' for m in range(1, 7)]
pay_amt_cols = [f'pay_amt{m}' for m in range(1, 7)]

# Boolean flag: True where bill_amt1 exceeds the credit limit.
# NOTE(review): despite the name this is an over-limit flag, not a utilisation ratio.
df['utilisation'] = df['bill_amt1'].gt(df['limit_bal'])

# Row-wise mean of the six bill amounts.
# NOTE(review): this average includes bill_amt1, which a later cell sets as TARGET;
# using it as a feature would leak the target -- confirm this is intended.
df['avg_bill_amt_6m'] = df[bill_cols].mean(axis='columns')

# Row-wise total of the six payment amounts.
df['total_pay_amt_6m'] = df[pay_amt_cols].sum(axis='columns')

#### A. Choosing the Target Variable

The target variable is set as `bill_amt1`, the customer's **most recent bill statement amount** (September 2005 in the UCI data dictionary), which the model treats as the billing amount to predict.  
This variable is chosen because it directly reflects short-term financial activity and helps identify spending or repayment trends.

Selecting this target allows the model to learn patterns that predict how much a customer is likely to owe next month.  
Such predictions can assist credit analysts in identifying customers at risk of overspending or requiring credit adjustments.

If an irrelevant or indirect variable were chosen as the target, the model’s predictions would lose business meaning and accuracy, making the results difficult to interpret or apply.


In [11]:
# Name of the column used as the regression target.
TARGET = 'bill_amt1'

#### B. Capping extremely large bill amounts for robust regression

In [12]:
# Cap bill_amt1 at its 99.5th percentile and keep the result in a separate column.
# Only the upper tail is capped; values below the cap pass through unchanged.
# NOTE(review): TARGET still names the unclipped 'bill_amt1' column -- confirm which
# of the two the regression is meant to use.
clip_upper = df['bill_amt1'].quantile(q=0.995)
df['bill_amt1_clipped'] = df['bill_amt1'].clip(lower=None, upper=clip_upper)

#### C. Final feature list for baseline regression

In [None]:
# Baseline numeric predictors: two raw columns followed by two engineered ones.
feature_cols = [
    'limit_bal',
    'age',
    'avg_bill_amt_6m',
    'total_pay_amt_6m',
]

#### D. Extending 'feature_cols' with new dummies

In [14]:
# Add every one-hot column derived from sex / education / marriage to the feature
# list. The membership check keeps the cell idempotent: re-running it no longer
# appends the same column names a second time.
# NOTE(review): no cell visible here creates `sex_*` / `education_*` / `marriage_*`
# columns (e.g. via pd.get_dummies), so this loop currently matches nothing --
# confirm whether the encoding step is missing.
for c in df.columns:
    if c.startswith(('sex_', 'education_', 'marriage_')) and c not in feature_cols:
        feature_cols.append(c)

In [15]:
# Report the chosen feature names and the frame's (rows, columns) shape.
print(f"Final features: {feature_cols}")
print(f"Final dataset shape: {df.shape}")

Final features: ['limit_bal', 'age', 'utilization', 'avg_bill_amt_6m', 'total_pay_amt_6m']
Final dataset shape: (30000, 29)


In [16]:
# Preview the first five rows of the processed frame, including the new columns.
df.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month,utilisation,avg_bill_amt_6m,total_pay_amt_6m,bill_amt1_clipped
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,689.0,0.0,0.0,0.0,0.0,1,False,1284.0,689.0,3913.0
1,2,120000.0,2,2,2,26,-1,2,0,0,...,1000.0,1000.0,1000.0,0.0,2000.0,1,False,2846.166667,5000.0,2682.0
2,3,90000.0,2,2,2,34,0,0,0,0,...,1500.0,1000.0,1000.0,1000.0,5000.0,0,False,16942.166667,11018.0,29239.0
3,4,50000.0,2,2,1,37,0,0,0,0,...,2019.0,1200.0,1100.0,1069.0,1000.0,0,False,38555.666667,8388.0,46990.0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,36681.0,10000.0,9000.0,689.0,679.0,0,False,18223.166667,59049.0,8617.0


# 6. Exporting Processed Data

In [17]:
# Write the processed frame without the index column. The target directory comes
# from the CARDORA_DATA_DIR environment variable when set; otherwise the original
# location is used, so existing runs write to the same file as before.
output_dir = Path(os.environ.get("CARDORA_DATA_DIR", "/Users/mayankahuja/Desktop/Cardora/data"))
df.to_csv(output_dir / 'processed_data.csv', index=False)