
# Credit Consumption Prediction


## 1. Import Libraries

In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## 2. Load & Merge Data

In [22]:
demographic = pd.read_excel(r'G:\My Courses\0. AnalytixLabs\5. Machine Learning, Text Mining and Intro to AI with Python\0. Case Studies\1. Capstone Case Study - Predict Cred Card Consumption\CustomerDemographics.xlsx')
behavior = pd.read_excel(r'G:\My Courses\0. AnalytixLabs\5. Machine Learning, Text Mining and Intro to AI with Python\0. Case Studies\1. Capstone Case Study - Predict Cred Card Consumption\CustomerBehaviorData.xlsx')
credit = pd.read_excel(r'G:\My Courses\0. AnalytixLabs\5. Machine Learning, Text Mining and Intro to AI with Python\0. Case Studies\1. Capstone Case Study - Predict Cred Card Consumption\CreditConsumptionData.xlsx')

data = demographic.merge(behavior, on="ID", how="left")
data = data.merge(credit, on="ID", how="left")

data.head()


Unnamed: 0,ID,account_type,gender,age,Income,Emp_Tenure_Years,Tenure_with_Bank,region_code,NetBanking_Flag,Avg_days_between_transaction,...,debit_count_may,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active,cc_cons
0,17051,current,M,30,MEDIUM,26.4,9,9,355,0,...,20,41860.0,32734.75,80959.0,36,9,171200.0,Y,3448.84,16239.0
1,11491,current,M,37,LOW,14.4,7,7,485,0,...,8,113367.0,60974.75,495080.0,5,3,15694.0,Y,3812.69,39002.0
2,7433,current,M,33,MEDIUM,3.2,1,1,764,0,...,14,168000.0,425802.96,115707.38,7,58,28058.0,Y,9432.9,21182.0
3,14606,current,M,63,LOW,10.2,6,6,863,0,...,44,57750.0,25537.91,63606.0,12,0,24459.0,Y,144.61,8123.0
4,8381,saving,M,33,MEDIUM,26.4,6,6,523,0,...,2,18405.0,64687.32,62353.35,49,35,31574.0,Y,1887.89,28282.0


## 3. Train–Test Split

In [23]:
train_data = data[data['cc_cons'].notna()].copy()
test_data  = data[data['cc_cons'].isna()].copy()

print("Train shape:", train_data.shape)
print("Test shape :", test_data.shape)


Train shape: (15000, 49)
Test shape : (5000, 49)


## 4. Define Features & Target

In [24]:
X = train_data.drop(['ID', 'cc_cons'], axis=1)
y = train_data['cc_cons']

## 5. Identify Column Types

In [25]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()

cat_cols, num_cols

(['account_type', 'gender', 'Income', 'loan_enq'],
 ['age',
  'Emp_Tenure_Years',
  'Tenure_with_Bank',
  'region_code',
  'NetBanking_Flag',
  'Avg_days_between_transaction',
  'cc_cons_apr',
  'dc_cons_apr',
  'cc_cons_may',
  'dc_cons_may',
  'cc_cons_jun',
  'dc_cons_jun',
  'cc_count_apr',
  'cc_count_may',
  'cc_count_jun',
  'dc_count_apr',
  'dc_count_may',
  'dc_count_jun',
  'card_lim',
  'personal_loan_active',
  'vehicle_loan_active',
  'personal_loan_closed',
  'vehicle_loan_closed',
  'investment_1',
  'investment_2',
  'investment_3',
  'investment_4',
  'debit_amount_apr',
  'credit_amount_apr',
  'debit_count_apr',
  'credit_count_apr',
  'max_credit_amount_apr',
  'debit_amount_may',
  'credit_amount_may',
  'credit_count_may',
  'debit_count_may',
  'max_credit_amount_may',
  'debit_amount_jun',
  'credit_amount_jun',
  'credit_count_jun',
  'debit_count_jun',
  'max_credit_amount_jun',
  'emi_active'])

## 6. Preprocessing

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

## 7. Model Pipeline

In [33]:
# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical pipeline
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, cat_cols),
        ('num', num_pipeline, num_cols)
    ]
)

## 8. Train Model

In [35]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(
        n_estimators=300,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ))
])

In [36]:
model.fit(X_train, y_train)

## 9. RMSPE Evaluation

In [37]:
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))

val_pred = model.predict(X_val)
print("Validation RMSPE:", round(rmspe(y_val, val_pred), 4))


Validation RMSPE: 0.4074


## 10. Predict on Test Data

In [38]:
# Validation
val_pred = model.predict(X_val)

# Test prediction
X_test = test_data.drop(['ID', 'cc_cons'], axis=1)
test_data['Predicted_cc_cons'] = model.predict(X_test)

## 11. Save Predictions

In [39]:

test_data[['ID', 'Predicted_cc_cons']].to_csv(
    "credit_consumption_predictions.csv",
    index=False
)

print("Prediction file saved successfully.")


Prediction file saved successfully.


In [40]:
test_data.head()

Unnamed: 0,ID,account_type,gender,age,Income,Emp_Tenure_Years,Tenure_with_Bank,region_code,NetBanking_Flag,Avg_days_between_transaction,...,max_credit_amount_may,debit_amount_jun,credit_amount_jun,credit_count_jun,debit_count_jun,max_credit_amount_jun,loan_enq,emi_active,cc_cons,Predicted_cc_cons
15000,17591,current,M,37,MEDIUM,11.9,4,4,575,1,...,9750.0,30527.88,31271.0,3,12,25945.92,Y,1170.49,,4286.780703
15001,13541,current,M,33,MEDIUM,7.8,6,6,394,0,...,16967.0,14342.83,16582.0,6,39,12214.0,Y,16447.45,,8364.717035
15002,13431,current,M,53,LOW,33.0,10,10,324,1,...,36398.0,32503.16,33539.54,5,9,13215.0,Y,2622.28,,4271.108337
15003,8687,current,M,33,MEDIUM,7.8,5,5,370,1,...,97825.0,93572.42,109429.75,21,0,72317.0,Y,340.79,,8990.094937
15004,14727,current,M,62,LOW,12.6,9,9,505,1,...,27936.0,19011.5,41401.0,6,35,42344.0,Y,2812.2,,4239.943321
