In [12]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# Load your dataset
dataset = pd.read_csv('C:\\Users\\neenu\\Jupyter\\Assignments\\Final_Project\\Default_of_Credit_Card_Clients\\default_of_credit_card_clients.csv')
#print(df)

In [13]:
df =pd.DataFrame(dataset)
print(df)

          ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  \
0          1      20000    2          2         1   24      2      2     -1   
1          2     120000    2          2         2   26     -1      2      0   
2          3      90000    2          2         2   34      0      0      0   
3          4      50000    2          2         1   37      0      0      0   
4          5      50000    1          2         1   57     -1      0     -1   
...      ...        ...  ...        ...       ...  ...    ...    ...    ...   
29995  29996     220000    1          3         1   39      0      0      0   
29996  29997     150000    1          3         2   43     -1     -1     -1   
29997  29998      30000    1          2         2   37      4      3      2   
29998  29999      80000    1          3         1   41      1     -1      0   
29999  30000      50000    1          2         1   46      0      0      0   

       PAY_4  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6 

In [9]:
# Step 2: Check for missing values and handle them
print("Missing values in each column:\n", df.isnull().sum())

# Fill missing numeric values with the mean of the column
df.fillna(df.mean(), inplace=True)

# Drop rows with missing values (if preferred)
df.dropna(inplace=True)

Missing values in each column:
 ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64


In [15]:
print(df.columns)


Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')


In [46]:
# Step 3: Encode categorical data
# Display the columns in the DataFrame to check for any naming discrepancies
print(df.columns)

# Assuming the columns are named correctly, here is the corrected encoding code

# Label Encoding for binary categorical columns (if 'default payment next month' exists)
if 'default payment next month' in df.columns:
    label_encoder = LabelEncoder()
    df['default payment next month'] = label_encoder.fit_transform(df['default payment next month'])

# One-Hot Encoding for multiclass categorical columns
# Ensure columns match exactly, including 'SEX' and 'EDUCATION' names
if 'SEX' in df.columns and 'EDUCATION' in df.columns and 'MARRIAGE' in df.columns :
    df = pd.get_dummies(df, columns=['SEX', 'EDUCATION','MARRIAGE'], drop_first=True)

print(df)

Index(['ID', 'LIMIT_BAL', 'EDUCATION', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
       'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'SEX_1', 'MARRIAGE_1', 'MARRIAGE_2',
       'MARRIAGE_3'],
      dtype='object')
          ID  LIMIT_BAL  EDUCATION       AGE     PAY_0     PAY_2     PAY_3  \
0          1  -1.136720          2 -1.246020  1.794564  1.782348 -0.696663   
1          2  -0.365981          2 -1.029047 -0.874991  1.782348  0.138865   
2          3  -0.597202          2 -0.161156  0.014861  0.111736  0.138865   
3          4  -0.905498          2  0.164303  0.014861  0.111736  0.138865   
4          5  -0.905498          2  2.334029 -0.874991  0.111736 -0.696663   
...      ...        ...        ...       ...       ...       ...       ...   
29995  29996   0.404759          3  0.381275  0.014861  0.111736 

In [38]:
# Step 4: Scale numerical features

'''
LIMIT_BAL (Credit limit)
AGE (Age of the customer)
PAY_0, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6 (Payment status)
BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6 (Bill amounts)
PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6 (Payment amounts)
'''

scaler = StandardScaler()
df[['LIMIT_BAL', 'AGE','PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5','PAY_AMT6']] = scaler.fit_transform(df[['LIMIT_BAL', 'AGE','PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6','BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6','PAY_AMT1','PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5','PAY_AMT6']])

print(df)
# Alternatively, MinMaxScaler for scaling between 0 and 1
#minmax_scaler = MinMaxScaler()
#data[['numerical_column1', 'numerical_column2']] = minmax_scaler.fit_transform(data[['numerical_column1', 'numerical_column2']])

          ID  LIMIT_BAL  EDUCATION       AGE     PAY_0     PAY_2     PAY_3  \
0          1  -1.136720          2 -1.246020  1.794564  1.782348 -0.696663   
1          2  -0.365981          2 -1.029047 -0.874991  1.782348  0.138865   
2          3  -0.597202          2 -0.161156  0.014861  0.111736  0.138865   
3          4  -0.905498          2  0.164303  0.014861  0.111736  0.138865   
4          5  -0.905498          2  2.334029 -0.874991  0.111736 -0.696663   
...      ...        ...        ...       ...       ...       ...       ...   
29995  29996   0.404759          3  0.381275  0.014861  0.111736  0.138865   
29996  29997  -0.134759          3  0.815221 -0.874991 -0.723570 -0.696663   
29997  29998  -1.059646          2  0.164303  3.574267  2.617654  1.809921   
29998  29999  -0.674276          3  0.598248  0.904712 -0.723570  0.138865   
29999  30000  -0.905498          2  1.140680  0.014861  0.111736  0.138865   

          PAY_4     PAY_5     PAY_6  ...  PAY_AMT2  PAY_AMT3  P

In [51]:
# Step 5: Split the dataset into training and testing sets
X = df.drop('default payment next month', axis=1)  # Independent variables
y = df['default payment next month']               # Dependent variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (24000, 26)
X_test shape: (6000, 26)
y_train shape: (24000,)
y_test shape: (6000,)
