## Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Load the credit-score dataset (path is relative to the notebook's working directory)
credit_score = pd.read_csv("data/credit_score.csv")

# Show at most 10 columns when a DataFrame is displayed
pd.options.display.max_columns = 10
credit_score.head()

### Linear Relationships

In [None]:
# Scatter of credit score against the debt-to-income ratio, to eyeball linearity
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(credit_score["R_DEBT_INCOME"], credit_score["CREDIT_SCORE"])
ax.set_xlabel("Ratio of Debt to Income \n(R_DEBT_INCOME)", size=15)
ax.set_ylabel("Credit Score", size=15)

In [None]:
# Target: the credit score; features: income and the debt-to-income ratio
y = credit_score["CREDIT_SCORE"]

# add_constant prepends an intercept column ("const") to the feature matrix
x = sm.add_constant(credit_score[["INCOME", "R_DEBT_INCOME"]])

# Ordinary Least Squares: y is the dependent variable (endog),
# x holds the independent variables (exog)
model = sm.OLS(endog=y, exog=x).fit()

print(model.summary())

### Reformulating Linear Relationships

In [None]:
# Inverse entertainment ratio: 12-month total divided by 6-month total.
# A zero 6-month total yields NaN (0/0) or +/-inf (n/0); inf is not touched by
# fillna and would poison the mean, so treat it as missing before imputing.
entertainment_inv = credit_score["T_ENTERTAINMENT_12"] / credit_score["T_ENTERTAINMENT_6"]
entertainment_inv = entertainment_inv.replace([np.inf, -np.inf], np.nan)
entertainment_inv = entertainment_inv.fillna(entertainment_inv.mean())

# Build the feature matrix from the raw frame instead of mutating the `x`
# left over from the previous cell, so this cell can be re-run on its own.
x = credit_score[["INCOME", "R_DEBT_INCOME"]].assign(R_ENTERTAINMENT_INV=entertainment_inv)
y = credit_score["CREDIT_SCORE"]

# Fit the regression model (intercept column added by add_constant)
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

print(model.summary())

In [None]:
# Two alternative formulations of the entertainment ratio, shown side by side.
# A notebook cell only displays its last expression, so both are collected in
# one frame rather than evaluated as separate bare expressions.
pd.DataFrame({
    # Inverse ratio: 6-month total over 12-month total
    "INVERSE_RATIO": credit_score["T_ENTERTAINMENT_6"] / credit_score["T_ENTERTAINMENT_12"],
    # Alternate ratio: 6-month total over the remainder of the 12-month total
    "ALTERNATE_RATIO": credit_score["T_ENTERTAINMENT_6"]
    / (credit_score["T_ENTERTAINMENT_12"] - credit_score["T_ENTERTAINMENT_6"]),
}).head()

In [None]:
# Regress credit score on income, debt-to-income ratio and the dataset's
# R_ENTERTAINMENT column, with an intercept term.
y = credit_score["CREDIT_SCORE"]
x = sm.add_constant(credit_score[["INCOME", "R_DEBT_INCOME", "R_ENTERTAINMENT"]])

model = sm.OLS(endog=y, exog=x).fit()
print(model.summary())

- A relative increase in entertainment expenditure is associated with a lower credit score.

- `R_ENTERTAINMENT=T_ENTERTAINMENT_6/T_ENTERTAINMENT_12`

### Polynomial Regression

In [None]:
# Features plus a squared expenditure-ratio term to model a quadratic effect.
# .assign returns a new frame, so the column is not written onto a slice of
# credit_score (which triggers pandas' SettingWithCopyWarning).
x = credit_score[["INCOME", "R_DEBT_INCOME", "R_ENTERTAINMENT", "R_EXPENDITURE"]].assign(
    R_EXPENDITURE_SQR=credit_score["R_EXPENDITURE"] ** 2
)
y = credit_score["CREDIT_SCORE"]

# Add the intercept column and fit the OLS model
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

print(model.summary())

### Discretization

- Converting continuous features into categorical (binned) features in order to capture complex non-linear relationships. Below, the categories of `CAT_GAMBLING` are encoded as 0/1 dummy variables.

In [None]:
# Build the feature matrix explicitly rather than mutating the `x` left over
# from the polynomial-regression cell: same columns, but the cell no longer
# depends on hidden kernel state and can be re-run on its own.
x = credit_score[["INCOME", "R_DEBT_INCOME", "R_ENTERTAINMENT", "R_EXPENDITURE"]].assign(
    R_EXPENDITURE_SQR=credit_score["R_EXPENDITURE"] ** 2,
    # 0/1 dummy variables for the "Low" and "High" gambling categories; every
    # other value (including missing) is the reference level and encodes as 0.
    GAMBLING_LOW=(credit_score["CAT_GAMBLING"] == "Low").astype(int),
    GAMBLING_HIGH=(credit_score["CAT_GAMBLING"] == "High").astype(int),
)
y = credit_score["CREDIT_SCORE"]

# Add the intercept column and fit the OLS model
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()

print(model.summary())

### Interactions

In [None]:
# Add interaction term to features
x=credit_score[["INCOME", "DEBT"]]
x["INCOME.DEBT"]=x["INCOME"]*x["DEBT"]
y=credit_score["CREDIT_SCORE"]

x=sm.add_constant(x)
model=sm.OLS(y, x).fit()

print(model.summary())

In [None]:
# Income vs. debt, with each point coloured by credit score.
# The colour values must be passed as `c=`: the third positional argument of
# plt.scatter is `s` (marker size), which would leave `cmap` with no data.
points = plt.scatter(credit_score["INCOME"],
                     credit_score["DEBT"],
                     c=credit_score["CREDIT_SCORE"], cmap="bwr")
# Hand the scatter's mappable to the colorbar explicitly
plt.colorbar(points, label="Credit Score", orientation="vertical")

plt.xlabel("Income", size=10)
plt.ylabel("Debt", size=10)