# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Data Preprocessing

### Data Collection

In [2]:
# Load the credit-score dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/credit_score.csv')

In [None]:
# Display the freshly loaded DataFrame for a first look at its rows and columns.
df

In [None]:
# Summary statistics of the DataFrame as produced by pandas' describe().
df.describe()

### Data Cleansing

#### Define target

In [5]:
# Name of the raw label column; a binary target derived from it replaces it further down.
target = 'Credit Score'

In [None]:
# Bar chart of how many rows fall into each class of the current target column.
target_count = df[target].value_counts()
class_labels = target_count.index.astype(str)  # string labels for the x axis
plt.bar(x=class_labels, height=target_count.values)
plt.show()

#### Transform target

In [7]:
# Binary target: 1 where the raw label equals 'High', 0 for every other value.
is_high = df['Credit Score'] == 'High'
df['High Credit'] = is_high.astype('int64')

# The raw label column is no longer needed once the binary target exists.
df = df.drop(columns=['Credit Score'])

In [None]:
# Display the DataFrame after the raw label was replaced by the binary 'High Credit' column.
df

In [9]:
# From here on the target is the binary column created above.
target = 'High Credit'

In [None]:
# Bar chart of the class counts of the current target column.
target_count = df[target].value_counts()
class_labels = target_count.index.astype(str)  # string labels for the x axis
plt.bar(x=class_labels, height=target_count.values)
plt.show()

#### Check data type

In [None]:
# Inspect each column's dtype.
df.dtypes

#### Clean missing value

In [None]:
# Count missing values per column.
df.isnull().sum()

#### Transform categorical columns

##### Nominal

In [13]:
# One-hot encode the unordered categorical columns. drop_first=True drops the
# first level of each column so the remaining dummies are not perfectly collinear.
nominal_cols = ['Gender', 'Marital Status', 'Home Ownership']
df = pd.get_dummies(
    data=df,
    columns=nominal_cols,
    prefix=nominal_cols,
    drop_first=True,
)

In [None]:
# Display the DataFrame with the dummy columns added.
df

In [15]:
# Cast every boolean column to integers; all other columns are left untouched.
bool_casts = {name: int for name, kind in df.dtypes.items() if kind == 'bool'}
df = df.astype(bool_casts)

In [None]:
# Display the DataFrame after the boolean columns were cast to integers.
df

##### Ordinal

In [None]:
# List the distinct education levels present in the data.
df['Education'].unique()

In [18]:
# Education levels listed from lowest to highest; a level's position in the
# list is its ordinal code (0-4).
education_levels = [
    "High School Diploma",
    "Associate's Degree",
    "Bachelor's Degree",
    "Master's Degree",
    "Doctorate",
]
education_order = {level: rank for rank, level in enumerate(education_levels)}

# Replace the text labels with their ordinal codes.
df['Education'] = df['Education'].map(education_order)

In [None]:
# Display the DataFrame with 'Education' encoded as ordinal codes.
df

#### Assign target and features

In [20]:
# Every column except the target is a candidate feature.
features = df.columns[df.columns != target].tolist()

In [None]:
# Display the candidate feature names.
features

In [22]:
# Separate the feature matrix from the target vector.
X, y = df[features], df[target]

In [None]:
# Display the target vector.
y

In [None]:
# Display the feature matrix.
X

### Split dataset

In [25]:
# Hold out 30% of the rows for testing with a fixed seed for reproducibility.
# Stratifying on y keeps the class ratio of the target the same in both
# partitions instead of leaving it to the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Display the training features.
X_train

In [None]:
# Display the training target.
y_train

### Feature selection

In [None]:
# Correlations are computed on the training rows only, so the held-out test
# rows cannot influence which features get selected (avoids data leakage).
# The rows are taken from df so the target column is part of the matrix.
correlation_matrix = df.loc[X_train.index].corr()

plt.figure(figsize=(8, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [29]:
# Feature subset kept for modelling; this order is also the column order used
# for the scaler and for the coefficient table.
features = [
    'Age',
    'Income',
    'Education',
    'Number of Children',
    'Marital Status_Single',
    'Home Ownership_Rented',
]

In [30]:
# Restrict both partitions to the selected feature columns.
X_train, X_test = X_train[features], X_test[features]

In [None]:
# Display the training features after restricting them to the selected columns.
X_train

## Feature Engineering

### Standardization

In [32]:
# Scaler instance used to standardize the feature columns.
scaler = StandardScaler()

In [33]:
# Fit the scaler on the training features and rebuild a DataFrame that keeps
# the original row index and the feature names as columns.
train_index = X_train.index
X_train_scale = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=train_index,
    columns=features,
)

In [None]:
# Display the standardized training features.
X_train_scale

In [35]:
# Apply the already-fitted scaler to the test features (transform only, no
# refit) and keep the original row index and feature names.
test_index = X_test.index
X_test_scale = pd.DataFrame(
    scaler.transform(X_test),
    index=test_index,
    columns=features,
)

## Model

### First training

In [None]:
# Baseline fit: logistic regression with scikit-learn's default settings on the
# standardized training features.
model = LogisticRegression()
model.fit(X_train_scale, y_train)

In [None]:
# Fitted intercept term of the model.
model.intercept_[0]

In [None]:
# Pair each selected feature with its fitted coefficient for easier reading.
pd.DataFrame({'feature':features, 'coef':model.coef_[0]})

In [None]:
# Predicted class labels for the standardized test rows.
model.predict(X_test_scale)

In [None]:
# Predicted class probabilities for the test rows, one column per class.
model.predict_proba(X_test_scale)

In [41]:
# Probability from the second class column (index 1) for each test row;
# presumably the positive class (target == 1) — confirm via model.classes_.
y_pred_proba = model.predict_proba(X_test_scale)[:, 1]
# Raw residuals: observed 0/1 label minus the predicted probability.
residuals = y_test - y_pred_proba

### Check assumption

#### Linearity

In [None]:
# Scatter each standardized training feature against the model's fitted log-odds.
# decision_function returns the linear predictor, which for a binary logistic
# model is the log-odds. Reading it directly avoids log(p / (1 - p)), which
# divides by zero or takes log(0) when a predicted probability saturates at
# exactly 0 or 1.
log_odds = model.decision_function(X_train_scale)

for feature in features:
    plt.scatter(X_train_scale[feature], log_odds)
    plt.xlabel(feature)
    plt.ylabel('Log-Odds')
    plt.title('Linearity Check for Log-Odds')
    plt.show()

#### Independence

In [None]:
# Durbin-Watson compares each residual with the one before it, so the result
# depends on row order. The residuals carry the shuffled index produced by the
# train/test split; sorting by index restores the original dataset order
# before the statistic is computed.
dw_test = durbin_watson(residuals.sort_index())
print(f"Durbin-Watson test statistic: {dw_test}")

#### Multicollinearity

In [None]:
# Variance inflation factor for each selected feature, computed on the
# standardized training data. The design matrix is extracted once (instead of
# re-slicing the frame on every iteration) and passed as a NumPy array, the
# input type statsmodels documents for `exog`.
vif_columns = X_train_scale[features].columns
design_matrix = X_train_scale[features].to_numpy()

vif = pd.DataFrame()
vif['Features'] = vif_columns
vif['VIF'] = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(design_matrix.shape[1])
]
print(vif)

#### Large sample size

In [None]:
# Class counts of the target across the full dataset.
df[target].value_counts()

In [None]:
# Rows in the rarer class per model feature, a sample-size rule of thumb for
# logistic regression. .min() picks the rarer class whichever label it
# carries, instead of assuming it is the class labelled 0.
df[target].value_counts().min() / len(features)

### Final Training

In [None]:
# Final fit on the standardized training features with an explicit L2 penalty.
# NOTE(review): L2 is presumably also scikit-learn's default penalty, in which
# case this fit matches the first training run — confirm.
model = LogisticRegression(penalty='l2')
model.fit(X_train_scale, y_train)

In [None]:
# Fitted intercept term of the final model.
model.intercept_[0]

In [None]:
# Pair each selected feature with its coefficient in the final model.
pd.DataFrame({'feature':features, 'coef':model.coef_[0]})

## Evaluate

In [None]:
# Per-class precision/recall/F1 on the training data (in-sample performance).
y_pred_train = model.predict(X_train_scale)
train_report = classification_report(y_true=y_train, y_pred=y_pred_train)
print(train_report)

In [None]:
# Per-class precision/recall/F1 on the held-out test data.
y_pred_test = model.predict(X_test_scale)
test_report = classification_report(y_true=y_test, y_pred=y_pred_test)
print(test_report)

## Test run

In [52]:
# Hand-written sample record with raw (unscaled) values, keyed by the selected
# feature names in the same order as the `features` list.
# NOTE: the name `input` shadows Python's built-in input() for the rest of the session.
input = dict([
    ('Age', 48),
    ('Income', 82500),
    ('Education', 4),
    ('Number of Children', 1),
    ('Marital Status_Single', 0),
    ('Home Ownership_Rented', 0),
])

In [53]:
# Wrap the single record in a one-row DataFrame (row label 0); columns follow the dict keys.
input_df = pd.DataFrame(input, index=[0])

In [None]:
# Display the raw one-row sample.
input_df

In [55]:
# Standardize the sample with the scaler fitted on the training data.
# Selecting `features` explicitly hands the columns to the scaler in the order
# it was fitted on, regardless of the key order of the sample dict.
input_df = pd.DataFrame(scaler.transform(input_df[features]), columns=features)

In [None]:
# Display the standardized sample.
input_df

In [None]:
# Predicted class label for the sample record.
model.predict(input_df)

In [None]:
# Predicted class probabilities for the sample record, one column per class.
model.predict_proba(input_df)