In [None]:
# Ask the inline matplotlib backend to render figures in both 'png' and 'retina' formats.
%config InlineBackend.figure_formats = {'png', 'retina'}
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# including deprecation and numerical warnings from the libraries above -- confirm
# that a blanket filter is really intended.
warnings.filterwarnings('ignore')

## 1: 데이터 로드

In [None]:
# Load the credit dataset; the regression target used below is the "Balance" column.
credit_csv_path = "./data/credit.csv"
df = pd.read_csv(credit_csv_path)
df

In [None]:
# summary
# 회귀분석 : 단순선형, 다중
# 다중회귀 : feature selection : 산점도, 상관계수, VIF
# sklearn, statsmodels
# mae

## 2: 데이터 전처리

In [None]:
# 수치형 데이터만 필터링

In [None]:
# Show the dtype of every column so the numeric ones can be picked out in the next step.
df.dtypes

In [None]:
# Collect the names of all numeric columns, in their original order.
# select_dtypes(include="number") matches every integer/float width, whereas
# comparing a dtype against the strings "int"/"float" only matches the platform's
# default widths (so int64 columns can be missed where "int" means int32).
numerical_columns = df.select_dtypes(include="number").columns.tolist()
numerical_columns

In [None]:
# Keep only the numeric columns and discard "ID", which is considered
# unnecessary for the model.
filtered_df = df.loc[:, numerical_columns].drop("ID", axis="columns")
filtered_df.iloc[-2:]

In [None]:
# feature, target 데이터 분리

In [None]:
# Feature matrix: every remaining numeric column except the target.
# Dropping "Balance" by name (instead of slicing off the last column) keeps this
# correct even if the column order of the CSV changes.
features = filtered_df.drop(columns=["Balance"])
features.tail(1)

In [None]:
# Target vector: the "Balance" column; preview its first two values.
target = filtered_df.loc[:, "Balance"]
target.iloc[:2]

## 3: 모델 학습

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares model on all numeric features.
model = LinearRegression()
model.fit(features, target)

## 4: 모델 성능 확인

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error of the fitted model, rounded to two decimals.
# NOTE(review): predictions are made on the same rows the model was fitted on,
# so this is an in-sample error, not a held-out estimate.
pred = model.predict(X=features)
mae = np.round(mean_absolute_error(y_true=target, y_pred=pred), decimals=2)
mae

## 5: 모델 feature 최적화

In [None]:
# 1. 상관계수 확인

In [None]:
# Pairwise Pearson correlation between all numeric columns (features and target).
filtered_df.corr(method="pearson")

In [None]:
# Heatmap of the squared correlation coefficients; squaring drops the sign so
# only the strength of each linear relationship is shown.
fig, ax = plt.subplots(figsize=(20, 5))
squared_corr = filtered_df.corr().pow(2)
sns.heatmap(squared_corr, annot=True, fmt=".2f", ax=ax)
plt.show()

In [None]:
# Balance와 관계가 없는 컬럼 Cards, Age, Education 확인

In [None]:
# List the feature column names to locate the columns that are candidates for removal.
features.columns

In [None]:
# Preview the column names that remain once the last three feature columns are excluded.
features.columns[:-3]

In [None]:
# Remove the columns judged unrelated to Balance from the correlation check
# (Cards, Age, Education). They are dropped by name rather than by position so
# the selection does not silently change if the column order differs.
features_2 = features.drop(columns=["Cards", "Age", "Education"])
features_2.tail(2)

In [None]:
# Refit on the reduced feature set and recompute the in-sample MAE
# (rounded to two decimals) for comparison with the full model.
model = LinearRegression()
model.fit(features_2, target)
pred = model.predict(X=features_2)
mae = np.round(mean_absolute_error(y_true=target, y_pred=pred), decimals=2)
mae

In [None]:
# Cards, Age, Education 3개의 컬럼을 제거해도 성능에 큰 변화가 없음 

In [None]:
# 2. 요약표 확인

In [None]:
import statsmodels.api as sm

In [None]:
# Build the statsmodels design matrix by adding an intercept column to features_2.
# (Per the author's note: with the full `features` set, Education shows a high
# p-value in the summary, so removing it is preferable.)
sm_feature = sm.add_constant(features_2, prepend=True)

In [None]:
# Fit OLS with Balance as the response and the intercept-augmented features as regressors.
ols_spec = sm.OLS(endog=target, exog=sm_feature)
model = ols_spec.fit()

In [None]:
# Print the regression summary table of the fitted statsmodels model.
print(model.summary2())

In [None]:
# Check the MAE of the statsmodels fit (rounded to two decimals).
# Use the results object's own predict() instead of a hand-written
# design-matrix/parameter dot product; it applies the fitted parameters to the
# given design matrix and keeps the row index.
pred = model.predict(sm_feature)
mae = np.round(mean_absolute_error(target, pred), 2)
mae

In [None]:
# 3. VIF 지표 확인

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for each feature in features_2.
# variance_inflation_factor regresses one column on the other columns of the
# matrix it is given, so the matrix must contain an intercept column; without
# it the auxiliary regressions are forced through the origin and the VIFs are
# inflated. The constant is added only for the computation and is not reported.
vif_design = sm.add_constant(features_2)
pd.DataFrame({
    "feature": features_2.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, vif_design.columns.get_loc(column))
        for column in features_2.columns
    ],
})

In [None]:
# Remove the "Limit" column and preview the last two rows of what is left.
features_3 = features_2.drop("Limit", axis="columns")
features_3.iloc[-2:]

In [None]:
# Variance inflation factor for each feature in features_3 (after removing Limit).
# An intercept column is added to the design matrix before computing VIF, since
# variance_inflation_factor regresses each column on the others exactly as
# given; omitting the constant yields inflated, uncentered values. Only the
# real features are reported.
vif_design = sm.add_constant(features_3)
pd.DataFrame({
    "feature": features_3.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, vif_design.columns.get_loc(column))
        for column in features_3.columns
    ],
})

In [None]:
# Check the summary table: refit OLS on features_3 plus an intercept and print it.
sm_feature = sm.add_constant(features_3)
ols_spec = sm.OLS(endog=target, exog=sm_feature)
model = ols_spec.fit()
print(model.summary2())

In [None]:
# Check the MAE of the refitted statsmodels model (rounded to two decimals).
# The results object's predict() replaces the manual design-matrix/parameter
# dot product; it applies the fitted parameters to the given design matrix.
pred = model.predict(sm_feature)
mae = np.round(mean_absolute_error(target, pred), 2)
mae

In [None]:
# R-squared의 변화가 거의 없음
# mae : 120.44 -> 120.8 거의 변화 없음
# 결론 Income, Rating 컬럼만 사용해서 모델을 만드는것이 가장 좋음