In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read card_cust.csv from the current working directory and preview two rows.
df = pd.read_csv("card_cust.csv")
df.head(2)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0
1,10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0


### 전처리 (Preprocessing)

In [3]:
# Count missing values in each column.
df.isna().sum()

CUST_ID                              0
BALANCE                              0
BALANCE_FREQUENCY                    0
PURCHASES                            0
ONEOFF_PURCHASES                     0
INSTALLMENTS_PURCHASES               0
CASH_ADVANCE                         0
PURCHASES_FREQUENCY                  0
ONEOFF_PURCHASES_FREQUENCY           0
PURCHASES_INSTALLMENTS_FREQUENCY     0
CASH_ADVANCE_FREQUENCY               0
CASH_ADVANCE_TRX                     0
PURCHASES_TRX                        0
CREDIT_LIMIT                         0
PAYMENTS                             0
MINIMUM_PAYMENTS                    74
PRC_FULL_PAYMENT                     0
TENURE                               0
dtype: int64

In [4]:
# Replace missing MINIMUM_PAYMENTS values with the mean of the observed ones.
min_pay_mean = df["MINIMUM_PAYMENTS"].mean()
df["MINIMUM_PAYMENTS"] = df["MINIMUM_PAYMENTS"].fillna(min_pay_mean)

In [5]:
# Total number of missing values across the whole frame.
df.isna().sum().sum()

0

In [6]:
# Independent copy of the imputed frame; the Q1-Q3 sections each start from it.
df_base = df.copy()

### Q1. Correlation between CREDIT_LIMIT and BALANCE for each TENURE value

In [7]:
# Keep only the three columns used in Q1.
q1_cols = ["TENURE", "CREDIT_LIMIT", "BALANCE"]
df_q1 = df_base[q1_cols]

In [8]:
# Preview the first two rows of the Q1 subset.
df_q1.head(2)

Unnamed: 0,TENURE,CREDIT_LIMIT,BALANCE
0,12.0,1000.0,40.900749
1,12.0,7000.0,3202.467416


In [9]:
# Correlation matrix of CREDIT_LIMIT and BALANCE for rows with TENURE == 12.
is_tenure_12 = df_q1["TENURE"] == 12
df_q1_sub = df_q1[is_tenure_12]
df_q1_sub[["CREDIT_LIMIT", "BALANCE"]].corr()

Unnamed: 0,CREDIT_LIMIT,BALANCE
CREDIT_LIMIT,1.0,0.460833
BALANCE,0.460833,1.0


In [10]:
# CREDIT_LIMIT/BALANCE correlation computed separately for every TENURE value,
# collected in the order the TENURE values first appear in the data.
ls_corr = []
for t in df_q1["TENURE"].unique():
    df_q1_sub = df_q1[df_q1["TENURE"] == t]
    corr_mat = df_q1_sub[["CREDIT_LIMIT", "BALANCE"]].corr()
    # Off-diagonal entry of the 2x2 correlation matrix.
    val_corr = corr_mat.loc["CREDIT_LIMIT", "BALANCE"]
    ls_corr.append(val_corr)

In [11]:
# Tabulate the per-tenure correlations, labelled by the matching TENURE value.
tenure_values = df_q1["TENURE"].unique()
ser_corr = pd.Series(ls_corr, index=tenure_values)
ser_corr.reset_index().round(2)

Unnamed: 0,index,0
0,12.0,0.46
1,8.0,0.82
2,11.0,0.38
3,9.0,0.09
4,10.0,0.29
5,7.0,0.95
6,6.0,0.87


In [12]:
# Correlation matrix of CREDIT_LIMIT and BALANCE within each TENURE group.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (groupby(...)["A", "B"]) was deprecated in pandas 1.0 and raises in pandas 2.0.
df_q1.groupby("TENURE")[["CREDIT_LIMIT", "BALANCE"]].corr().round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,CREDIT_LIMIT,BALANCE
TENURE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6.0,CREDIT_LIMIT,1.0,0.87
6.0,BALANCE,0.87,1.0
7.0,CREDIT_LIMIT,1.0,0.95
7.0,BALANCE,0.95,1.0
8.0,CREDIT_LIMIT,1.0,0.82
8.0,BALANCE,0.82,1.0
9.0,CREDIT_LIMIT,1.0,0.09
9.0,BALANCE,0.09,1.0
10.0,CREDIT_LIMIT,1.0,0.29
10.0,BALANCE,0.29,1.0


In [13]:
# Per-TENURE correlation matrices, reduced to the CREDIT_LIMIT row of each one
# so that the BALANCE column holds the CREDIT_LIMIT/BALANCE correlation.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (groupby(...)["A", "B"]) was deprecated in pandas 1.0 and raises in pandas 2.0.
corr_by_tenure = df_q1.groupby("TENURE")[["CREDIT_LIMIT", "BALANCE"]].corr()
corr_by_tenure.xs("CREDIT_LIMIT", level = 1)

Unnamed: 0_level_0,CREDIT_LIMIT,BALANCE
TENURE,Unnamed: 1_level_1,Unnamed: 2_level_1
6.0,1.0,0.868056
7.0,1.0,0.948405
8.0,1.0,0.820696
9.0,1.0,0.085474
10.0,1.0,0.291482
11.0,1.0,0.38036
12.0,1.0,0.460833


### Q2. K-means clustering on standardized features and mean ONEOFF_PURCHASES per cluster

In [14]:
# Copy of the base frame without the CUST_ID column, then a two-row preview.
df_q2 = df_base.drop(columns = "CUST_ID").copy()
df_q2.head(2)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0


In [15]:
# Standardize every column of df_q2 and wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_q2)
df_q2_nor = pd.DataFrame(scaled_values, columns=df_q2.columns)
df_q2_nor.head(2)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,-0.848768,-0.419879,-0.441936,-0.374048,-0.395301,-0.482354,-0.872701,-0.804321,-0.71962,-0.684701,-0.457918,-0.564116,-1.161669,-0.557396,-0.443725,-0.465544,0.282429
1,0.282791,0.012131,-0.469017,-0.374048,-0.470304,1.878468,-1.282558,-0.804321,-0.924403,0.513493,0.065417,-0.628057,0.150025,0.360574,-0.086159,0.331592,0.282429


In [None]:
# Silhouette score of K-means on the standardized features for k = 2..5.
# n_init is pinned explicitly: scikit-learn 1.4 changed the KMeans default from
# 10 to "auto" (a single initialisation for k-means++), so leaving it unset makes
# the clustering, and therefore the scores, depend on the installed version.
ls_sil = []
for k in range(2, 6):
    model_km = KMeans(n_clusters = k, n_init = 10, random_state = 1234)
    model_km.fit(df_q2_nor)
    val_sil = silhouette_score(df_q2_nor, labels = model_km.labels_)
    ls_sil.append(val_sil)

In [17]:
# Silhouette scores for k = 2, 3, 4, 5, in that order.
ls_sil

[0.3075281530456079,
 0.19636128772937608,
 0.20715098494639902,
 0.19274056144483248]

In [None]:
# Final K-means model with two clusters, fitted on the standardized features.
# n_init is pinned explicitly: scikit-learn 1.4 changed the KMeans default from
# 10 to "auto" (a single initialisation for k-means++), so leaving it unset makes
# the cluster assignment depend on the installed version.
model_km = KMeans(n_clusters = 2, n_init = 10, random_state = 1234)
model_km.fit(df_q2_nor)

In [19]:
# Attach the fitted cluster label of each row to the unscaled Q2 frame.
# NOTE(review): this adds a column to df_q2 in place; the scaling cell feeds the
# whole of df_q2 to StandardScaler, so re-running that cell after this one would
# scale (and later cluster on) the "cluster" column as well.
df_q2["cluster"] = model_km.labels_
df_q2.head(2)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,cluster
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0,0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0,0


In [20]:
# Number of rows assigned to each cluster.
df_q2["cluster"].value_counts()

0    802
1    198
Name: cluster, dtype: int64

In [21]:
# Mean ONEOFF_PURCHASES within each cluster, rounded to two decimals.
df_q2.groupby("cluster")["ONEOFF_PURCHASES"].mean().round(2)

cluster
0     340.23
1    3946.19
Name: ONEOFF_PURCHASES, dtype: float64

### Q3. Decision-tree regression of ONEOFF_PURCHASES, evaluated by RMSE on the CUST_ID % 4 == 0 hold-out

In [22]:
# Scratch check of the modulo operator (the split below keys on CUST_ID % 4).
7 % 4, 6 % 4

(3, 2)

In [26]:
# Rows whose CUST_ID is divisible by 4 form the test set; every other row goes
# to the training set.
is_test = (df_base["CUST_ID"] % 4) == 0
df_train = df_base.loc[~is_test]
df_test = df_base.loc[is_test]

In [27]:
# Row counts of the training and test partitions.
len(df_train), len(df_test)

(752, 248)

In [30]:
# Fit a decision-tree regressor for ONEOFF_PURCHASES on the training rows and
# predict the test rows. CUST_ID and the target itself are excluded from the
# feature matrix.
non_feature_cols = ["CUST_ID", "ONEOFF_PURCHASES"]
X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train["ONEOFF_PURCHASES"]
X_test = df_test.drop(columns=non_feature_cols)

model_dt = DecisionTreeRegressor(random_state=1234)
model_dt.fit(X=X_train, y=y_train)
pred = model_dt.predict(X_test)

In [31]:
# First four predicted values.
pred[:4]

array([1508.54,    0.  , 1494.5 ,    0.  ])

In [33]:
# Short aliases: actual target values of the test rows and their predictions.
y_t = df_test["ONEOFF_PURCHASES"]
y_p = pred

In [40]:
# RMSE by hand: square the residuals, average them, take the square root,
# then round to one decimal.
squared_errors = (y_t - y_p) ** 2
rmse_manual = squared_errors.mean() ** 0.5
round(rmse_manual, 1)

2383.8

In [36]:
from sklearn.metrics import mean_squared_error

In [38]:
# Same RMSE via scikit-learn: square root of the mean squared error on the
# test rows, rounded to one decimal.
mse_test = mean_squared_error(y_true=df_test["ONEOFF_PURCHASES"], y_pred=pred)
round(mse_test ** 0.5, 1)

2383.8

In [None]:
# 1039.2