In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [None]:
# Load the raw customer table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="galaxy_users.csv")
df.head(n=2)

### Q1.

In [None]:
# Take the block of columns from OnlineSecurity through StreamingMovies
# (label slices include both endpoints) and encode Yes/No answers as 1/0.
df_q1 = (
    df.loc[:, "OnlineSecurity":"StreamingMovies"]
    .copy()
    .replace({"Yes": 1, "No": 0})
)
df_q1.head(n=2)

In [10]:
# Distinct values of every column: Series.unique is applied column by column.
df_q1.apply(pd.Series.unique)
# Note: on the exam-environment version the result has to be wrapped in a list:
# df_q1.apply(lambda x: [x.unique()])

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [11]:
# Keep only the rows whose OnlineSecurity value is not the "No internet service" placeholder.
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"].ne("No internet service")]

In [14]:
# df_q1.loc[df_q1["OnlineSecurity"] == "No internet service", ]

In [None]:
# Turn the "No internet service" placeholder into NaN and drop every row that has it
# in any column. The surviving values are the 0/1 flags, so cast them to int explicitly:
# otherwise the resulting dtype depends on replace() silently downcasting object
# columns, a behaviour pandas has deprecated.
df_q1_sub = (
    df_q1.replace("No internet service", np.nan)
    .dropna()
    .astype(int)
)

In [None]:
# Sanity check: distinct values left in each column after the placeholder rows were removed.
df_q1_sub.apply(pd.Series.unique)

In [22]:
# Row-wise sum of the flags gives the number of subscribed services per customer;
# value_counts then tallies how many customers have each service count.
ser_cnt = df_q1_sub.sum(axis="columns").value_counts()
# Explicit label lookups: customers with exactly 1 service and with all 6 services.
ser_cnt.loc[1], ser_cnt.loc[6]

(966, 284)

In [24]:
# Ratio of one-service customers to six-service customers, to one decimal place.
round(ser_cnt.loc[1] / ser_cnt.loc[6], ndigits=1)

3.4

### Q2.

In [36]:
# "month" is the floor division of TotalCharges by MonthlyCharges, i.e. the number
# of whole monthly charges contained in the total.
df_q2 = df[["tenure", "MonthlyCharges", "TotalCharges"]].assign(
    month=lambda frame: frame["TotalCharges"].floordiv(frame["MonthlyCharges"])
)
df_q2.head(n=2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [33]:
# Pearson is the default method of DataFrame.corr; it is spelled out here for clarity.
# The tenure / month correlation comes out at 0.999.
df_q2[["tenure", "MonthlyCharges", "month"]].corr(method="pearson").round(3)

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


In [38]:
# print(df_q2.corr.__doc__)

In [40]:
# dir(df_q2)

### Q3.

In [45]:
# Two groups of predictor columns used for the churn model.
ls_x1 = ["SeniorCitizen", "Partner", "Dependents", "tenure", "MonthlyCharges", "TotalCharges"]
ls_x2 = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingMovies", "PaperlessBilling"]
# Target column first, then the predictors; Yes/No answers are encoded as 1/0.
df_q3 = df[["Churn", *ls_x1, *ls_x2]].copy().replace({"Yes": 1, "No": 0})
df_q3.head(n=2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [None]:
# Columns that are still object-typed after the Yes/No encoding.
# The boolean-mask form is equivalent to df_q3.select_dtypes(include="object") and is
# used on its own because select_dtypes reportedly does not work on the
# exam-environment version.
df_q3_obj = df_q3.loc[:, df_q3.dtypes == "object"]
df_q3_obj.head(2)

In [None]:
# Distinct values of each remaining object column, to see what still needs encoding.
df_q3_obj.apply(pd.Series.unique)

In [None]:
# Encode the "No internet service" placeholder with the numeric code -1.
df_q3 = df_q3.replace(to_replace="No internet service", value=-1)

In [53]:
# 70 % of the rows go to training; the seed is fixed so the split is reproducible.
df_train, df_test = train_test_split(df_q3, random_state=123, train_size=0.7)
df_train.shape[0], df_test.shape[0]

(4922, 2110)

https://datadoctorblog.com/2023/07/23/Py-ML-Normalization/

https://datadoctorblog.com/2023/08/02/Py-Preprocessing-naming-object-variables/

In [57]:
# Peek at the first training row before scaling.
df_train.head(n=1)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
463,1,0,0,0,7,99.8,673.25,0,1,0,0,1,1


In [54]:
# Learn the per-column min/max on the training rows only, then apply that same
# scaling to both splits.
# NOTE(review): the whole frame is passed in, so the Churn target (first column) is
# scaled together with the predictors — presumably harmless because it is already
# 0/1; confirm.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (
    model_nor.transform(part) for part in (df_train, df_test)
)

In [None]:
# First scaled training row (still a 2-D slice).
arr_train_nor[:1]

In [None]:
# Column 0 of the scaled arrays is the Churn target; the remaining columns are predictors.
# fit() returns the estimator itself, so construction and fitting can be chained.
model_lr = LogisticRegression(random_state=123).fit(
    arr_train_nor[:, 1:],
    arr_train_nor[:, 0],
)
pred = model_lr.predict(arr_test_nor[:, 1:])

In [62]:
# F1 score on the held-out split (true labels are column 0 of the scaled test array).
round(f1_score(arr_test_nor[:, 0], pred), ndigits=2)

0.55

### Q. 특정 범주를 제외한 나머지 모든 범주를 지정한 값으로 치환하고자 하는 경우
 ※ "color" 변수의 "E", "J" 범주만 그대로 두고, 그 외 모든 범주(다른 변수의 범주 포함)를 -1로 치환  
 ※ 고수는 .map() + 사용자 정의 함수(or lambda) 로 해결

In [63]:
# Load the diamonds table from the parent directory of the working directory.
df_dia = pd.read_csv(filepath_or_buffer="../diamonds.csv")
df_dia.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [64]:
# Distinct categories of the "color" column.
df_dia.loc[:, "color"].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [68]:
# Distinct categories of the three categorical columns (positions 1-3 of the frame).
df_dia[["cut", "color", "clarity"]].apply(pd.Series.unique)
# Note: on the exam-environment version the result has to be wrapped in a list:
# df_dia.iloc[:, 1:4].apply(lambda x: [x.unique()])

cut         [Ideal, Premium, Good, Very Good, Fair]
color                         [E, I, J, H, F, G, D]
clarity    [SI2, SI1, VS1, VS2, VVS2, VVS1, I1, IF]
dtype: object

In [69]:
# One row per (column, category) pair: explode() flattens the per-column arrays of
# unique values while keeping the column name as the index label.
ser_u = (
    df_dia[["cut", "color", "clarity"]]
    .apply(pd.Series.unique)
    .explode()
)
ser_u

cut            Ideal
cut          Premium
cut             Good
cut        Very Good
cut             Fair
color              E
color              I
color              J
color              H
color              F
color              G
color              D
clarity          SI2
clarity          SI1
clarity          VS1
clarity          VS2
clarity         VVS2
clarity         VVS1
clarity           I1
clarity           IF
dtype: object

In [70]:
# Every category other than "E" / "J" becomes -1; "E" and "J" are kept unchanged.
np.where(~ser_u.isin(["E", "J"]), -1, ser_u)

array([-1, -1, -1, -1, -1, 'E', -1, 'J', -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1], dtype=object)

In [None]:
# All categories found in the three categorical columns, one per row.
ser_u = (
    df_dia[["cut", "color", "clarity"]]
    .apply(pd.Series.unique)
    .explode()
)
# Replacement table: index = original category, value = itself for "E"/"J", else -1.
ser_repl = pd.Series(
    data=np.where(~ser_u.isin(["E", "J"]), -1, ser_u),
    index=ser_u,
)
ser_repl.to_dict()

In [None]:
# Apply the replacement table to the whole frame as an {old value: new value} mapping.
df_dia2 = df_dia.replace(to_replace=ser_repl.to_dict())

In [73]:
# First two rows after the replacement.
df_dia2.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,-1,E,-1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,-1,E,-1,59.8,61.0,326,3.89,3.84,2.31


In [74]:
# Verify the result: only "E", "J" and -1 should remain in the categorical columns.
df_dia2[["cut", "color", "clarity"]].apply(pd.Series.unique)

cut              [-1]
color      [E, -1, J]
clarity          [-1]
dtype: object