In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
# Load the customer table; the bare last expression previews its first two rows.
df = pd.read_csv(filepath_or_buffer = "galaxy_users.csv")
df.iloc[:2]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [26]:
# Q1 works on an independent copy of the service columns, selected with a
# label slice that runs from OnlineSecurity through StreamingMovies.
service_slice = slice("OnlineSecurity", "StreamingMovies")
df_q1 = df.loc[:, service_slice].copy()
df_q1.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [27]:
# Dry run of the Yes/No -> 1/0 mapping on a two-row preview; df_q1 is not reassigned here.
df_q1.iloc[:2].replace({"Yes": 1, "No": 0})

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,0,0,0


In [28]:
# List the distinct values present in a single service column.
df_q1.loc[:, "OnlineSecurity"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [29]:
# Distinct values of every service column, laid out side by side.
df_q1.apply(lambda col: col.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [30]:
# Inspect the rows whose OnlineSecurity value is "No internet service".
df_q1.loc[df_q1["OnlineSecurity"].eq("No internet service")]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
11,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
16,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
21,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
22,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
33,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
...,...,...,...,...,...,...
7006,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7008,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7009,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7019,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [31]:
# Keep only rows whose OnlineSecurity is not "No internet service".
has_internet = df_q1["OnlineSecurity"].ne("No internet service")
df_q1 = df_q1.loc[has_internet]
df_q1.iloc[:1]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No


In [32]:
# Re-check the distinct values per column after the row filter.
df_q1.apply(lambda col: col.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes


In [33]:
# Encode Yes/No as 1/0. The explicit cast makes the integer dtype part of the
# code instead of relying on `replace` silently downcasting object columns
# (deprecated in recent pandas); it also fails loudly if any other string is
# still present in the frame.
df_q1 = df_q1.replace({"Yes": 1, "No": 0}).astype("int64")
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,1,0,0,0,0
1,1,0,1,0,0,0


In [34]:
# Number of subscribed services per row (sum of the 1/0 indicators).
# "cnt" is excluded from the summed columns so that re-running this cell
# does not fold a previously computed total back into the new one.
service_cols = df_q1.columns.drop("cnt", errors = "ignore")
df_q1["cnt"] = df_q1[service_cols].sum(axis = 1)
df_q1.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,cnt
0,0,1,0,0,0,0,1
1,1,0,1,0,0,0,2


In [35]:
# Frequency table: how many rows have each per-row service count.
df_q1["cnt"].value_counts()

3    1117
2    1033
1     966
4     850
0     693
5     569
6     284
Name: cnt, dtype: int64

In [36]:
# Ratio of rows with exactly one service to rows with all six, rounded to one
# decimal. The counts are looked up by label from the data rather than typed
# in by hand, so the result follows df_q1 if the input changes.
cnt_freq = df_q1["cnt"].value_counts()
round(cnt_freq.loc[1] / cnt_freq.loc[6], 1)

3.4

In [37]:
# Look up, by index label, how many rows have cnt equal to 1.
df_q1["cnt"].value_counts()[1]

966

In [40]:
# One-shot version of Q1 straight from the raw frame: count the "Yes" cells in
# each row with a vectorised comparison (no per-row Python lambda), tabulate
# the counts, then take the ratio of rows with one service to rows with six.
# Any value other than "Yes" simply contributes 0 to a row's count.
a = (df.loc[:, "OnlineSecurity":"StreamingMovies"]
       .eq("Yes")
       .sum(axis = 1)
       .value_counts())
round(a[1] / a[6], 1)

3.4

### Q2.

In [41]:
# Q2 only needs the tenure and charge columns; copy them so df stays untouched.
q2_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
df_q2 = df.loc[:, q2_cols].copy()
df_q2.iloc[:2]

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5


In [43]:
# Reminder of how `//` behaves: the quotient is floored, so both give 2.
(12 // 5, 13 // 5)

(2, 2)

In [45]:
# Estimated number of billed months: total charges floor-divided by the monthly charge.
df_q2["month"] = df_q2["TotalCharges"].floordiv(df_q2["MonthlyCharges"])
df_q2.iloc[:2]

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [47]:
# Pairwise correlations of the remaining columns (TotalCharges left out), to 3 decimals.
df_q2.drop("TotalCharges", axis = 1).corr().round(decimals = 3)

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


### Q3.

In [49]:
# Feature columns for Q3, kept as two lists that are concatenated when df_q3 is built.
col1 = [
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]
col2 = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingMovies',
    'PaperlessBilling',
]

In [51]:
# Modelling frame for Q3: the Churn target first, followed by both feature lists.
q3_cols = ["Churn", *col1, *col2]
df_q3 = df.loc[:, q3_cols].copy()
df_q3.iloc[:1]

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,No,0,Yes,No,1,29.85,29.85,No,Yes,No,No,No,Yes


In [52]:
# Column dtypes before any encoding is applied.
df_q3.dtypes

Churn                object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingMovies      object
PaperlessBilling     object
dtype: object

In [53]:
# Encode Yes/No as 1/0 across the whole frame. `infer_objects()` then converts
# every object column that now holds only integers to a numeric dtype
# explicitly, instead of depending on `replace` downcasting object columns
# implicitly (deprecated in recent pandas). Columns that still contain other
# strings stay object, which the later object-dtype check relies on.
df_q3 = df_q3.replace({"Yes": 1, "No": 0}).infer_objects()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [54]:
# Re-check dtypes after the Yes/No mapping; columns still reported as object need further handling.
df_q3.dtypes

Churn                 int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingMovies      object
PaperlessBilling      int64
dtype: object

In [55]:
# Isolate the columns that are still object-typed. `select_dtypes` asks pandas
# for the dtype directly rather than substring-matching the dtype names.
df_q3_obj = df_q3.select_dtypes(include = "object")
df_q3_obj.head(2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [56]:
# Distinct values left in each object column, to see what still needs encoding.
df_q3_obj.apply(lambda col: col.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [57]:
# Encode the remaining "No internet service" label as -1. `infer_objects()`
# makes the conversion of the affected object columns to a numeric dtype
# explicit, rather than relying on `replace` downcasting them implicitly
# (deprecated in recent pandas), so the frame is numeric before scaling.
df_q3 = df_q3.replace({"No internet service": -1}).infer_objects()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [58]:
# Final dtype check before the train/test split and scaling.
df_q3.dtypes

Churn                 int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingMovies       int64
PaperlessBilling      int64
dtype: object

In [59]:
# 70/30 split with a fixed seed; the trailing tuple shows the two partition sizes.
df_train, df_test = train_test_split(df_q3, train_size = 0.7, random_state = 123)
(df_train.shape[0], df_test.shape[0])

(4922, 2110)

In [60]:
# Fit the min-max scaler on the training partition only, then apply the same
# fitted scaler to both partitions. df_train still contains the Churn column,
# so it passes through the scaler together with the features.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (model_nor.transform(part) for part in (df_train, df_test))

In [61]:
# First two scaled training rows; the column order follows df_q3, so column 0 is Churn.
arr_train_nor[:2, ]

array([[1.        , 0.        , 0.        , 0.        , 0.08450704,
        0.81116094, 0.07551927, 0.5       , 1.        , 0.5       ,
        0.5       , 1.        , 1.        ],
       [1.        , 0.        , 1.        , 0.        , 0.        ,
        0.60737419, 0.00698708, 0.5       , 0.5       , 0.5       ,
        0.5       , 0.5       , 1.        ]])

In [64]:
# Column 0 of the scaled arrays is the Churn target; the remaining columns are features.
X_train, y_train = arr_train_nor[:, 1:], arr_train_nor[:, 0]
model_lr = LogisticRegression(random_state = 123).fit(X_train, y_train)
pred = model_lr.predict(arr_test_nor[:, 1:])

In [65]:
# First three predicted labels for the test partition.
pred[:3]

array([0., 0., 0.])

In [66]:
# F1 score of the predictions against the test-partition target (column 0), to 2 decimals.
churn_f1 = f1_score(arr_test_nor[:, 0], pred)
round(churn_f1, 2)

0.55

### Q. 기타 원소가 굉장히 많을 경우 일괄 변환 방법?

In [67]:
# Switch datasets: from here on `df` refers to the diamonds table.
df = pd.read_csv(filepath_or_buffer = "../diamonds.csv")
df.iloc[:2]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [68]:
# Keep just the three categorical columns used in this example.
cat_cols = ["cut", "color", "clarity"]
df = df.loc[:, cat_cols].copy()
df.iloc[:2]

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1


In [75]:
# df.apply(lambda x: x.unique())  # raises an error here, though it reportedly works on the latest version
# Workaround: wrap each column's unique values in a list, then explode twice
# to get one long Series with one entry per (column name, unique value).
uniques_per_col = df.apply(lambda col: [col.unique()])
ser_u = uniques_per_col.explode().explode()
ser_u

cut            Ideal
cut          Premium
cut             Good
cut        Very Good
cut             Fair
color              E
color              I
color              J
color              H
color              F
color              G
color              D
clarity          SI2
clarity          SI1
clarity          VS1
clarity          VS2
clarity         VVS2
clarity         VVS1
clarity           I1
clarity           IF
dtype: object

In [76]:
# Preview the replacement values: "J" and "H" are kept, every other value becomes -1.
keep_as_is = ser_u.isin(["J", "H"])
np.where(keep_as_is, ser_u, -1)

array([-1, -1, -1, -1, -1, -1, -1, 'J', 'H', -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1], dtype=object)

In [78]:
# Build a lookup Series: the index is the original value, the entry is its replacement.
repl_values = np.where(ser_u.isin(["J", "H"]), ser_u, -1)
ser_repl = pd.Series(data = repl_values, index = ser_u)
ser_repl.head()

Ideal        -1
Premium      -1
Good         -1
Very Good    -1
Fair         -1
dtype: object

In [79]:
# Show the same lookup as a plain dict: original value -> replacement.
ser_repl.to_dict()

{'Ideal': -1,
 'Premium': -1,
 'Good': -1,
 'Very Good': -1,
 'Fair': -1,
 'E': -1,
 'I': -1,
 'J': 'J',
 'H': 'H',
 'F': -1,
 'G': -1,
 'D': -1,
 'SI2': -1,
 'SI1': -1,
 'VS1': -1,
 'VS2': -1,
 'VVS2': -1,
 'VVS1': -1,
 'I1': -1,
 'IF': -1}

In [80]:
# Apply the whole lookup in one call; the Series index supplies the values to replace.
df_repl = df.replace(to_replace = ser_repl)
df_repl.head(5)

Unnamed: 0,cut,color,clarity
0,-1,-1,-1
1,-1,-1,-1
2,-1,-1,-1
3,-1,-1,-1
4,-1,J,-1


In [81]:
# Check which values remain in each column after the bulk replacement.
df_repl.apply(lambda col: [col.unique()])

cut              [[-1]]
color      [[-1, J, H]]
clarity          [[-1]]
dtype: object

### Q. 제거하고자 하는 원소가 여러 변수에 흩어져 있는 경우 해당 원소가 있는 행을 효과적으로 제거하는 방법?

In [86]:
# Reload the customer table (df was reused for another dataset above) and
# take a fresh copy of the service columns.
df = pd.read_csv(filepath_or_buffer = "galaxy_users.csv")
service_slice = slice("OnlineSecurity", "StreamingMovies")
df_q1 = df.loc[:, service_slice].copy()
df_q1.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [87]:
# Approach 1: walk the columns and accumulate one boolean mask that is True
# only for rows with no "Yes" in any column, then filter the frame once.
no_yes = pd.Series(True, index = df_q1.index)
for col in df_q1.columns:
    no_yes &= df_q1[col] != "Yes"
df_q1 = df_q1.loc[no_yes]

In [90]:
# Print the remaining row count, then preview the surviving rows.
print(df_q1.shape[0])
df_q1.iloc[:2]

2213


Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
4,No,No,No,No,No,No
11,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [91]:
# Start again from an unfiltered copy of the service columns for the second approach.
service_slice = slice("OnlineSecurity", "StreamingMovies")
df_q1 = df.loc[:, service_slice].copy()
df_q1.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [92]:
# Approach 2, step 1: turn the unwanted value into a missing value in every column.
df_q1 = df_q1.replace("Yes", np.nan)
df_q1.iloc[:2]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,,No,No,No,No
1,,No,,No,No,No


In [93]:
# Approach 2, step 2: drop every row that has at least one missing value, then report the row count.
df_q1 = df_q1.dropna(axis = 0, how = "any")
df_q1.shape[0]

2213