In [366]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [367]:
df = pd.read_csv("../datasets/weight_height.csv",
                 encoding="euc-kr",
                 usecols=["학교명", "학년", "성별", "키", "몸무게"])
                 
df

Unnamed: 0,학교명,학년,성별,키,몸무게
0,서울대도초등학교,1,남,125.8,27.3
1,서울대도초등학교,1,남,124.3,25.4
2,서울대도초등학교,1,남,119.2,23.5
3,서울대도초등학교,1,남,115.0,20.0
4,서울대도초등학교,1,남,120.0,33.5
...,...,...,...,...,...
9681,세종과학고등학교,3,남,176.1,50.4
9682,세종과학고등학교,3,남,174.1,88.8
9683,세종과학고등학교,3,남,169.5,63.2
9684,세종과학고등학교,3,여,159.2,52.9


In [368]:
# df["학교명"] -> 초등학교=0, 중학교=6, 고등학교=9 + df["학년"]
df["grade"] = df["학교명"].map(lambda x : 0 if x.endswith("초등학교")
                            else 6 if x.endswith("중학교")
                            else 9) + df["학년"]
df

Unnamed: 0,학교명,학년,성별,키,몸무게,grade
0,서울대도초등학교,1,남,125.8,27.3,1
1,서울대도초등학교,1,남,124.3,25.4,1
2,서울대도초등학교,1,남,119.2,23.5,1
3,서울대도초등학교,1,남,115.0,20.0,1
4,서울대도초등학교,1,남,120.0,33.5,1
...,...,...,...,...,...,...
9681,세종과학고등학교,3,남,176.1,50.4,12
9682,세종과학고등학교,3,남,174.1,88.8,12
9683,세종과학고등학교,3,남,169.5,63.2,12
9684,세종과학고등학교,3,여,159.2,52.9,12


In [369]:
df.drop(["학교명", "학년"], axis="columns", inplace=True)
df.columns = ["gender", "height", "weight", "grade"]
df

Unnamed: 0,gender,height,weight,grade
0,남,125.8,27.3,1
1,남,124.3,25.4,1
2,남,119.2,23.5,1
3,남,115.0,20.0,1
4,남,120.0,33.5,1
...,...,...,...,...
9681,남,176.1,50.4,12
9682,남,174.1,88.8,12
9683,남,169.5,63.2,12
9684,여,159.2,52.9,12


In [370]:
# 남 : 0 / 여 : 1
df["gender"] = df["gender"].map(lambda x : 0 if x == "남" else 1)
df

Unnamed: 0,gender,height,weight,grade
0,0,125.8,27.3,1
1,0,124.3,25.4,1
2,0,119.2,23.5,1
3,0,115.0,20.0,1
4,0,120.0,33.5,1
...,...,...,...,...
9681,0,176.1,50.4,12
9682,0,174.1,88.8,12
9683,0,169.5,63.2,12
9684,1,159.2,52.9,12


In [371]:
# 몸무게와 키를 통해서 성별 예측하고 싶다


x = df[["weight", "height"]]
y = df[["gender"]]

In [372]:
# 몸무게 키의 척도를 통일하자 (스케일링) -> 후에 다시 모델 만들고 평가
# 완성한 06 파일에서 단위가 다르니 scaler를 하고 그다음에 다시 학습시켜서 다시 평가

In [373]:
# MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

In [374]:
x

Unnamed: 0,weight,height
0,27.3,125.8
1,25.4,124.3
2,23.5,119.2
3,20.0,115.0
4,33.5,120.0
...,...,...
9681,50.4,176.1
9682,88.8,174.1
9683,63.2,169.5
9684,52.9,159.2


In [375]:
# # 결측값 제거
x = x.dropna()
y = y.loc[x.index]

In [376]:
x[["weight"]]

Unnamed: 0,weight
0,27.3
1,25.4
2,23.5
3,20.0
4,33.5
...,...
9681,50.4
9682,88.8
9683,63.2
9684,52.9


In [377]:
x[["height"]]

Unnamed: 0,height
0,125.8
1,124.3
2,119.2
3,115.0
4,120.0
...,...
9681,176.1
9682,174.1
9683,169.5
9684,159.2


In [378]:
y

Unnamed: 0,gender
0,0
1,0
2,0
3,0
4,0
...,...
9681,0
9682,0
9683,0
9684,1


In [390]:
x["weight"].max()

np.float64(130.7)

In [391]:
x["weight"].min()

np.float64(15.9)

In [392]:
x["height"].max()

np.float64(194.2)

In [393]:
x["height"].min()

np.float64(107.7)

In [394]:
scaler = MinMaxScaler()

In [400]:
# fit과 transform 한번에 수행
# transform : 학습된 데이터를 기준삼아 데이터들을 0~1사이 값으로 변환함
x_scaled = scaler.fit_transform(x)
x_scaled

array([[0.09930314, 0.20924855],
       [0.08275261, 0.19190751],
       [0.06620209, 0.13294798],
       ...,
       [0.41202091, 0.71445087],
       [0.32229965, 0.59537572],
       [0.45121951, 0.62774566]], shape=(9682, 2))

In [401]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=1)

In [402]:
model = LogisticRegression()

In [403]:
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [404]:
model.score(x_test, y_test)

0.5332507228418009

In [405]:
model.predict([[160,50]])

array([0])

In [413]:
model.intercept_

array([0.94265096])

In [414]:
model.coef_

array([[-1.19653832, -1.14162123]])

In [416]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [417]:
x_scaled = scaler.fit_transform(x)
x_scaled

array([[-1.34158104, -1.69498036],
       [-1.45285436, -1.78336852],
       [-1.56412769, -2.08388827],
       ...,
       [ 0.76089924,  0.88006137],
       [ 0.15768067,  0.27312933],
       [ 1.02444134,  0.43812057]], shape=(9682, 2))

In [418]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, random_state=1)

In [419]:
model = LogisticRegression()

In [420]:
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [422]:
model.score(x_test, y_test)

0.5353159851301115

In [423]:
model.predict([[160,50]])

array([0])

In [424]:
model.intercept_

array([-0.03390597])

In [425]:
model.coef_

array([[-0.18703848, -0.2172212 ]])