In [14]:
# Logistic Regression + KNN - BMI data

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier # 분류기
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Raw Data Loading
df = pd.read_csv('./data/bmi.csv', skiprows=3)

# Data Split
train_x_data, test_x_data, train_t_data, test_t_data = \
train_test_split(df[['height', 'weight']], df['label'], test_size=0.3, random_state=1, stratify=df['label'])

# 정규화
scaler = MinMaxScaler()
scaler.fit(train_x_data)
norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

# Logistic Regression
model = LogisticRegression()
model.fit(norm_train_x_data, train_t_data)

acc = model.score(norm_test_x_data, test_t_data)
print('Logistic Regression의 Accuracy : {}'.format(acc)) # 0.9851666666666666

# KNN으로 구현
knn_classifier = KNeighborsClassifier(n_neighbors=3) # 동심원을 그려서 주변의 값 3개를 가져와라
knn_classifier.fit(norm_train_x_data, train_t_data)
acc = knn_classifier.score(norm_test_x_data, test_t_data)
print('KNN의 Accuracy : {}'.format(acc)) # 0.9985

Logistic Regression의 Accuracy : 0.9851666666666666
KNN의 Accuracy : 0.9985


In [73]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? ㅛ
Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [74]:
# Linear Regression + KNN by Tensorflow 2.x - Ozone data

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.neighbors import KNeighborsRegressor # 연속적인 숫자값을 예측
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Raw Data Loading
df = pd.read_csv('./data/ozone.csv')

x_data = df[['Solar.R', 'Wind', 'Temp']] # 2차원 DataFrame
t_data = df['Ozone']

# 1. 독립변수에 대한 결측치를 찾아서 중위값으로 보간(대체)
for col in x_data.columns:
    col_median = np.nanmedian(x_data[col])
    x_data[col].loc[x_data[col].isnull()] = col_median
    
# 2. 독립변수의 이상치를 찾아 이상치를 제외한 나머지 값들의 평균으로 이상치를 대체
zscore_threshold = 2.0

for col in x_data.columns:
    outlier = x_data[col][np.abs(stats.zscore(x_data[col])) > zscore_threshold]
    col_mean = np.mean(x_data.loc[~x_data[col].isin(outlier),col])
    x_data.loc[x_data[col].isin(outlier), col] = col_mean
    
# 3. 종속변수의 이상치를 찾아 이상치를 제외한 나머지 값들의 평균으로 이상치를 대체
outlier = t_data[np.abs(stats.zscore(t_data)) > zscore_threshold]
col_mean = np.mean(~t_data.isin(outlier))
t_data[t_data.isin(outlier)] = col_mean

# 4. 정규화 진행
scaler_x = MinMaxScaler()
scaler_t = MinMaxScaler()

scaler_x.fit(x_data.values) # scaler는 2차원 ndarray로 사용해야 함
scaler_t.fit(t_data.values.reshape(-1,1))

norm_x_data = scaler_x.transform(x_data.values)
norm_t_data = scaler_t.transform(t_data.values.reshape(-1,1)).ravel()

# 5. 종속변수에 대한 결측치는 KNN을 이용해 impotation(보간)
# 종속변수가 결측치가 아닌 독립&종속변수들을 추출(KNN 학습을 위해)
norm_train_x_data = norm_x_data[~np.isnan(norm_t_data)]
norm_train_t_data = norm_t_data[~np.isnan(norm_t_data)]

# Model 생성
knn_regressor = KNeighborsRegressor(n_neighbors=2)
knn_regressor.fit(norm_train_x_data, norm_train_t_data)

# 종속변수가 결측치인 독립변수들을 입력으로 넣어서 값을 예측
knn_predict = knn_regressor.predict(norm_x_data[np.isnan(norm_t_data)])
norm_t_data[np.isnan(norm_t_data)] = knn_predict

In [76]:
# 최종 데이터는 norm_x_data, norm_t_data
# Sklearn과 Tensorflow 2.x로 구현해보자

from sklearn.linear_model import LinearRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import SGD

test_data = np.array([[330, 15, 80]]) # 태양광, 바람, 온도

# Sklearn 구현
model = LinearRegression()
model.fit(norm_x_data, norm_t_data)
result = model.predict(scaler_x.transform(test_data))
print('sklearn 예측값 : {}'.format(scaler_t.inverse_transform(result.reshape(-1,1)))) # 36.93077619

# Tensorflow 2.x 구현
keras_model = Sequential()

keras_model.add(Flatten(input_shape=(3,))) # input layer. 독립변수 3개 튜플로 입력
keras_model.add(Dense(units=1, activation='linear')) # output layer. class와 logistic의 개수는 1

keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                    loss='mse')

keras_model.fit(norm_x_data, norm_t_data, epochs=5000, verbose=0)

result = keras_model.predict(scaler_x.transform(test_data))
print('Tensorflow 예측값 : {}'.format(scaler_t.inverse_transform(result.reshape(-1,1)))) # 36.760612

sklearn 예측값 : [[36.93077619]]
Tensorflow 예측값 : [[36.760612]]


In [77]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [86]:
# Logistic Regression(Binary Classification) by Sklearn / Tensorflow 2.x - Titanic

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import SGD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# 이상치는 실제 데이터이기 때문에 처리하지 않고 결측치만 처리할 예정

# Raw Data Loading
df = pd.read_csv('./data/titanic/train.csv')

df = df.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1, inplace=False)

df['Family'] = df['SibSp'] + df['Parch']
df = df.drop(['SibSp', 'Parch'], axis=1, inplace=False)

# 결측치 처리
df['Embarked'] = df['Embarked'].fillna('Q')
df['Age'] = df['Age'].fillna(df['Age'].mean())

# 문자로 되어 있는 값을 숫자로 변경
gender_string = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(gender_string)

embarked_string = {'S': 0, 'C': 1, 'Q': 2}
df['Embarked'] = df['Embarked'].map(embarked_string)

def age_category(age): # 
    if ((age >= 0) & (age < 25)):
        return 0
    elif ((age >= 25) & (age < 50)):
        return 1
    else:
        return 2
    
df['Age'] = df['Age'].map(age_category)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,0,0,1
1,1,1,1,1,1,1
2,1,3,1,1,0,0
3,1,1,1,1,0,1
4,0,3,0,1,0,0


In [87]:
# Data Split
train_x_data, test_x_data, train_t_data, test_t_data = \
train_test_split(df.drop('Survived', axis=1, inplace=False), df['Survived'], test_size=0.3, random_state=1, stratify=df['Survived'])

# 정규화
scaler = MinMaxScaler()
scaler.fit(train_x_data)

norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

# Sklearn 구현
model = LogisticRegression()
model.fit(norm_train_x_data, train_t_data)
sklearn_result = model.score(norm_test_x_data, test_t_data)
print('Sklearn 정확도 : {}'.format(sklearn_result)) # 0.7873134328358209

Sklearn 정확도 : 0.7873134328358209


In [88]:
# Tensorflow 2.x 구현
keras_model = Sequential()

keras_model.add(Flatten(input_shape=(5,))) # input layer
keras_model.add(Dense(units=1, activation='sigmoid')) # output layer

keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

keras_model.fit(norm_train_x_data, train_t_data, epochs=1000, verbose=0)

keras_result = keras_model.evaluate(norm_test_x_data, test_t_data)
print('Tensorflow 정확도 : {}'.format(keras_result)) # 0.7910447716712952

Tensorflow 정확도 : [0.4663046896457672, 0.7910447716712952]
