In [78]:
# Logistic Regression by Sklearn

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the admissions data set; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/admission.csv')

# Exploratory checks, kept commented out for reference.
# display(df.head())
# df.info()
# print(df.isnull().sum()) # author's note: no missing values

# Side-by-side box plots of the four columns (admit, gre, gpa, rank),
# used to look for outliers before the z-score filtering that follows.
# figure = plt.figure()
# ax1 = figure.add_subplot(1,4,1)
# ax2 = figure.add_subplot(1,4,2)
# ax3 = figure.add_subplot(1,4,3)
# ax4 = figure.add_subplot(1,4,4)

# ax1.set_title('ADMIT')
# ax2.set_title('GRE')
# ax3.set_title('GPA')
# ax4.set_title('RANK')

# ax1.boxplot(df['admit'])
# ax2.boxplot(df['gre'])
# ax3.boxplot(df['gpa'])
# ax4.boxplot(df['rank'])

# figure.tight_layout()
# plt.show() # author's note: df['gre'] and df['gpa'] contain outliers

# Z-score outlier filter: visit the columns one at a time and drop every row
# whose absolute z-score in that column exceeds the threshold. The z-scores of
# each column are computed on the rows that survived the previous columns.
zscore_threshold = 2.0
for col in df.columns:
    is_outlier = np.abs(stats.zscore(df[col])) > zscore_threshold
    df = df.loc[~is_outlier]  # ~ negates the mask, keeping the non-outliers

# Features: every column except the 'admit' label (these are the values that get scaled).
x_data = df.drop('admit', axis=1)
# Labels as an (n, 1) column; the author's note says they contain only 0 and 1.
t_data = df[['admit']].values

# Learn each feature's min/max from x_data, then rescale it into the 0..1 range.
scaler = MinMaxScaler().fit(x_data)
norm_x_data = scaler.transform(x_data)
# print(norm_x_data) # norm_x_data and t_data are now ready

# Logistic regression with scikit-learn, fitted on the raw (un-normalised)
# features. The query row below is also given in raw units, so training and
# prediction share the same scale; norm_x_data is not used by this model.
model = linear_model.LogisticRegression()

# fit() expects a 1-D label vector. The (n, 1) column only worked through an
# implicit conversion whose warning is hidden by the global warnings filter,
# so flatten it here (t_data itself keeps its (n, 1) shape). Plain arrays are
# used for both fit and predict so the input types are consistent.
model.fit(x_data.values, t_data.ravel())

# Applicant to score, in the feature order of x_data
# (presumably gre, gpa, rank — confirm against the CSV header).
my_score = np.array([[600, 3.8, 1]])
predict_val = model.predict(my_score)  # hard class label, 0 or 1
# Per-class probabilities, columns ordered as in model.classes_.
# The recorded run printed [[0.43740782 0.56259218]].
predict_proba = model.predict_proba(my_score)

print('Sklearn이 예측한 결과 : 합격여부 : {}, 확률 : {}'.format(predict_val, predict_proba))

Sklearn이 예측한 결과 : 합격여부 : [1], 확률 : [[0.43740782 0.56259218]]


In [79]:
import warnings
warnings.filterwarnings('ignore')

# Logistic regression built by hand with the TensorFlow graph API.

# Graph inputs: a batch of three feature columns and the matching labels.
X = tf.placeholder(dtype=tf.float32, shape=[None, 3])
T = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Trainable weight matrix and bias, randomly initialised.
W = tf.Variable(tf.random.normal([3, 1]))
b = tf.Variable(tf.random.normal([1]))

# Hypothesis: the linear score (logit) squashed through a sigmoid.
logit = tf.matmul(X, W) + b
H = tf.sigmoid(logit)

# Loss: mean sigmoid cross-entropy (log loss). The op takes the raw linear
# output as `logits` and the targets T as `labels`.
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=T, logits=logit)
)

# Training node: one gradient-descent step on the loss.
train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

# Open a session and initialise the variables. The session is left open
# because the prediction cell reuses it.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training loop: every step feeds the whole normalised data set, and the
# current loss is reported every 30000 steps.
feed = {X: norm_x_data, T: t_data}
for step in range(300000):
    _, loss_val = sess.run([train, loss], feed_dict=feed)

    if step % 30000 == 0:
        print('loss의 값 : {}'.format(loss_val))

loss의 값 : 2.3886940479278564
loss의 값 : 0.8489124774932861
loss의 값 : 0.610153079032898
loss의 값 : 0.5910931825637817
loss의 값 : 0.5888001322746277
loss의 값 : 0.5879842042922974
loss의 값 : 0.5873535871505737
loss의 값 : 0.5867820978164673
loss의 값 : 0.5862650275230408
loss의 값 : 0.5857952833175659


In [80]:
# Score one applicant with the TensorFlow model. That model was trained on
# min-max scaled features, so the query row goes through the same fitted
# scaler before it is fed to the graph.
my_score = np.array([[600.0, 3.8, 1.0]])
norm_my_score = scaler.transform(my_score)

# H is the sigmoid output node. The recorded run printed [[0.4715909]],
# which the author's note reads as "not admitted".
result = sess.run(H, feed_dict={X: norm_my_score})
print('Tensorflow가 예측한 : 확률 : {}'.format(result))

Tensorflow가 예측한 : 확률 : [[0.4715909]]


In [71]:
# Regression의 Metrics(Hold-Out Validation, Sklearn)

import numpy as np
import pandas as pd
from sklearn import linear_model
from scipy import stats
from sklearn.model_selection import train_test_split # Training set / Test set 나누기

# Load the ozone data set; the recorded shape was (153, 6).
df = pd.read_csv('./data/ozone.csv')

# Drop every row that has at least one missing value; the recorded shape
# afterwards was (111, 6).
training_data = df.dropna()

# Z-score outlier filter: column by column, drop the rows whose absolute
# z-score exceeds the threshold. Each column's z-scores are computed on the
# rows that survived the previous columns.
zscore_threshold = 2.0
for col in training_data.columns:
    is_outlier = np.abs(stats.zscore(training_data[col])) > zscore_threshold
    training_data = training_data.loc[~is_outlier]

# display(training_data.head())
# Author's note: no separate normalisation here because scikit-learn is used.

# Features and target as NumPy arrays. Selecting the target with a list of
# columns already yields a two-dimensional, single-column array.
x_data = training_data[['Solar.R', 'Wind', 'Temp']].values
t_data = training_data[['Ozone']].values

# Hold out 30% of the rows for validation; random_state acts as the seed so
# the split is the same on every run.
train_x_data, valid_x_data, train_t_data, valid_t_data = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    random_state=2,
)

# Linear regression fitted on the training split only (fit returns the
# estimator itself, so creation and training are chained).
model = linear_model.LinearRegression().fit(train_x_data, train_t_data)

# Predictions for the held-out rows; valid_t_data holds the matching answers.
predict_value = model.predict(valid_x_data)

In [72]:
from sklearn.metrics import mean_absolute_error # MAE

# Mean absolute error of the hold-out predictions against the true values.
# The recorded run printed 13.924465776324642.
mae = mean_absolute_error(valid_t_data, predict_value)
print(mae)

13.924465776324642


In [73]:
from sklearn.metrics import mean_squared_error # MSE

# Mean squared error of the hold-out predictions against the true values.
# The recorded run printed 271.5671192367061.
mse = mean_squared_error(valid_t_data, predict_value)
print(mse)

271.5671192367061


In [74]:
from sklearn.metrics import r2_score # R-Squared

# R-squared of the hold-out predictions against the true values.
# The recorded run printed 0.3734728354920861.
r2 = r2_score(valid_t_data, predict_value)
print(r2)

0.3734728354920861
