In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

#  한글글꼴로 변경
# plt.rcParams['font.family'] = '한글글꼴명'
plt.rcParams['font.size'] = 11.0
# plt.rcParams['font.family'] = 'batang'
plt.rcParams['font.family'] = 'Malgun Gothic'

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
matplotlib.rcParams['axes.unicode_minus'] = False
plt.style.use('ggplot')

# 그래프 기본 크기 설정 
plt.rcParams['figure.figsize'] = [10, 10]

In [2]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge, Lasso

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

# 퀴즈 

1) 아래의 주소를 이용하여 데이타셋을 생성하여라 

'https://bit.ly/fish_csv'

2) 물고기의 종류는 다음과 같다. 

['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt']

도미(Bream) 와 빙어(Smelt) 만 추출하여 새로운 데이타셋 bream_smelt 을 생성하고 
이진분류 모델을 구축하고 테스트하여라 (2진분류) 

3) 1의 데이타셋을 이용하여 다중 분류 모델을 구축하고 테스트하여라.(다중분류) 


In [3]:
df_fish = pd.read_csv('https://bit.ly/fish_csv')
df_fish.sample(5)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
85,Perch,130.0,21.3,22.8,6.384,3.534
88,Perch,130.0,22.0,23.5,6.11,3.525
45,Roach,160.0,22.5,25.3,7.0334,3.8203
106,Perch,250.0,28.0,29.4,7.8204,4.2042
98,Perch,188.0,24.6,26.2,6.7334,4.1658


In [4]:
df_fish['Species'].unique(), df_fish['Species'].value_counts()  

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [5]:
df_fish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Species   159 non-null    object 
 1   Weight    159 non-null    float64
 2   Length    159 non-null    float64
 3   Diagonal  159 non-null    float64
 4   Height    159 non-null    float64
 5   Width     159 non-null    float64
dtypes: float64(5), object(1)
memory usage: 7.6+ KB


In [6]:
# df_fish['Weight'] 0 확인 => 전처리 필요 
df_fish.describe()

Unnamed: 0,Weight,Length,Diagonal,Height,Width
count,159.0,159.0,159.0,159.0,159.0
mean,398.326415,28.415723,31.227044,8.970994,4.417486
std,357.978317,10.716328,11.610246,4.286208,1.685804
min,0.0,8.4,8.8,1.7284,1.0476
25%,120.0,21.0,23.15,5.9448,3.38565
50%,273.0,27.3,29.4,7.786,4.2485
75%,650.0,35.5,39.65,12.3659,5.5845
max,1650.0,63.4,68.0,18.957,8.142


In [7]:
# 무게가 0인 생선은 누구? 
df_fish[df_fish['Weight'] == 0]

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
40,Roach,0.0,20.5,22.8,6.4752,3.3516


In [8]:
df_fish[df_fish['Species'] == 'Roach']

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
35,Roach,40.0,14.1,16.2,4.1472,2.268
36,Roach,69.0,18.2,20.3,5.2983,2.8217
37,Roach,78.0,18.8,21.2,5.5756,2.9044
38,Roach,87.0,19.8,22.2,5.6166,3.1746
39,Roach,120.0,20.0,22.2,6.216,3.5742
40,Roach,0.0,20.5,22.8,6.4752,3.3516
41,Roach,110.0,20.8,23.1,6.1677,3.3957
42,Roach,120.0,21.0,23.7,6.1146,3.2943
43,Roach,150.0,22.0,24.7,5.8045,3.7544
44,Roach,145.0,22.0,24.3,6.6339,3.5478


In [9]:
df_fish[df_fish['Species'] == 'Roach']['Weight'].mean() # Roach 평균 무게

152.05

In [10]:
df_fish[df_fish['Weight'] == 0].index  # 0인 인덱스

Int64Index([40], dtype='int64')

In [11]:
df_fish.iloc[df_fish[df_fish['Weight'] == 0].index, :]['Weight']

40    0.0
Name: Weight, dtype: float64

In [12]:
mean_value = df_fish[df_fish['Species'] == 'Roach']['Weight'].mean()
mean_value

152.05

In [13]:
df_fish.iloc[df_fish[df_fish['Weight'] == 0].index, 1] = mean_value

In [14]:
# 도미(Bream) 와 빙어(Smelt) 만 추출
bream_smelt = df_fish[(df_fish['Species'] == 'Bream') | (df_fish['Species']== 'Smelt')]
bream_smelt.sample(5)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
155,Smelt,13.4,12.4,13.5,2.43,1.269
10,Bream,475.0,31.0,36.2,14.2628,5.1042
20,Bream,575.0,34.0,39.5,15.1285,5.5695
33,Bream,975.0,41.0,45.9,18.6354,6.7473
26,Bream,720.0,35.0,40.6,16.3618,6.09


In [15]:
bream_smelt.shape

(49, 6)

In [16]:
fish_X = bream_smelt.loc[:, 'Weight':]
fish_target = bream_smelt[['Species']]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(fish_X, fish_target, random_state=42)

In [18]:
# 스케일링
scale = StandardScaler()
scale.fit(X_train)
X_train_scaled = scale.transform(X_train)
X_test_scaled = scale.transform(X_test)

In [19]:
# model_lr = LogisticRegression(C=0.001)
model_lr = LogisticRegression(C=1)
model_lr.fit(X_train_scaled, y_train)

print(model_lr.score(X_train_scaled, y_train))
print(model_lr.score(X_test_scaled, y_test))

1.0
1.0


  y = column_or_1d(y, warn=True)


In [20]:
# 성능 평가 지표 
print(classification_report(y_test, model_lr.predict(X_test_scaled)))

              precision    recall  f1-score   support

       Bream       1.00      1.00      1.00        10
       Smelt       1.00      1.00      1.00         3

    accuracy                           1.00        13
   macro avg       1.00      1.00      1.00        13
weighted avg       1.00      1.00      1.00        13



In [21]:
model_lr.predict(X_test_scaled[:5])

array(['Bream', 'Smelt', 'Smelt', 'Smelt', 'Bream'], dtype=object)

In [22]:
fish_X = df_fish.loc[:, 'Weight':]
fish_target = df_fish[['Species']]
X_train, X_test, y_train, y_test = train_test_split(fish_X, fish_target, random_state=42)

scale = StandardScaler()
scale.fit(X_train)
X_train_scaled = scale.transform(X_train)
X_test_scaled = scale.transform(X_test)

# model_lr2 = LogisticRegression(C=100)
model_lr2 = LogisticRegression()
model_lr2.fit(X_train_scaled, y_train)
model_lr2.fit(X_test_scaled, y_test)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression()

In [23]:
print(model_lr2.score(X_train_scaled, y_train))
print(model_lr2.score(X_test_scaled, y_test))

0.7394957983193278
0.8
