## 팔머 펭귄
> x를 주면 펭귄을 분류하는 문제

* Iter1. 
    - SVC같은 선형 분류 모델 하나 성정
    - 그리드 서치
    - 파이프 라인

In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv("data/penguins_size.csv")
df.info(), df.head(), df.sample(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


(None,
   species     island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
 0  Adelie  Torgersen              39.1             18.7              181.0   
 1  Adelie  Torgersen              39.5             17.4              186.0   
 2  Adelie  Torgersen              40.3             18.0              195.0   
 3  Adelie  Torgersen               NaN              NaN                NaN   
 4  Adelie  Torgersen              36.7             19.3              193.0   
 
    body_mass_g     sex  
 0       3750.0    MALE  
 1       3800.0  FEMALE  
 2       3250.0  FEMALE  
 3          NaN     NaN  
 4       3450.0  FEMALE  ,
        species     island  culmen_length_mm  culmen_depth_mm  \
 45      Adelie      Dream              39.6             18.8   
 149     Adelie      Dream              37.8             18.1   
 179  Chinstrap      Dream              49.5             19.0   
 264     Gentoo     Biscoe              50.5             15.9   
 70      Adelie  Torgersen         

In [3]:
df['species'].unique()
df['island'].unique()
df['sex'].unique()
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [4]:
penguins = df.dropna()
penguins.reset_index(drop=True, inplace=True)

In [5]:
penguins[penguins['sex']=='.']

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
327,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [6]:
penguins.drop([333], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  penguins.drop([333], inplace=True)


In [7]:
penguins['sex'].unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

In [8]:
penguins['species']=penguins['species'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  penguins['species']=penguins['species'].astype('category')


In [9]:
penguins['species_cod'] = penguins['species'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  penguins['species_cod'] = penguins['species'].cat.codes


In [10]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            333 non-null    category
 1   island             333 non-null    object  
 2   culmen_length_mm   333 non-null    float64 
 3   culmen_depth_mm    333 non-null    float64 
 4   flipper_length_mm  333 non-null    float64 
 5   body_mass_g        333 non-null    float64 
 6   sex                333 non-null    object  
 7   species_cod        333 non-null    int8    
dtypes: category(1), float64(4), int8(1), object(2)
memory usage: 16.5+ KB


In [11]:
penguins_feats = ['island', 'culmen_length_mm', 'culmen_depth_mm',
                  'flipper_length_mm', 'body_mass_g', 'sex']
penguins_labs = ['species_cod']
X, y = penguins[penguins_feats].values, penguins[penguins_labs].values

## SVC를 사용한 학습 및 평가

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [13]:
n_feats = [1, 2, 3, 4]
n_tras = Pipeline(steps=[('scaler', StandardScaler())])
c_feats = [0, 5] 
c_tras = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', n_tras, n_feats),
        ('cat', c_tras, c_feats)
    ]
)

In [15]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('svc', SVC(probability=True))
                           ])

In [16]:
SVC_scores = []
sVC_fimp = []
SVC_predictons = 0
FOLDS = 5

In [17]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
for fold, (train_idx, valid_idx) in enumerate(skf.split(penguins[penguins_feats].values,
                                                        penguins[penguins_labs].values)):
    print(f"Fold = {fold}, {train_idx}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val_train, X_val, y_val_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    
    multi_model = pipeline.fit(X_val_train, y_val_train)
    preds_valid = multi_model.predict(X_val)
    acc = accuracy_score(y_val, preds_valid)
    SVC_scores.append(acc)

    # test_preds =  multi_model.predict(X_test)
    # SVC_predictons += test_preds/FOLDS

Fold = 0, [  0   1   4   5   7   8  11  12  13  14  15  16  17  18  19  20  21  23
  24  25  26  27  28  32  33  36  37  38  39  40  41  42  44  45  47  48
  52  53  54  56  57  58  59  61  62  63  64  65  66  67  68  69  70  71
  72  73  75  76  77  78  79  80  81  83  84  85  86  87  88  89  92  93
  96  97  98 100 101 103 104 105 106 107 108 109 110 111 112 113 114 115
 116 117 118 119 121 122 123 124 125 126 127 128 129 131 132 133 134 135
 136 137 138 139 141 142 144 145 146 147 148 149 150 151 152 153 154 155
 156 157 158 159 160 161 162 163 164 165 168 169 171 172 174 175 178 179
 181 182 183 185 186 187 188 190 191 193 194 196 198 199 200 201 202 203
 204 205 206 208 209 210 211 212 213 215 216 217 218 220 221 223 225 226
 227 228 229 230 231 232 233 234 237 238 239 241 242 243 244 246 247 249
 250 251 252 253 254 255 256 257 259 260 261 262 263 264 265 266 267 268
 269 271 272 273 276 277 278 279 280 281 283 284 286 287 288 289 290 291
 292 293 294 295 296 297 298 299 300 303 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
np.mean(SVC_scores)

0.9850746268656716

In [20]:
confusion_matrix(y_val, preds_valid)

array([[32,  0,  0],
       [ 1, 13,  0],
       [ 0,  0, 21]], dtype=int64)