In [1]:
import numpy as np
import pandas as pd
import json
# Important: the filter below silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / scikit-learn
# calls in later cells -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
import seaborn as sns

scalar(value) > vector(array) > matrix(2차원, 2 dimensions)

In [2]:
# ! pip install scikit-learn

<img src="https://scikit-learn.org/stable/_static/ml_map.png">

# 분석 프로세스

<pre>
<!-- <font color=red><b> -->
1. Data Load
2. EDA (Exploratory Data Analysis) 탐색적 데이터 분석
 - 통계적 데이터 분포 확인
 - 차트 시각화
3. Feature Engineering : 전처리(preprocessing), 가공(engineering)
 - 타입변환(날짜, 카테고리변환(a,b,c -> 1,2,3))
 - 결측처리(버리기:dropna, 채우기:fillna, 모델활용채우기)
 - 바이닝(범주화 : cut, qcut)
 - 인코딩(라벨인코딩,원핫인코딩,pd.get_dummies())
 - 정규화(Scaling: MinMaxScaler, StandardScaler, RobustScaler, LogScaler)
 - 이상치(Outlier)
4. 모델 선정(Model Selection:pycaret)
 - 회귀(Regression)
 - 분류(Classification)
 - 군집(Clustering, PCA)
5. 모델 학습 및 예측(train_test_split & fit and Predict)
6. 모델 검증 및 평가 (Validation & Evaluation metrics)
7. 하이퍼파라미터 튜닝(Hyper-parameter(모델의 속성값) optimization)
8. 모델 저장 및 배포(Model Save & Deployment)

# 결측
<pre>
* 결측처리(None,np.nan)
 - 삭제 
 - 보간(대체) : 모델, 통계적 수치

In [3]:
# Toy frame for the missing-value demos: `None` and `np.nan` are real missing
# values, whereas the string "NaN" is ordinary text.
df = pd.DataFrame(
    data={
        "name": [None, np.nan, "NaN", "allen", "king"],
        "score": ["A", np.nan, np.nan, "D", "C"],
    }
)
df.head()

Unnamed: 0,name,score
0,,A
1,,
2,,
3,allen,D
4,king,C


In [4]:
# Count the missing values per column once, then keep only the names of the
# columns that contain at least one missing value.
# (Author's note: print() on this result shows only the values.)
na_counts = df.isna().sum()
na_counts[na_counts > 0].index.values

array(['name', 'score'], dtype=object)

<pre>
df.isna(), df.notna()
df.isnull(), df.notnull()

In [5]:
# Keep only the rows whose `score` is present.
df.loc[df['score'].notna()]

Unnamed: 0,name,score
0,,A
3,allen,D
4,king,C


## 결측삭제
<pre>
DataFrame.dropna(*, axis=0, how=_NoDefault.no_default, thresh=_NoDefault.no_default, subset=None, inplace=False)[source]
axis{0 or ‘index’, 1 or ‘columns’}, default 0
how{‘any’, ‘all’}, default ‘any’


In [6]:
# Drop a row only when every value in that row is missing.
df.dropna(axis="index", how="all")

Unnamed: 0,name,score
0,,A
2,,
3,allen,D
4,king,C


In [7]:
# Drop a row as soon as at least one value in that row is missing.
df.dropna(axis="index", how="any")

Unnamed: 0,name,score
3,allen,D
4,king,C


## 결측보간
<pre>
DataFrame.fillna(value=None, *, method=None, axis=None, inplace=False, limit=None, downcast=None)[source]
method{‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None

In [8]:
# Replace every missing cell in the frame with the string '9999'.
df.fillna(value='9999')

Unnamed: 0,name,score
0,9999,A
1,9999,9999
2,,9999
3,allen,D
4,king,C


In [9]:
# Replace missing scores with the grade 'F'.
df['score'].fillna(value='F')

0    A
1    F
2    F
3    D
4    C
Name: score, dtype: object

* 앞/뒤 값 이용해서 보간

In [10]:
# Back-fill: replace each missing score with the next valid value after it.
# `Series.bfill()` is used because the `method=` argument of `fillna` is
# deprecated in recent pandas releases.
df['score'].bfill()

0    A
1    D
2    D
3    D
4    C
Name: score, dtype: object

In [11]:
# Forward-fill: replace each missing score with the last valid value before it.
# `Series.ffill()` is used because the `method=` argument of `fillna` is
# deprecated in recent pandas releases.
df['score'].ffill()

0    A
1    A
2    A
3    D
4    C
Name: score, dtype: object

## 결측대체
<pre>
* 사용자 지정값
* 통계적 수치
    평균
    최빈도
    중앙
* 모델 예측값

In [12]:
# Toy frame for the imputation demos: one missing name and one missing salary.
df = pd.DataFrame(
    dict(
        name=["smith", np.nan, "jones", "allen", "king"],
        score=["A", "B", "B", "A", "A"],
        sal=[1000, 1000, 3000, np.nan, 4000],
    )
)
df

Unnamed: 0,name,score,sal
0,smith,A,1000.0
1,,B,1000.0
2,jones,B,3000.0
3,allen,A,
4,king,A,4000.0


### 사용자 지정값

In [13]:
# Per-column replacement values: each key is a column name, each value is what
# that column's missing cells are filled with.
dic = dict(name="AAA", score="F", sal=999)
df.fillna(value=dic)

Unnamed: 0,name,score,sal
0,smith,A,1000.0
1,AAA,B,1000.0
2,jones,B,3000.0
3,allen,A,999.0
4,king,A,4000.0


### 통계적 수치 <pre>
 - 평균 : 결측 제외 합 / 결측 제외 값 갯수
 - 중앙값(중간값) : 홀수개(정중앙값), 짝수개(가운데 값 두개 평균) > <b>아웃라이어 영향 X</b>
 - 최빈도값 : 가장 많이 출현한 값

In [14]:
# mean() and median() each return a scalar, while mode() returns a Series.
sal = df['sal']
sal.mean(), sal.median(), sal.mode()

(2250.0,
 2000.0,
 0    1000.0
 Name: sal, dtype: float64)

In [15]:
# mode() is the only one of the three statistics that returns a Series,
# so take its first element to get a plain value.
df['sal'].mode().iloc[0]

1000.0

In [16]:
# Mean and median of a small sample without an extreme value.
sample = [10, 10, 30, 40]
np.mean(sample), np.median(sample)

(22.5, 20.0)

In [17]:
# Same sample with the largest value replaced by 100: the median is unchanged
# by the outlier, while the mean shifts.
sample_with_outlier = [10, 10, 30, 100]
np.mean(sample_with_outlier), np.median(sample_with_outlier)

(37.5, 20.0)

In [18]:
# Fill missing salaries with the overall mean salary (missing values are
# excluded when the mean is computed).
sal_mean = df['sal'].mean()
df['sal'].fillna(sal_mean)

0    1000.0
1    1000.0
2    3000.0
3    2250.0
4    4000.0
Name: sal, dtype: float64

In [19]:
# Fill each missing salary with the mean salary of that row's own `score`
# group. Every row's score is mapped to its group mean, so no single group key
# is hard-coded and rows from different groups get different fill values.
group_means = df.groupby('score')['sal'].mean()
df['sal'].fillna(df['score'].map(group_means))

0    1000.0
1    1000.0
2    3000.0
3    2500.0
4    4000.0
Name: sal, dtype: float64

In [20]:
# Mean salary per score group, as a plain {group: mean} dictionary.
df.groupby(by='score')['sal'].agg('mean').to_dict()

{'A': 2500.0, 'B': 2000.0}

* 그룹별로 평균내서 결측채우기

In [21]:
def _fill_with_group_mean(group):
    """Return the group's values with missing entries replaced by the group's mean."""
    return group.fillna(group.mean())


df.groupby('score')['sal'].transform(_fill_with_group_mean)

0    1000.0
1    1000.0
2    3000.0
3    2500.0
4    4000.0
Name: sal, dtype: float64

In [22]:
# ★ transform("mean") returns one value per row (the mean of that row's score
# group), so it can be passed straight to fillna.
group_mean_per_row = df.groupby('score')['sal'].transform("mean")
df['sal'].fillna(group_mean_per_row)

0    1000.0
1    1000.0
2    3000.0
3    2500.0
4    4000.0
Name: sal, dtype: float64

### 모델 예측 값
* bike_demand의 windspeed 0 채우기 참조

# 타입

In [23]:
# `sdate` is a plain date string repeated on every row; `regdate` is a real
# daily datetime range.
reg_dates = pd.date_range(start="2022-01-01", end="2022-01-05", freq="D")
df = pd.DataFrame({"sdate": "1999-01-01", "regdate": reg_dates})
df.head()

Unnamed: 0,sdate,regdate
0,1999-01-01,2022-01-01
1,1999-01-01,2022-01-02
2,1999-01-01,2022-01-03
3,1999-01-01,2022-01-04
4,1999-01-01,2022-01-05


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   sdate    5 non-null      object        
 1   regdate  5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 208.0+ bytes


In [25]:
# New column name -> `.dt` accessor attribute it is extracted from.
# Insertion order matches the order in which the columns are added.
dt_parts = {
    'y': 'year',
    'm': 'month',
    'd': 'day',
    'h': 'hour',
    'min': 'minute',
    'sec': 'second',
    'w': 'dayofweek',
}
for col_name, dt_attr in dt_parts.items():
    df[col_name] = getattr(df['regdate'].dt, dt_attr)

df.head()

Unnamed: 0,sdate,regdate,y,m,d,h,min,sec,w
0,1999-01-01,2022-01-01,2022,1,1,0,0,0,5
1,1999-01-01,2022-01-02,2022,1,2,0,0,0,6
2,1999-01-01,2022-01-03,2022,1,3,0,0,0,0
3,1999-01-01,2022-01-04,2022,1,4,0,0,0,1
4,1999-01-01,2022-01-05,2022,1,5,0,0,0,2


In [26]:
# Option 1: convert the string column to datetime with astype.
df['sdate2'] = df['sdate'].astype(dtype="datetime64[ns]")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   sdate    5 non-null      object        
 1   regdate  5 non-null      datetime64[ns]
 2   y        5 non-null      int64         
 3   m        5 non-null      int64         
 4   d        5 non-null      int64         
 5   h        5 non-null      int64         
 6   min      5 non-null      int64         
 7   sec      5 non-null      int64         
 8   w        5 non-null      int64         
 9   sdate2   5 non-null      datetime64[ns]
dtypes: datetime64[ns](2), int64(7), object(1)
memory usage: 528.0+ bytes


In [27]:
# Option 2: convert the string column to datetime with pd.to_datetime.
parsed_sdate = pd.to_datetime(df['sdate'])
df['sdate3'] = parsed_sdate
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   sdate    5 non-null      object        
 1   regdate  5 non-null      datetime64[ns]
 2   y        5 non-null      int64         
 3   m        5 non-null      int64         
 4   d        5 non-null      int64         
 5   h        5 non-null      int64         
 6   min      5 non-null      int64         
 7   sec      5 non-null      int64         
 8   w        5 non-null      int64         
 9   sdate2   5 non-null      datetime64[ns]
 10  sdate3   5 non-null      datetime64[ns]
dtypes: datetime64[ns](3), int64(7), object(1)
memory usage: 568.0+ bytes


# 인코딩

In [28]:
from sklearn.preprocessing import LabelEncoder

In [32]:
# Rebuild the toy frame for the encoding demos (None / np.nan are missing,
# the string "NaN" is a regular value).
df = pd.DataFrame(
    dict(
        name=[None, np.nan, "NaN", "allen", "king"],
        score=["A", np.nan, np.nan, "D", "C"],
    )
)
df.head()

Unnamed: 0,name,score
0,,A
1,,
2,,
3,allen,D
4,king,C


In [34]:
# Learn the label -> integer mapping from `name` and apply it in one step.
le = LabelEncoder()
df['le_name0'] = le.fit_transform(df['name'])

In [35]:
df
# The missing entries (None, np.nan) received the highest codes (3 and 4);
# the string "NaN" is an ordinary label and was encoded as 0.

Unnamed: 0,name,score,le_name0
0,,A,3
1,,,4
2,,,0
3,allen,D,1
4,king,C,2


In [36]:
from sklearn.preprocessing import OneHotEncoder

In [37]:
df['name'].shape
# (5,)   -> 1-D: five items along a single axis
# (5, 1) -> 2-D: five rows and one column

(5,)

In [38]:
# Reshape the 1-D score values into a single-column 2-D array.
np.reshape(df['score'].values, (-1, 1))

array([['A'],
       [nan],
       [nan],
       ['D'],
       ['C']], dtype=object)

In [39]:
# A default OneHotEncoder() returns a sparse matrix (shown as "(row, col) value"
# entries); request a dense array so each row prints as a 0/1 vector.
# The keyword is `sparse_output` in current scikit-learn; older releases only
# accept `sparse`, so fall back to it when the new name is rejected.
try:
    oe = OneHotEncoder(sparse_output=False)
except TypeError:
    oe = OneHotEncoder(sparse=False)

# The encoder expects 2-D input, hence the single-column reshape.
score_2d = df['score'].values.reshape(-1, 1)
res = oe.fit_transform(score_2d)
print(res)

[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]


In [40]:
# Wrap the one-hot array in a DataFrame (columns get default integer labels).
resdf = pd.DataFrame(data=res)
resdf

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0


In [41]:
# Put the one-hot columns side by side with the original frame.
df1 = pd.concat(objs=[df, resdf], axis="columns")
df1

Unnamed: 0,name,score,le_name0,0,1,2,3
0,,A,3,1.0,0.0,0.0,0.0
1,,,4,0.0,0.0,0.0,1.0
2,,,0,0.0,0.0,0.0,1.0
3,allen,D,1,0.0,0.0,1.0,0.0
4,king,C,2,0.0,1.0,0.0,0.0


pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [None]:
# When only a single column (a Series) is passed to get_dummies, appending the
# dummy columns and dropping the source column has to be done by hand.
df2 = pd.get_dummies(df['name'])
df = pd.concat([df.drop(columns='name'), df2], axis="columns")
df

In [None]:
# ★★★ Passing the whole DataFrame is the convenient form.
# By default missing values get no dummy column of their own;
# pass dummy_na=True to add one.
df2 = pd.get_dummies(data=df)
df2

# 바이닝

# 정규화


ref : http://piramvill2.org/?p=3748

<img src = "https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F276E5D47566A1CA329">
*분산 : 편차(평균으로부터 떨어진 정도)를 제곱한 값의 평균

<img src= "https://velog.velcdn.com/images%2Fgooook%2Fpost%2F38b370c1-015d-40ba-81f1-920781ade8c9%2FRegression-terminologies-Page-3.png">

<img src ="https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2F0odPk%2FbtqQjnMlwKM%2FhXlWLCPcyyZLVWUJjruEe0%2Fimg.png" >

<img src = "https://qph.cf2.quoracdn.net/main-qimg-47e35df47f8539c6460ca450daa3f0da-lq">


* 설명 불가능한 에러 : <b>잔차</b>제곱합
    * 잔차 : 실제값 - 예측값
    * $SSR = \sum_{i=1}^n ( Y_i - \hat{Y_i} )^2 $
    * SSR(<b>Residual</b>    sum of squares)
    * SSE(Error         sum of squares)
    <br><br>
* 설명가능한 에러
    * $SSE = \sum_{i=1}^n ( \hat{Y_i} - \bar{Y})^2 $
    * SSE(<b>Explained</b>   sum of squares)
    * SSR(Regression sum of Squares)
<br><br>
* SST(total sum of squares)  : <b>편차</b>제곱합  
    * $\sum_{i=1}^n ( Y_i - \bar{Y})^2 $ 
<br><br>
* R-squared (R제곱;결정계수) : 총 에러 중 설명가능한 에러의 비율 / 통계만 해당(머신러닝X)
    * $R^2 = SSE / SST $
    * $R^2 = 1 - (SSR / SST) $

### 스케일링(Scaling)
<pre>
* 값의 변화가 확확 커져서 모델이 주요 피쳐로 인식할 수도 > 이를 방지하기 위해서 스케일링
* 회귀모델에서 유용 / 트리 기반 분류 모델에서는 효과가 거의 없음 (단, KNN·SVM·로지스틱 회귀처럼 거리나 경사하강법을 쓰는 분류 모델에는 스케일링이 필요)


* ★범주형(A,B,C ->1,2,3 )은 스케일링 하지 말아라!!!!!!!  연속형만 해라
* 데이터의 값이 너무 크거나 작아 변수의 영향이 제대로 반영되지 않을 경우, 알고리즘의 계산 과정에서 0으로 수렴하거나 값이 너무 커져버리는 경우 값의 수치를 변환해 사용
* StandardScaler() : 기본 스케일. 평균과 표준편차 사용
* MinMaxScaler() : 최대/최소값이 각각 1, 0이 되도록 스케일링
* MaxAbsScaler() : 최대절대값과 0이 각각 1, 0이 되도록 스케일링
* RobustScaler() : 중앙값(median)과 IQR(interquartile range) 사용. 아웃라이어의 영향을 최소화
* https://wooono.tistory.com/96

In [45]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

### StandardScaler
* 정규화 == 표준화 == 표준정규분포
    * 데이터를 통계적으로 표준정규분포화  (평균= 0, 표준편차= 1)
    * 표준화 (Z-score) = 편차/표준편차
    * 모수 : 평균, 분산

In [43]:
# Toy frame for the scaling demos: a letter grade and its integer encoding.
df = pd.DataFrame(
    dict(
        score=["A", "B", "B", "A", "C"],
        le_score=[0, 1, 1, 0, 2],
    )
)
df

Unnamed: 0,score,le_score
0,A,0
1,B,1
2,B,1
3,A,0
4,C,2


In [None]:
df.describe()
# std: standard deviation (sample standard deviation, ddof=1)

In [None]:
# Mean and sample standard deviation (ddof=1) of the encoded scores.
encoded_scores = [0, 1, 1, 0, 2]
np.mean(encoded_scores), np.std(encoded_scores, ddof=1)

ddof = delta degrees of freedom (자유도 보정값)
ddof = 0 : N으로 나눔 (모집단 표준편차)
ddof = 1 : N - 1로 나눔 (표본 표준편차)
np.std()는 ddof = 0이 default
df.describe()의 std는 ddof = 1로 계산되므로 np.std() 기본값과 차이가 난다

In [46]:
# Apply each scaler to the same single-column input and print the results so
# the three scalings can be compared.
print(df['le_score'].values)
le_score_2d = df['le_score'].values.reshape(-1, 1)  # scalers expect 2-D input
sc_list = [StandardScaler(), MinMaxScaler(), RobustScaler()]
for sc in sc_list:
    res = sc.fit_transform(le_score_2d)
    print(res)

[0 1 1 0 2]
[[-1.06904497]
 [ 0.26726124]
 [ 0.26726124]
 [-1.06904497]
 [ 1.60356745]]
[[0. ]
 [0.5]
 [0.5]
 [0. ]
 [1. ]]
[[-1.]
 [ 0.]
 [ 0.]
 [-1.]
 [ 1.]]


ref : https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html