### sklearn, 

#### sklearn.preprocessing.LabelEncoder
* LabelEncoder
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html?highlight=labelencoder#sklearn.preprocessing.LabelEncoder
* Methods

fit(y) : Fit label encoder.

fit_transform(y) : Fit label encoder and return encoded labels.

get_params([deep]) : Get parameters for this estimator.

inverse_transform(y) : Transform labels back to original encoding.

set_params(** params) : Set the parameters of this estimator.

transform(y) : Transform labels to normalized encoding.

In [2]:
from sklearn.preprocessing import LabelEncoder
items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']

encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값:', labels)

인코딩 변환값: [0 1 4 5 3 3 2 2]


In [3]:
print('인코딩 클래스:', encoder.classes_)

인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']


In [4]:
print('디코딩 원본 값:', encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3]))

디코딩 원본 값: ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


In [5]:
from sklearn.preprocessing import LabelEncoder
items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']

encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값:', labels)

print('인코딩 클래스:', encoder.classes_)

print('디코딩 원본 값:', encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3]))

인코딩 변환값: [0 1 4 5 3 3 2 2]
인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']
디코딩 원본 값: ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


#### titanic_train.csv 전처리

In [25]:
# Titanic 데이터 : 데이터를 보고 Name은 불필요하니 드랍, Parch/Ticket/Cabin/Embarked 드랍
# 즉 Suvived, Sex, Age, Embarked 컬럼만 남기고 이용
# feature 는 Sex, Age, Embarked 사용하고 label은 Survived 사용해서 지도학습 하려한다.
# fillna 사용하는데 Sex는 남, Age는 평균, Embarked는 S 로 바꿈
# 데이터의 전처리가 필요한 상황이라 데이터 전처리도 해야하는데 
# 결측치가 관찰이 되어서 결측치 처리를 해야 하고(결측치는 평균값으로)
# 문자열이 보여서 인코딩 해줘야 한다.(인코딩은 labelencoding을 이용하여 처리한 결과를 데이터 프레임으로 출력)

# 1. 판다스를 이용하여 csv 파일 읽어오기
# 2. 필요한 컬럼만 남기고 drop
# 3. fillna를 이용하여 결측치 처리(mean)
# 4. LabelEncoder 를 활용하여 인코딩

import pandas as pd
# titanic_df=pd.read_csv("C:/apps/ml/datasets/titanic_train.csv")
titanic_df=pd.read_csv("./datasets/titanic_train.csv")

In [26]:
titanic_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [27]:
titanic_df.drop(columns=['PassengerId','Pclass','Name','SibSp','Parch','Ticket','Fare','Cabin'],inplace=True)

In [28]:
titanic_df.head(1)

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,male,22.0,S


In [30]:
titanic_df['Age']=titanic_df['Age'].fillna(titanic_df['Age'].mean())

In [31]:
titanic_df.head(1)

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,male,22.0,S


In [34]:
titanic_df['Embarked']=titanic_df['Embarked'].fillna('S')

In [35]:
titanic_df.head(1)

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,male,22.0,S


In [36]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   Embarked  891 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [37]:
# 데이터 형태들이 각기 다르기 때문에
# Sex, Embarked 

encoder = LabelEncoder()
titanic_df['Age']=encoder.fit_transform(titanic_df['Age'])

In [38]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Sex       891 non-null    object
 2   Age       891 non-null    int64 
 3   Embarked  891 non-null    object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB


In [39]:
encoder = LabelEncoder()
titanic_df['Embarked']=encoder.fit_transform(titanic_df['Embarked'])

In [40]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Sex       891 non-null    object
 2   Age       891 non-null    int64 
 3   Embarked  891 non-null    int32 
dtypes: int32(1), int64(2), object(1)
memory usage: 24.5+ KB


In [42]:
encoder = LabelEncoder()
titanic_df['Sex']=encoder.fit_transform(titanic_df['Sex'])

In [43]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Survived  891 non-null    int64
 1   Sex       891 non-null    int32
 2   Age       891 non-null    int64
 3   Embarked  891 non-null    int32
dtypes: int32(2), int64(2)
memory usage: 21.0 KB


In [44]:
titanic_df.head()

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,1,28,2
1,1,0,52,0
2,1,0,34,2
3,1,0,48,2
4,0,1,48,2


#### sklearn.preprocessing.OneHotEncoder
* OneHotEncoder(*, categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None)
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html?highlight=onehotencoder#sklearn.preprocessing.OneHotEncoder

In [49]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

labels=labels.reshape(-1,1)

oh_encoder=OneHotEncoder()
oh_encoder.fit(labels)
oh_labels=oh_encoder.transform(labels)

print('원-핫 인코딩 데이터')
print(oh_labels.toarray())
print('원-핫 인코딩 데이터 차원')
print(oh_labels.shape)

원-핫 인코딩 데이터
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
원-핫 인코딩 데이터 차원
(8, 6)


In [None]:
df=pd.read_csv("./datasets/titanic_train.csv")

oh_encoder=OneHotEncoder()
oh_encoder.fit(labels)
oh_labels=oh_encoder.transform(labels)

print('원-핫 인코딩 데이터')
print(oh_labels.toarray())
print('원-핫 인코딩 데이터 차원')
print(oh_labels.shape)

### pandas.get_dummies
* get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
* https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

In [51]:
import pandas as pd 
df = pd.read_csv('./datasets/titanic_train.csv')
df.drop(columns = ['PassengerId','Pclass','Name','SibSp', 'Parch','Ticket','Fare','Cabin'],inplace=True)

df['Age']=df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')

In [57]:
data=df.loc[:,'Embarked']
data

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [53]:
dummy_Embark=pd.get_dummies(data)

In [54]:
dummy_data=pd.get_dummies(df[['Sex', 'Embarked']])

In [58]:
dummy_data

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
886,0,1,0,0,1
887,1,0,0,0,1
888,1,0,0,0,1
889,0,1,1,0,0


In [59]:
df = pd.concat([titanic_df , dummy_data], ignore_index=True)
print(df)

      Survived  Sex   Age  Embarked  Sex_female  Sex_male  Embarked_C  \
0          0.0  1.0  28.0       2.0         NaN       NaN         NaN   
1          1.0  0.0  52.0       0.0         NaN       NaN         NaN   
2          1.0  0.0  34.0       2.0         NaN       NaN         NaN   
3          1.0  0.0  48.0       2.0         NaN       NaN         NaN   
4          0.0  1.0  48.0       2.0         NaN       NaN         NaN   
...        ...  ...   ...       ...         ...       ...         ...   
1777       NaN  NaN   NaN       NaN         0.0       1.0         0.0   
1778       NaN  NaN   NaN       NaN         1.0       0.0         0.0   
1779       NaN  NaN   NaN       NaN         1.0       0.0         0.0   
1780       NaN  NaN   NaN       NaN         0.0       1.0         1.0   
1781       NaN  NaN   NaN       NaN         0.0       1.0         0.0   

      Embarked_Q  Embarked_S  
0            NaN         NaN  
1            NaN         NaN  
2            NaN         NaN  

In [60]:
df = pd.concat([df, dummy_data], axis=1)

In [61]:
df

Unnamed: 0,Survived,Sex,Age,Embarked,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Sex_female.1,Sex_male.1,Embarked_C.1,Embarked_Q.1,Embarked_S.1
0,0.0,1.0,28.0,2.0,,,,,,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,52.0,0.0,,,,,,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,34.0,2.0,,,,,,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,48.0,2.0,,,,,,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,48.0,2.0,,,,,,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777,,,,,0.0,1.0,0.0,0.0,1.0,,,,,
1778,,,,,1.0,0.0,0.0,0.0,1.0,,,,,
1779,,,,,1.0,0.0,0.0,0.0,1.0,,,,,
1780,,,,,0.0,1.0,1.0,0.0,0.0,,,,,


In [62]:
df.drop(columns=['Sex','Embarked'], inplace=True)

In [63]:
df

Unnamed: 0,Survived,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Sex_female.1,Sex_male.1,Embarked_C.1,Embarked_Q.1,Embarked_S.1
0,0.0,28.0,,,,,,0.0,1.0,0.0,0.0,1.0
1,1.0,52.0,,,,,,1.0,0.0,1.0,0.0,0.0
2,1.0,34.0,,,,,,1.0,0.0,0.0,0.0,1.0
3,1.0,48.0,,,,,,1.0,0.0,0.0,0.0,1.0
4,0.0,48.0,,,,,,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1777,,,0.0,1.0,0.0,0.0,1.0,,,,,
1778,,,1.0,0.0,0.0,0.0,1.0,,,,,
1779,,,1.0,0.0,0.0,0.0,1.0,,,,,
1780,,,0.0,1.0,1.0,0.0,0.0,,,,,
