In [2]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)

# 열 이름을 지정
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 

# horsepower 열의 누락 데이터('?') 삭제하고 실수형으로 변환
df['horsepower'] = df['horsepower'].replace('?', np.nan)      # '?'을 np.nan으로 변경
df = df.dropna(subset=['horsepower'], axis=0)                 # 누락데이터 행을 삭제
df['horsepower'] = df['horsepower'].astype('float')           # 문자열을 실수형으로 변환

# np.histogram 으로 3개의 bin으로 나누는 경계 값의 리스트 구하기
count, bin_dividers = np.histogram(df['horsepower'], bins=3)

# 3개의 bin에 이름 지정
bin_names = ['저출력', '보통출력', '고출력']

# pd.cut 으로 각 데이터를 3개의 bin에 할당
df['hp_bin'] = pd.cut(x=df['horsepower'],     # 데이터 배열
                      bins=bin_dividers,      # 경계 값 리스트
                      labels=bin_names,       # bin 이름
                      include_lowest=True)    # 첫 경계값 포함

# sklern 라이브러리 불러오기
from sklearn import preprocessing    

# 전처리를 위한 encoder 객체 만들기
label_encoder = preprocessing.LabelEncoder()       # label encoder 생성
onehot_encoder = preprocessing.OneHotEncoder()     # one hot encoder 생성

# label encoder로 문자열 범주를 숫자형 범주로 변환
onehot_labeled = label_encoder.fit_transform(df['hp_bin'].head(15))  
print(onehot_labeled)
print(type(onehot_labeled))

# 2차원 행렬로 형태 변경
onehot_reshaped = onehot_labeled.reshape(len(onehot_labeled), 1) 
print(onehot_reshaped)
print(type(onehot_reshaped))

# 희소행렬로 변환
onehot_fitted = onehot_encoder.fit_transform(onehot_reshaped)
print(onehot_fitted)
print(type(onehot_fitted))

[1 1 1 1 1 0 0 0 0 0 0 1 1 0 2]
<class 'numpy.ndarray'>
[[1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [2]]
<class 'numpy.ndarray'>
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (15, 3)>
  Coords	Values
  (0, 1)	1.0
  (1, 1)	1.0
  (2, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 2)	1.0
<class 'scipy.sparse._csr.csr_matrix'>


In [3]:
# 실습

# %pip install scikit-learn

In [4]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)
# 열 이름을 지정
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [14]:
# 연비(mpg)를 3개 구간으로 나누어 새로운 칼럼으로 추가
# 저연비, 보통, 고연비

labels = ['저연비', '보통', '고연비']

df['mpg_qcut'] = pd.qcut(df.mpg, q=3,  labels=labels)
df['mpg_cut'] = pd.cut(df.mpg, bins=3, labels=labels)


print(df.mpg_qcut.value_counts())
# 계급 구간 확인 - 라벨 없이
temp = pd.qcut(df.mpg, q=3)
print(temp.cat.categories)  # 구간별 데이터 범위

mpg_qcut
저연비    143
고연비    133
보통     122
Name: count, dtype: int64
IntervalIndex([(8.999, 19.0], (19.0, 26.933], (26.933, 46.6]], dtype='interval[float64, right]')


In [12]:
print(df.mpg_cut.value_counts())

temp2 = pd.cut(df.mpg, bins=3)
print(temp2.cat.categories)

mpg_cut
저연비    183
보통     171
고연비     44
Name: count, dtype: int64
IntervalIndex([(8.962, 21.533], (21.533, 34.067], (34.067, 46.6]], dtype='interval[float64, right]')


In [15]:
pd.get_dummies(df.mpg_qcut)

Unnamed: 0,저연비,보통,고연비
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
...,...,...,...
393,False,False,True
394,False,False,True
395,False,False,True
396,False,False,True


In [None]:
# 다중 공선성:
    # 회귀분석(여러변수로 결과를 예측하는 모델)
    # 독립변수(칼럼)끼리 너무 강한 상관관계를 가질 때 문제가 생김

In [42]:
from sklearn.preprocessing import OneHotEncoder


# 객체
encoder = OneHotEncoder(sparse_output=False) 

# sklearn 계열은 사용방법이 동일 / fit: 적용, transform: 변환, fit_transform: 적용과 변환
# encoder.fit_transform(df['mpg_cut']) # -> 1차원임. 2차원 데이터가 와야 한다
onehot = encoder.fit_transform(df[['mpg_qcut']]) # -> 2차원
cols = encoder.get_feature_names_out(['mpg_qcut'])

pd.DataFrame(onehot, columns=cols)
pd.concat([df.drop(columns=['mpg_qcut']), pd.DataFrame(onehot, columns=cols)],axis=1)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,mpg_cut,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,저연비,0.0,0.0,1.0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,저연비,0.0,0.0,1.0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,저연비,0.0,0.0,1.0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,저연비,0.0,0.0,1.0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,저연비,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl,보통,1.0,0.0,0.0
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup,고연비,1.0,0.0,0.0
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage,보통,1.0,0.0,0.0
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger,보통,1.0,0.0,0.0


In [None]:
# get_dummies() : 빠르게 확인, 칼럼명 유지, pandas Dataframe/Series
    # 데이터 탐색, 시각화, 작은 데이터셋
# OneHotEncoder() : numpy array, Dataframe
    # 머신러닝 파이프라인/모델학습
    # fit, transform --> 학습-예측데이터 일관성 유지
    # 큰 데이터셋

In [56]:
train_df = df.drop(columns='mpg')
train_df
# cylinders, model year, origin --> onehot
# 제조사 칼럼에서 제조사만 추출 -> onehot
# 모든 하나의 데이터프레임으로 결합

encoder = OneHotEncoder(sparse_output=False)

cols = ['cylinders', 'model year', 'origin']
onehot = encoder.fit_transform(train_df[cols])
onehot_cols = encoder.get_feature_names_out(cols)

pd.DataFrame(onehot, columns=onehot_cols)
pd.concat([df.drop(columns=cols), pd.DataFrame(onehot,columns=onehot_cols)])


Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,name,mpg_qcut,mpg_cut,year_group,cylinders_3,...,model year_76,model year_77,model year_78,model year_79,model year_80,model year_81,model year_82,origin_1,origin_2,origin_3
0,18.0,307.0,130.0,3504.0,12.0,chevrolet chevelle malibu,저연비,저연비,1970s,,...,,,,,,,,,,
1,15.0,350.0,165.0,3693.0,11.5,buick skylark 320,저연비,저연비,1970s,,...,,,,,,,,,,
2,18.0,318.0,150.0,3436.0,11.0,plymouth satellite,저연비,저연비,1970s,,...,,,,,,,,,,
3,16.0,304.0,150.0,3433.0,12.0,amc rebel sst,저연비,저연비,1970s,,...,,,,,,,,,,
4,17.0,302.0,140.0,3449.0,10.5,ford torino,저연비,저연비,1970s,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
394,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
395,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
396,,,,,,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [None]:
train2_df = pd.read_csv('./data/auto-mpg.csv', header=None)
train2_df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name']

# model year을 1970년대와 1980년대로 나누기
train2_df['year_group'] = pd.cut( train2_df['model year'], bins=[69, 79, 89],  # 경계: 70~79, 80~89
                                             labels=['1970s', '1980s'] )


from sklearn.preprocessing import OneHotEncoder
# 인코딩할 칼럼 지정
cols = ['cylinders','year_group', 'origin']

# OneHotEncoder 적용
encoder = OneHotEncoder(sparse_output=False)
onehot = encoder.fit_transform(train2_df[cols]) 
# 새 컬럼 이름 가져오기
onehot_cols = encoder.get_feature_names_out(cols)

# DataFrame 변환
onehot_train2_df = pd.DataFrame(onehot, columns=onehot_cols)

# 원본에서 인코딩 전 칼럼 제거 후 병합
final_train2_df = pd.concat([df.drop(columns=cols), pd.DataFrame(onehot, columns=onehot_cols)], axis=1)

 

final_train2_df.tail() 

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,name,mpg_qcut,mpg_cut,cylinders_3,cylinders_4,cylinders_5,cylinders_6,cylinders_8,year_group_1970s,year_group_1980s,origin_1,origin_2,origin_3
393,27.0,140.0,86.0,2790.0,15.6,82,ford mustang gl,고연비,보통,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
394,44.0,97.0,52.0,2130.0,24.6,82,vw pickup,고연비,고연비,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
395,32.0,135.0,84.0,2295.0,11.6,82,dodge rampage,고연비,보통,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
396,28.0,120.0,79.0,2625.0,18.6,82,ford ranger,고연비,보통,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
397,31.0,119.0,82.0,2720.0,19.4,82,chevy s-10,고연비,보통,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [None]:
# 범주형 데이터 : 범위 내에서 결정되는 데이터
# 연속형 데이터 : 범위가 없는 데이터

In [71]:
from sklearn.linear_model import LinearRegression  # 선형 회귀2
# 작은 : 0 보통 1 크 2
data = {
    'size' : [0,0,0,1,1,1,2,2,2],
    'height':[120,121,119,122,123,121,125,124,126]
}
df = pd.DataFrame(data)
# 실제평균
df.groupby('size')['height'].mean()

size
0    120.0
1    122.0
2    125.0
Name: height, dtype: float64

In [72]:
model_lr = LinearRegression()
X = df.drop(columns=['height']).to_numpy()
y = df['height']
X.shape , y.shape, type(y)
model_ly = LinearRegression()
model_ly.fit(X,y)
predicted_y0 = model_ly.predict([[0]])[0]
predicted_y1 = model_ly.predict([[1]])[0]
predicted_y2 = model_ly.predict([[2]])[0]
predicted_y0,predicted_y1,predicted_y2

(np.float64(119.83333333333333),
 np.float64(122.33333333333333),
 np.float64(124.83333333333333))

In [73]:
# ont hot을 적용해서 학습하고 예측
df_encoded = pd.get_dummies(df,columns=['size'])
X_onehot = df_encoded[['size_0',	'size_1']].to_numpy()
model_onehot = LinearRegression()
model_onehot.fit(X_onehot,y)
predicted_onehot_y0 = model_onehot.predict([[1,0]])[0]
predicted_onehot_y1 = model_onehot.predict([[0,1]])[0]
predicted_onehot_y2 = model_onehot.predict([[0,0]])[0]
predicted_onehot_y0,predicted_onehot_y1,predicted_onehot_y2

(np.float64(120.0), np.float64(122.0), np.float64(125.0))