In [116]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [117]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)

# 열 이름을 지정
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 

# horsepower 열의 누락 데이터('?') 삭제하고 실수형으로 변환
df['horsepower'] = df['horsepower'].replace('?', np.nan)      # '?'을 np.nan으로 변경
df = df.dropna(subset=['horsepower'], axis=0)                 # 누락데이터 행을 삭제
df['horsepower'] = df['horsepower'].astype('float')           # 문자열을 실수형으로 변환

# np.histogram 으로 3개의 bin으로 나누는 경계 값의 리스트 구하기
count, bin_dividers = np.histogram(df['horsepower'], bins=3)

# 3개의 bin에 이름 지정
bin_names = ['저출력', '보통출력', '고출력']

# pd.cut 으로 각 데이터를 3개의 bin에 할당
df['hp_bin'] = pd.cut(x=df['horsepower'],     # 데이터 배열
                      bins=bin_dividers,      # 경계 값 리스트
                      labels=bin_names,       # bin 이름
                      include_lowest=True)    # 첫 경계값 포함

# sklern 라이브러리 불러오기
from sklearn import preprocessing    

# 전처리를 위한 encoder 객체 만들기
label_encoder = preprocessing.LabelEncoder()       # label encoder 생성
onehot_encoder = preprocessing.OneHotEncoder()     # one hot encoder 생성

# label encoder로 문자열 범주를 숫자형 범주로 변환
onehot_labeled = label_encoder.fit_transform(df['hp_bin'].head(15))  
print(onehot_labeled)
print(type(onehot_labeled))

# 2차원 행렬로 형태 변경
onehot_reshaped = onehot_labeled.reshape(len(onehot_labeled), 1) 
print(onehot_reshaped)
print(type(onehot_reshaped))

# 희소행렬로 변환
onehot_fitted = onehot_encoder.fit_transform(onehot_reshaped)
print(onehot_fitted)
print(type(onehot_fitted))

[1 1 1 1 1 0 0 0 0 0 0 1 1 0 2]
<class 'numpy.ndarray'>
[[1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [2]]
<class 'numpy.ndarray'>
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (15, 3)>
  Coords	Values
  (0, 1)	1.0
  (1, 1)	1.0
  (2, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 2)	1.0
<class 'scipy.sparse._csr.csr_matrix'>


In [118]:
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)

df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [119]:
# 연비를 3개 구간으로 나눠서 새로운 컬럼명으로 추가
# 컬럼명 : 저연비 , 보통 , 고연비
# 연속형(수치형) 데이터 -> 범주형(categoty)
df.mpg
# pd.qcut(df.mpg , q=3 , labels=['저연비','보통','고연비'])
df['mpg_qcut']= pd.qcut(df.mpg , q=3 , labels=['저연비','보통','고연비'])

size , bins_range = np.histogram(df.mpg , bins=3)
df['mpg_cut']= pd.cut(df.mpg ,bins=bins_range , labels=['저연비','보통','고연비'] )
# (df.mpg_dcut.value_counts()) + (df.mpg_cut.value_counts()) # 사칙연산 O 
df.mpg_qcut.value_counts()
df['mpg_qcut'].cat.categories
# 계급 구간을 확인하는 방법 -> 라벨 없이 적용 후 구문을 변수에 넣고 !
temp = pd.qcut(df.mpg , q=3)
print(f'구간별 데이터 범위 : {temp.cat.categories}')


구간별 데이터 범위 : IntervalIndex([(8.999, 19.0], (19.0, 26.933], (26.933, 46.6]], dtype='interval[float64, right]')


In [120]:
size , bins_range = np.histogram(df.mpg , bins=3)
df['mpg_cut']= pd.cut(df.mpg ,bins=bins_range , labels=['저연비','보통','고연비'] )
df.mpg_qcut.value_counts()
print(f'계급별 구간 : {bins_range}')

계급별 구간 : [ 9.         21.53333333 34.06666667 46.6       ]


In [121]:
pd.get_dummies(df.mpg_qcut,dtype = float) #dtype = float  문자열 -> 숫자 

Unnamed: 0,저연비,보통,고연비
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
393,0.0,0.0,1.0
394,0.0,0.0,1.0
395,0.0,0.0,1.0
396,0.0,0.0,1.0


In [122]:
# 다중 공선성 : 회귀분석(여러변수로 결과를 예측하는 모델) , 독립변수(컬럼들..)끼리 서로 너무 강하게 상관관계를 가질때 문제가 생김

In [123]:
# get_dummy() : 빠르게 확인해주고 컬럼명을 유지해줌
#       데이터 탐색과 시각화 / 작은 데이터셋에 유리함.
# OneHotEncoder() : numpy , array , DataFrame
#       머신러닝 파이라인 / 모델 학습
#       fit , transform  -->  학습데이터  -  예측데이터 일관성 유지  /  큰 데이터셋에 유리함.

In [124]:
from sklearn.preprocessing import OneHotEncoder # sklearn -> 머신러닝 라이브러리 
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 


# 범주형 데이터로 변환
df['mpg_qcut']= pd.qcut(df.mpg , q=3 , labels=['저연비','보통','고연비'])
# sklearn 계열은 사용방법이 통일 fit : 적용  /  transform : 변환   fit_transform 은 두개를 한꺼번에 실행
# encoder.fit_transform(df.mpg_qcut)  # 1차원 데이터
encoder = OneHotEncoder(sparse_output=False) # 객체 
# encoder = OneHotEncoder(sparse_output=True)
temp = encoder.fit_transform(df[['mpg_qcut']])   #2차원 데이터  /  여기서는 2차원 데이터가 와야함 
cols = encoder.get_feature_names_out(['mpg_qcut'])
pd.DataFrame(temp,columns=cols)
# pd.DataFrame.sparse.from_spmatrix(temp, columns=cols)
pd.concat([df.drop(columns=['mpg_qcut']),pd.DataFrame(temp, columns=cols)] , axis = 1)
# pd.concat([df.drop(columns=['mpg_qcut']), pd.DataFrame.sparse.from_spmatrix(temp, columns=cols)],axis=1)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,0.0,0.0,1.0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,0.0,0.0,1.0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,0.0,0.0,1.0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,0.0,0.0,1.0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl,1.0,0.0,0.0
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup,1.0,0.0,0.0
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage,1.0,0.0,0.0
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger,1.0,0.0,0.0


In [125]:
train_df = df = df.drop(columns=['mpg'])
train_df.head()

# cylinders , model year , orign  --> onehot
#   onehot 이후에 onehot에 대상이 된 컬럼은 drop
# 제조사 컬럼에서 제조사만 추출해서  --> onehot
# 하나의 데이터 프레임으로 결합 concat
# train_df.origin.value_counts()
train_df



Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,mpg_qcut
0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,저연비
1,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,저연비
2,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,저연비
3,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,저연비
4,8,302.0,140.0,3449.0,10.5,70,1,ford torino,저연비
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl,고연비
394,4,97.0,52.00,2130.0,24.6,82,2,vw pickup,고연비
395,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage,고연비
396,4,120.0,79.00,2625.0,18.6,82,1,ford ranger,고연비


In [126]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoder_fit_transform = encoder.fit_transform(train_df[['mpg_qcut']])
encoder_fit_transform
encoder.get_feature_names_out(['mpg_qcut'])
pd.DataFrame.sparse.from_spmatrix(encoder_fit_transform,columns=cols)


Unnamed: 0,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,0,0,1.0
1,0,0,1.0
2,0,0,1.0
3,0,0,1.0
4,0,0,1.0
...,...,...,...
393,1.0,0,0
394,1.0,0,0
395,1.0,0,0
396,1.0,0,0


In [127]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
origin_cols = ['cylinders','model year', 'origin','maker']
train_df['maker'] = [n_list[0] for n_list in df['name'].str.split()]
total_onehots = []
for colname in origin_cols:
    encoder_fit_transform = encoder.fit_transform(train_df[[colname]])
    cols = encoder.get_feature_names_out([colname])
    total_onehots.append(pd.DataFrame.sparse.from_spmatrix(encoder_fit_transform,columns=cols))

In [128]:
total_onehots.insert(0,train_df)
new_train_df = pd.concat(total_onehots,axis=1)
new_train_df = new_train_df.drop(columns=origin_cols)
new_train_df.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,name,mpg_qcut,cylinders_3,cylinders_4,cylinders_5,cylinders_6,...,maker_renault,maker_saab,maker_subaru,maker_toyota,maker_toyouta,maker_triumph,maker_vokswagen,maker_volkswagen,maker_volvo,maker_vw
0,307.0,130.0,3504.0,12.0,chevrolet chevelle malibu,저연비,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,350.0,165.0,3693.0,11.5,buick skylark 320,저연비,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,318.0,150.0,3436.0,11.0,plymouth satellite,저연비,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,304.0,150.0,3433.0,12.0,amc rebel sst,저연비,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,302.0,140.0,3449.0,10.5,ford torino,저연비,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [129]:
encoder = OneHotEncoder()

orign_cols = ['cylinders , model year']

total_onhots = []
for columns in orign_cols:
    encoder_fit_transform = encoder.fit_transform(train_df[['mpg_qcut']])
    encoder_fit_transform
    encoder.get_feature_names_out(['mpg_qcut'])
    total_onhots.append(pd.DataFrame.sparse.from_spmatrix(encoder_fit_transform,columns=cols))




ValueError: Column length mismatch: 37 vs. 3

In [None]:
# 제조사 컬럼에서 제조사만 추출해서  --> onehot

# 빈 리스트 준비
brands = []

# 각 이름을 쪼개서 첫 번째만 가져오기
for n_list in df['name'].str.split():
    brands.append(n_list[0])

# 결과 확인
print(brands[:5])
# ['chevrolet', 'buick', 'plymouth', 'amc', 'ford']

# DataFrame에 추가
df['brand'] = brands


['chevrolet', 'buick', 'plymouth', 'amc', 'ford']


In [None]:
pd.concat(total_onhots,axis=1)

Unnamed: 0,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,0,0,1.0
1,0,0,1.0
2,0,0,1.0
3,0,0,1.0
4,0,0,1.0
...,...,...,...
393,1.0,0,0
394,1.0,0,0
395,1.0,0,0
396,1.0,0,0


In [None]:
pd.concat(df['total_onhots'],axis=1)

In [None]:
encoder2 = OneHotEncoder(sparse_output=True) # 기본값 : True 객체가 너무 많아서 압축해서 보여줌. false를 하면 행열이 나옴.  /  희소 행열들을 모아둔.
encoded = encoder2.fit_transform(df[['mpg_qcut']])
encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 398 stored elements and shape (398, 3)>

In [None]:
# 범주형 데이터  -->  범위 내에서 결정되는 데이터 - category
# 연속형 데이터  -->  범위가 없는 변화무쌍한 데이터

# 반려동물을 키움으로써 행복도 영향 / 개 고양이 새 다양한 직업,연봉,가족여부,나이더불어 이사람이 보유하고 있는 동물이 뭔지.
# 개 고양이 새 -> 범주형 데이터 ( 카테고리 안에서 움직이기 때문에 모델 학습할때 참고용으로만 사용. )  //  0 과 1 로만 표현 (one-hot_encoding으로 표현하는거임.)
# 강아지 [1,0,0] -> 강아지 on 나머지 off
# 고양이 [0,1,0]
#  새    [0,0,1]
# 강아지 0 , 고양이 1 , 새 2  --> ? 이건 왜 안됨?  --> 새가 강아지보다 2베 증가했다고 인식하기때문에.

# ------------------------------------------------------------------------------------------------

In [150]:
import pandas as pd
from sklearn.linear_model import LinearRegression  # 선형회귀 2 한쪽이 증가하면 한쪽이 감소하는(?)
# 신발사이즈번수 , 키를 예측. 등
# 작은 : 0 , 보통 : 1 , 큰 : 2
data = {
    'size' : [0,0,0,1,1,1,2,2,2],
    'hight' : [150,160,180,185,163,155,166,162,190]
}
df = pd.DataFrame(data)
# df
# df_encoded = df[pd.get_dummies(df.size,dtype = float)]
# df_encoded

# 실제평균
df.groupby('size')['hight'].mean()

model_lr = LinearRegression()
X = df.drop(columns=['hight'])
Y = df['hight']
X.shape , Y.shape , type(Y)

# 학습 시켜보자
model_lr = LinearRegression()
model_lr.fit(X, Y)



0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
# 학습을 시켰으면 예측도 해보자.


model_lr = LinearRegression()
X = df.drop(columns=['hight'])
Y = df['hight']

X.shape , Y.shape , type(Y)
model_lr = LinearRegression()
model_lr.fit(X, Y)
predicted_y0 = model_lr.predict(pd.DataFrame({'size': [0]}))[0]
predicted_y1 = model_lr.predict(pd.DataFrame({'size': [1]}))[0]
predicted_y2 = model_lr.predict(pd.DataFrame({'size': [2]}))[0]
predicted_y0 , predicted_y1 , predicted_y2

# size=0일 때 예측 키: 163.22cm
# size=1일 때 예측 키: 167.89cm
# size=2일 때 예측 키: 172.56cm


(np.float64(163.22222222222223),
 np.float64(167.88888888888889),
 np.float64(172.55555555555557))

In [None]:
# one-hot-encoding을 적용해서 학습하고 예측해보자

df_encoded = pd.get_dummies(df,columns=['size'])
X_onehot = df_encoded[['size_0' , 'size_1']]
model_onehot = LinearRegression()
model_onehot.fit(X_onehot,Y)
predicted_onehot_y0 = model_onehot.predict([[1,0]])[0]
predicted_onehot_y1 = model_onehot.predict([[0,1]])[0]
predicted_onehot_y2 = model_onehot.predict([[0,0]])[0]
predicted_onehot_y0 , predicted_onehot_y1 , predicted_onehot_y2

# size=0일 때 예측 키: 163.33cm
# size=1일 때 예측 키: 167.67cm
# size=2일 때 예측 키: 172.67cm



(np.float64(163.33333333333331),
 np.float64(167.66666666666666),
 np.float64(172.66666666666666))