In [4]:
## 파이토치 설치

!pip install torch

Collecting torch
  Downloading torch-1.12.1-cp39-cp39-win_amd64.whl (161.8 MB)
Installing collected packages: torch
Successfully installed torch-1.12.1


In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# 데이터 불러오기
data=pd.read_csv("./car_evaluation.csv")
data.head()

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [None]:
## EDA
fig_size=plt.rcParams['figure.figsize']
fig_size[0]=8
fig_size[1]=6
plt.rcParams['figure.figsize']=fig_size

data.output.value_counts().plot(kind='pie',autopct='0.05f%%',colors=['lightblue','lightgree','orange','pink'],explode=(0.05,0.05,0.05,0.05))

In [3]:
## 카테고리 칼럼 변환
categorical_columns=['price','maint','doors','persons','lug_capacity','safety']

for category in categorical_columns:
    data[category]=data[category].astype('category')
    
price=data['price'].cat.codes.values
maint=data['maint'].cat.codes.values
doors=data['doors'].cat.codes.values
persons=data['persons'].cat.codes.values
lug_capacity=data['lug_capacity'].cat.codes.values
safety=data['safety'].cat.codes.values

categorical_data=np.stack([price,maint,doors,persons,lug_capacity,safety],1)
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [4]:
## 카테고리 칼럼 -> 텐서로 변환
categorical_data=torch.tensor(categorical_data,dtype=torch.int64)
categorical_data[:10]

tensor([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        [3, 3, 0, 0, 1, 1],
        [3, 3, 0, 0, 1, 2],
        [3, 3, 0, 0, 1, 0],
        [3, 3, 0, 0, 0, 1],
        [3, 3, 0, 0, 0, 2],
        [3, 3, 0, 0, 0, 0],
        [3, 3, 0, 1, 2, 1]])

In [5]:
## get_dummies를 이용해 output 칼럼 변환
outputs=pd.get_dummies(data.output)
outputs=outputs.values
outputs=torch.tensor(outputs).flatten() ## 1차원 텐서로 변환

print(categorical_data.shape)
print(outputs.shape)

torch.Size([1728, 6])
torch.Size([6912])


In [6]:
## 워드 임베딩 크기 설정
categorical_column_sizes=[len(data[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes=[(col_size,min(50,(col_size+1)//2)) for col_size in categorical_column_sizes]

print(categorical_embedding_sizes) ## (범주형 칼럼의 고유값의 수, 차원 크기)

[(4, 2), (4, 2), (4, 2), (3, 2), (3, 2), (3, 2)]


In [27]:
## 데이터셋 분리
total_records=1728
test_records=int(total_records*0.2)

categorical_train_data=categorical_data[:total_records-test_records]
categorical_test_data=categorical_data[total_records-test_records:total_records]
train_outputs=outputs[:total_records-test_records]
test_outputs=outputs[total_records-test_records:total_records]

In [11]:
## 모델 네트워크 생성
class Model(nn.Module):
    def __init__(self,embedding_size,output_size,layers,p=0.4):
        super().__init__()
        self.all_embeddings=nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in embedding_size])
        self.embedding_dropout=nn.Dropout(p)
        
        all_layers=[]
        num_categorical_cols=sum((nf for ni,nf in embedding_size))
        input_size=num_categorical_cols
        
        for i in layers:
            all_layers.append(nn.Linear(input_size,i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size=i
            
        all_layers.append(nn.Linear(layers[-1],output_size))
        self.layers=nn.Sequential(*all_layers)
        
    def forward(self,x_categorical):
        embeddings=[]
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
        x=torch.cat(embeddings,1)
        x=self.embedding_dropout(x)
        x=self.layers(x)
        return x

In [13]:
## 모델 초기화
model=Model(categorical_embedding_sizes,4,[200,100,50],p=0.4)
print(model)

Model(
  (all_embeddings): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(4, 2)
    (2): Embedding(4, 2)
    (3): Embedding(3, 2)
    (4): Embedding(3, 2)
    (5): Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [14]:
## 모델 파라미터 정의
loss_function=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

In [15]:
if torch.cuda.is_available():
    device=torch.device('cuda')
else:
    device=torch.device('cpu')

In [16]:
epochs=500
aggregated_losses=[]
train_outputs=train_outputs.to(device=device,dtype=torch.int64)

for i in range(epochs):
    i+=1
    y_pred=model(categorical_train_data)
    single_loss=loss_function(y_pred,train_outputs)
    aggregated_losses.append(single_loss)
    
    if i%25==1:
        print(f'epoch:{i:3} loss:{single_loss.item():10.8f}')
        
    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f'epoch:{i:3} loss:{single_loss.item():10.0f}')

epoch:  1 loss:1.63960373
epoch: 26 loss:1.45137513
epoch: 51 loss:1.36734223
epoch: 76 loss:1.23154187
epoch:101 loss:1.07761014
epoch:126 loss:0.93326908
epoch:151 loss:0.82108617
epoch:176 loss:0.75516218
epoch:201 loss:0.69735128
epoch:226 loss:0.66804397
epoch:251 loss:0.63743061
epoch:276 loss:0.62235022
epoch:301 loss:0.60445195
epoch:326 loss:0.61003506
epoch:351 loss:0.59467822
epoch:376 loss:0.59520388
epoch:401 loss:0.59085774
epoch:426 loss:0.58432341
epoch:451 loss:0.57722622
epoch:476 loss:0.57491773
epoch:500 loss:         1


In [28]:
test_outputs = test_outputs.to(device=device, dtype=torch.int64)
with torch.no_grad():
    y_val = model(categorical_test_data).to(device)
    loss = loss_function(y_val, test_outputs)
print(f'Loss: {loss:.8f}')

Loss: 0.55256742


- 마지막 에포크에서 loss 값이 튄 것을 어떻게 해결해야할지는 아직 찾아보는 중
- 예상치 못한 이상치가 존재해서지 않을까란 생각
- test data로 확인해봤을 때 loss가 0.55로, 다른 train data에서의 loss 값보다 낮아 일단 문제 없다고 생각

In [29]:
print(y_val[:5])

tensor([[ 2.9704,  1.7372, -2.6595, -2.6399],
        [ 2.6280,  1.4437, -2.8888, -2.8278],
        [ 3.1629,  1.7796, -4.1434, -4.2581],
        [ 2.6304,  1.6518, -2.8058, -2.7679],
        [ 1.9476,  1.0910, -1.7264, -1.9949]])


In [30]:
y_val = np.argmax(y_val.cpu().numpy(), axis=1)
print(y_val[:5])

[0 0 0 0 0]


In [31]:
import warnings
warnings.filterwarnings('ignore') 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

test_outputs=test_outputs.cpu().numpy()
print(confusion_matrix(test_outputs,y_val))
print(classification_report(test_outputs,y_val))
print(accuracy_score(test_outputs, y_val))

[[259   0]
 [ 84   2]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86       259
           1       1.00      0.02      0.05        86

    accuracy                           0.76       345
   macro avg       0.88      0.51      0.45       345
weighted avg       0.82      0.76      0.66       345

0.7565217391304347
