In [157]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchmetrics.functional as metrics
import re, os   #regular expression
from sklearn.metrics import classification_report

In [158]:
# 학습 데이터 주소
train_data_path = '../data/text/lang_data/train/'
# 훈련 데이터 주소
test_data_path = '../data/text/lang_data/test/'

In [159]:
torch.manual_seed(40)

<torch._C.Generator at 0x1fac320ae90>

In [160]:
re.match('[a-z]+', 'abc1234abc')

<re.Match object; span=(0, 3), match='abc'>

In [161]:
re.findall('[a-z]+', 'abc1234abc')

['abc', 'abc']

In [162]:
re.finditer('[a-z]+', 'abc1234abc')

<callable_iterator at 0x1facfd2b5b0>

In [163]:
# 데이터프레임 생성 함수
def makeDF(filepath):
    data_list = []
    file_list = os.listdir(filepath)        # 폴더 경로 내 파일 목록
    for file in file_list:
        label = re.findall('[a-zA-Z]+', file)[0]       # 라벨 데이터 : 정규표현식 활용
        row = cntAlpha(filepath+file)                  # 행 데이터 추출
        row.append(label)                              # 라벨 데이터 추가
        data_list.append(row)                          # 행 데이터 추가
        
    return pd.DataFrame(data_list)

# 알파뱃 개수 데이터 반환 함수
def cntAlpha(filepath):
    f = open(filepath, 'r', encoding='utf-8')
    
    text = f.read()
    # 알파뱃 개수 리스트
    num_list = [ text.upper().count(chr(i)) for i in range(65, 91) ]
    num_list = list(map(lambda x: x/sum(num_list), num_list))
    
    f.close()
    return num_list

In [164]:
trainDF = makeDF(train_data_path)
testDF = makeDF(test_data_path)

In [165]:
trainDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.075952,0.01284,0.045702,0.046137,0.105332,0.015669,0.019151,0.043743,0.073993,0.001741,...,0.077693,0.061371,0.080522,0.025898,0.009793,0.014146,0.000653,0.020022,0.000435,en
1,0.096747,0.012406,0.044621,0.033257,0.105817,0.021268,0.015951,0.039408,0.082048,0.001043,...,0.065367,0.069016,0.088042,0.02794,0.008236,0.014908,0.002137,0.015742,0.005578,en
2,0.084178,0.019912,0.030404,0.03887,0.136998,0.017408,0.031239,0.027423,0.075355,0.002623,...,0.09014,0.071659,0.077739,0.030643,0.013712,0.01395,0.002027,0.010731,0.000596,en
3,0.071646,0.012172,0.045643,0.032642,0.120055,0.014661,0.025173,0.023513,0.094606,0.00249,...,0.053942,0.087967,0.081051,0.029046,0.018811,0.011895,0.000553,0.017981,0.000553,en
4,0.07221,0.027715,0.029977,0.039593,0.121041,0.01678,0.023567,0.059012,0.065234,0.001508,...,0.059201,0.073341,0.093703,0.024321,0.00509,0.019608,0.006033,0.017534,0.001697,en
5,0.073806,0.020368,0.031099,0.039641,0.141261,0.020368,0.020368,0.056943,0.065046,0.003285,...,0.072492,0.059571,0.095488,0.024967,0.010731,0.023872,0.003066,0.014893,0.000657,en
6,0.081006,0.013555,0.035795,0.03401,0.123701,0.019075,0.018994,0.036932,0.075406,0.000812,...,0.077273,0.070617,0.088231,0.024269,0.022403,0.013312,0.002273,0.015422,0.000812,en
7,0.083812,0.016864,0.043478,0.040493,0.122686,0.023311,0.019532,0.037635,0.07546,0.00108,...,0.071649,0.066408,0.078,0.028329,0.012799,0.014006,0.001969,0.016674,0.001429,en
8,0.089447,0.015744,0.029711,0.041249,0.116504,0.020782,0.015496,0.048153,0.080226,0.002159,...,0.056813,0.072399,0.080923,0.032117,0.008794,0.016643,0.002497,0.014822,0.000427,en
9,0.082587,0.013215,0.032848,0.038085,0.123968,0.022142,0.016847,0.051183,0.077219,0.0021,...,0.064544,0.071968,0.084381,0.036524,0.011829,0.016876,0.001181,0.013799,0.001152,en


In [166]:
testDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.067823,0.013459,0.034328,0.048817,0.116114,0.020014,0.016002,0.022798,0.07692,0.002411,...,0.070124,0.07955,0.075122,0.02591,0.014775,0.036103,0.005634,0.013087,0.000416,en
1,0.080283,0.016174,0.03535,0.038342,0.129865,0.016704,0.01895,0.042697,0.073986,0.004463,...,0.066227,0.063599,0.07888,0.027631,0.013026,0.01488,0.002119,0.0133,0.001491,en
2,0.087546,0.014697,0.03085,0.039405,0.121263,0.019975,0.016745,0.047823,0.075352,0.000956,...,0.064568,0.074077,0.083087,0.025254,0.009965,0.012832,0.001183,0.013696,0.001229,en
3,0.078505,0.012169,0.039091,0.031337,0.119966,0.021215,0.016692,0.036399,0.084105,0.000969,...,0.060521,0.069782,0.096812,0.031661,0.015184,0.0126,0.002692,0.019169,0.000538,en
4,0.090582,0.012481,0.032243,0.05191,0.137481,0.01598,0.011914,0.009455,0.075359,0.001702,...,0.067322,0.076967,0.072428,0.059758,0.012292,0.001986,0.003026,0.004917,0.002931,fr
5,0.085097,0.010113,0.039651,0.046137,0.139917,0.01188,0.011542,0.007146,0.077505,0.002797,...,0.06813,0.081501,0.07148,0.0521,0.011726,0.001506,0.003396,0.002198,0.001675,fr
6,0.056764,0.012008,0.035835,0.049876,0.127155,0.013476,0.00862,0.007303,0.08605,0.002786,...,0.067304,0.090078,0.068433,0.042912,0.013852,0.028909,0.009298,0.005157,0.000414,fr
7,0.071875,0.011413,0.038476,0.04033,0.139357,0.012185,0.015386,0.01841,0.079491,0.00415,...,0.06406,0.073023,0.066334,0.048652,0.013598,0.002892,0.004282,0.003355,0.001192,fr
8,0.178832,0.026958,0.013194,0.040057,0.092074,0.007119,0.033507,0.01832,0.082582,0.00617,...,0.050973,0.053726,0.055434,0.041576,0.002088,0.002563,0.00038,0.012055,0.000475,id
9,0.178803,0.036816,0.007309,0.038576,0.08013,0.004467,0.036816,0.019491,0.076746,0.007715,...,0.056984,0.04372,0.059691,0.040742,0.00203,0.003519,0.000677,0.014348,0.000677,id


In [167]:
# 라벨 인코딩을 위한 딕셔너리 제작
labels = trainDF[trainDF.columns[-1]].unique()
encoder = dict(zip(labels, range(4)))

In [168]:
# 라벨 인코딩
trainDF[trainDF.columns[-1]].replace(encoder, inplace=True)
testDF[testDF.columns[-1]].replace(encoder, inplace=True)

In [169]:
# 모델 클래스
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.ANN = nn.Sequential(
            nn.Linear(26, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU()
        )
        
    def forward(self, x):
        return self.ANN(x)

In [170]:
# 모델 학습 함수
def trainModel(model, epoch=1000, trainData=None):
    if type(trainData) == type(pd.DataFrame()):
        features = torch.tensor(trainData[trainData.columns[:-1]].values, dtype=torch.float32)
        target = torch.tensor(trainData[trainData.columns[-1]].values)
    elif type(trainData) == type(np.array([])):
        features = torch.tensor(trainData[:-1], dtype=torch.float32)
        target = torch.tensor(trainData[-1])
    else:
        print('The data type is not available.')
        return 1
    
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    
    for e in range(1, epoch+1):
        h = model(features)
        print(features.dtype, features.shape)
        print(h.dtype, h.shape)
        print(target.dtype, target.shape)
        
        cost = nn.functional.cross_entropy(h, target)
        
        opt.zero_grad()
        cost.backward()
        opt.step()
        
        print(f'Epoch [{e:4}/{epoch:4}] ----------------')
        print(f'Cost : {cost.item()}, Acc : {metrics.accuracy(h, target, task="multiclass", num_classes=4)}, '
              f'F1 : {metrics.f1_score(h, target, task="multiclass", num_classes=4)}')

In [171]:
# 모델 평가 함수
def testModel(model, testData=None):
    if type(testData) == type(pd.DataFrame()):
        features = torch.tensor(testData[testData.columns[:-1]].values, dtype=torch.float32)
        target = testData[testData.columns[-1]]
    elif type(testData) == type(np.array([])):
        features = torch.tensor(testData[:-1], dtype=torch.float32)
        target = testData[-1]
    else:
        print('The data type is not available.')
        return 1
        
    model.eval()
    pre = model(features)
    
    print(pre)
    print()
    print(classification_report(target, pre.argmax(dim=1)))

In [172]:
model1 = MyModel()

In [173]:
# 데이터가 너무 적어서 학습이 지나치게 빠르다...
# 정확도 75면 쓸만한건가..?
trainModel(model1, trainData=trainDF)

torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   1/1000] ----------------
Cost : 1.3934129476547241, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   2/1000] ----------------
Cost : 1.3925470113754272, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   3/1000] ----------------
Cost : 1.3917615413665771, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   4/1000] ----------------
Cost : 1.391003966331482, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   5/1000] ----------------
Cost : 1.390308141708374, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size(

In [174]:
# 성능 평가를 해보니 개박살이 났다!
testModel(model1, testData=testDF)

tensor([[11.3984,  0.0000,  0.0000,  0.0000],
        [ 8.4724,  0.0000,  0.0000,  0.0000],
        [ 9.5120,  0.0000,  0.0000,  0.0000],
        [10.9264,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.2939,  0.0000],
        [ 4.7239,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  8.5375,  0.0000],
        [ 0.0000,  0.0000,  8.2744,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  8.0731,  0.0000],
        [ 0.0000,  0.0000, 11.9142, 24.0140],
        [ 0.0000,  0.0000, 12.2300, 25.0684],
        [ 0.0000,  0.0000, 10.8710, 17.5755],
        [ 0.0000,  0.0000, 10.3217, 14.0062]], grad_fn=<ReluBackward0>)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         4
           1       0.00      0.00      0.00         4
           2       0.75      0.75      0.75         4
           3       1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [175]:
# 괘씸해서 에포크를 졸라 많이 줘봤다.
# 물론 이런다고 정확도가 올라가진 않는다.
model2 = MyModel()
trainModel(model2, epoch=10000, trainData=trainDF)

torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   1/10000] ----------------
Cost : 1.389627456665039, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   2/10000] ----------------
Cost : 1.3890827894210815, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   3/10000] ----------------
Cost : 1.3886022567749023, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   4/10000] ----------------
Cost : 1.388152837753296, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   5/10000] ----------------
Cost : 1.3877170085906982, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.

In [176]:
# 뭐 어쩌란거지
testModel(model2, testData=testDF)

tensor([[20.7655,  0.0000,  0.0000,  0.0000],
        [15.8679,  0.0000,  0.0000,  0.0000],
        [17.1351,  0.0000,  0.0000,  0.0000],
        [20.2423,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [11.9149,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000, 13.9722,  0.0000],
        [ 0.0000,  0.0000, 13.9265,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000, 13.0387,  0.0000],
        [ 0.0000,  0.0000, 19.0798, 36.0180],
        [ 0.0000,  0.0000, 19.7723, 38.0009],
        [ 0.0000,  0.0000, 18.0580, 32.8511],
        [ 0.0000,  0.0000, 17.9223, 27.1463]], grad_fn=<ReluBackward0>)

              precision    recall  f1-score   support

           0       0.44      1.00      0.62         4
           1       0.00      0.00      0.00         4
           2       1.00      0.75      0.86         4
           3       1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


데이터가 너무 적어서 이 모양인거 같은데... 일단 튜닝을 하는 시늉은 해보자

In [177]:
# 모델을 조금 단순화해보았다.
class MyModel2(nn.Module):
    def __init__(self):
        super().__init__()
        self.ANN = nn.Sequential(
            nn.Linear(26, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU()
        )
        
    def forward(self, x):
        return self.ANN(x)

In [178]:
model3 = MyModel2()
trainModel(model3, trainData=trainDF)

torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   1/1000] ----------------
Cost : 1.3864617347717285, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   2/1000] ----------------
Cost : 1.3863646984100342, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   3/1000] ----------------
Cost : 1.3862725496292114, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   4/1000] ----------------
Cost : 1.3861862421035767, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Size([40])
Epoch [   5/1000] ----------------
Cost : 1.3861057758331299, Acc : 0.25, F1 : 0.25
torch.float32 torch.Size([40, 26])
torch.float32 torch.Size([40, 4])
torch.int64 torch.Siz

In [179]:
# 어림없지!
testModel(model3, testData=testDF)

tensor([[6.4250, 0.0000, 0.0000, 0.0000],
        [5.2833, 0.0000, 0.0000, 0.0000],
        [6.5181, 0.0000, 0.0000, 0.0000],
        [6.8893, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [2.2618, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [1.9059, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.2193, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000]], grad_fn=<ReluBackward0>)

              precision    recall  f1-score   support

           0       0.25      1.00      0.40         4
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4

    accuracy                 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
