# 밑준비 하기

## Autogluon 설치
- 코랩 환경에서는 설치후 런타임 재시작 필요

In [1]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==0.8.2 (from autogluon)
  Downloading autogluon.tabular-0.8.2-py3-none-any.whl (285 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.7/285.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==0.8.2 (from autogluon)
  Downloading autogluon.multimodal-0.8.2-py3-none-any.whl (372 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3

## 라이브러리 준비

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

import pytz
from copy import deepcopy
from collections import Counter

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## feature enginnering

In [18]:
target = 'root_cause_type'

def feature_engineering(data):
    '''
    params
        data : DataFrame - feature engineering의 대상이 되는 데이터
    return
        DataFrame
    '''
    # ticketno column을 int64로 변경
    temp = deepcopy(data)
    temp['ticketno'] = temp['ticketno'].astype('int64')

    # ticketno, alarmno를 로그변환
    temp['ticketno_log1p'] = np.log1p(temp['ticketno'])
    temp['alarmno_log1p'] = np.log1p(temp['alarmno'])

    # alarmtime column을 datetime64로 변경
    tz_pytz = pytz.timezone('Asia/Seoul')
    temp['alarmtime'] = pd.to_datetime(temp['alarmtime'], unit='ns', utc=True).dt.tz_convert('Asia/Seoul')

    # alarmlevel을 범주형 데이터로 취급
    temp['alarmlevel'] = temp['alarmlevel'].astype('object')

    # site & sysname 알파벳 별로 분리
    site_col_dict = {}
    for i in range(4):
        site_col_dict[f'site{i}'] = []


    for idx, value in temp['site'].items():
        for num, each_char in enumerate(value):
            site_col_dict[f'site{num}'].append(each_char)
    site_df = pd.DataFrame(site_col_dict)

    sysname_col_dict = {}
    for i in range(4):
        sysname_col_dict[f'sysname{i}'] = []


    for idx, value in temp['sysname'].items():
        for num, each_char in enumerate(value):
            sysname_col_dict[f'sysname{num}'].append(each_char)

    sysname_df = pd.DataFrame(sysname_col_dict)
    temp = pd.concat([temp, site_df, sysname_df], axis=1)

    #slot, port의 결측치는 각각 0, 100으로 채운다.
    temp['port'].fillna(0.0, inplace=True)
    temp['slot'].fillna(100.0, inplace=True)

    # port와 slot의 dtype을 int32로 바꾼다
    temp['port'] = temp['port'].astype('int32')
    temp['slot'] = temp['slot'].astype('int32')

    return temp

## Hyperparameters

In [19]:
# 단어 임베딩 모델의 종류
# embedding_model = 'w2v'
embedding_model = 'fast'

# 수치로 표현되는 벡터의 차원수
word_vec_len = 64

# 수치 표현시 고려되는 주변 단어들의 수
window_size = 7

# 주변 단어들을 고려하는 방식 (0 or 1)
sg = 0

# 데이터 불러오기

In [20]:
train_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_train.csv'
test_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_test.csv'
submisson_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_label_sample.csv'
save_path = '/content/drive/MyDrive/kt_network_competition/AutogluonModels_kt_network'
w2v_path = '/content/drive/MyDrive/kt_network_competition/w2v_model'
fast_path = '/content/drive/MyDrive/kt_network_competition/fasttext_model'

In [21]:
data = feature_engineering(pd.read_csv(train_path))
data

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,...,ticketno_log1p,alarmno_log1p,site0,site1,site2,site3,sysname0,sysname1,sysname2,sysname3
0,21122633,1669820428245,2022-12-01 00:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,16.865856,28.143737,A,C,E,N,a,c,n,t
1,21122633,1669821318728,2022-12-01 00:17:15+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,16.865856,28.143738,A,C,E,N,a,c,n,t
2,21122633,1669822214832,2022-12-01 00:32:11+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,16.865856,28.143738,A,C,E,N,a,c,n,t
3,21122633,1669823114128,2022-12-01 00:47:10+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,16.865856,28.143739,A,C,E,N,a,c,n,t
4,21122633,1669824028082,2022-12-01 01:02:24+09:00,5,ETH-ERR,ACEN,acnt,EQPT,3,1,...,16.865856,28.143739,A,C,E,N,a,c,n,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,21774618,1671892499215,2022-12-24 23:37:14+09:00,7,OPT-LOS,AFAA,agow,EQPT,5,5,...,16.896256,28.144977,A,F,A,A,a,g,o,w
9318,15693425,877949375,2022-12-25 10:13:46+09:00,7,OPT-REMOVE,AGFD,aibb,EQPT,1,8,...,16.568752,20.593099,A,G,F,D,a,i,b,b
9319,21809789,1671974758375,2022-12-25 22:28:14+09:00,7,OPT-LOS,ADKA,aeaq,EQPT,3,3,...,16.897870,28.145027,A,D,K,A,a,e,a,q
9320,21811213,1671978167736,2022-12-25 23:25:03+09:00,7,OPT-LOS,ABZO,acie,EQPT,5,6,...,16.897935,28.145029,A,B,Z,O,a,c,i,e


In [22]:
test = pd.read_csv(test_path)
test

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain
0,21812391.0,1671894138838,2022-12-25 00:02:16+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#15#1,16.0,15.0,NSA,C
1,21775988.0,1671894172511,2022-12-25 00:02:51+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
2,21792259.0,1671894204682,2022-12-25 00:03:22+09:00,4,DDM_RX_PWR_HIGH,AECE,afeg,X2FUA,2.0,1.0,NSA,B
3,21812412.0,1671894215702,2022-12-25 00:03:33+09:00,5,BATT_ENV_FAIL,ACCN,aclp,---,,,SA,B
4,21812417.0,1671894220812,2022-12-25 00:03:39+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C
...,...,...,...,...,...,...,...,...,...,...,...,...
37666,21986223.0,1672412311698,2022-12-30 23:58:30+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
37667,22015278.0,1672412316271,2022-12-30 23:58:33+09:00,4,DDM_RX_PWR_HIGH,AEMD,afsr,G16FU,5.0,6.0,SA,B
37668,21986426.0,1672412317238,2022-12-30 23:58:34+09:00,5,MEP_LSP_RDI,ACMY,acxj,G2FUA,1.0,1.0,NSA,B
37669,22015300.0,1672412373531,2022-12-30 23:59:32+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C


# 단어 임베딩 모델링
- W2V, FastText 모델
- 장치 제조사별 alarmmsg_original 변수 값들끼리 겹치는 부분이 없는 것은 아니다. 축약어 사용 등 통일화가 가능한 부분이 있다는 것을 이용해 모든 장치 제조사를 아우를 수 있는 단어 임베딩 모델을 만들자.

## 장치 제조사별 alarmmsg_original 범주 추출

### Domain A

In [23]:
msg_d = data["alarmmsg_original"].unique().tolist()
msg_d

['ETH-ERR',
 'OPT-LOS',
 'DCC-FAIL',
 'LSP-LOC',
 'ETH-NO-RX-TRAFFIC',
 'OPT-PWR-LOW',
 'PSU-FAIL',
 'ETH-LINK-FAIL',
 'AUTONEGO_MISMATCH',
 'TDM-PW-RMT_FAIL',
 'OPT-REMOVE',
 'TRK-CONN-MIS',
 'ProtectionSwitched',
 'NVRAM-FAIL',
 'PW-LOC',
 'BATT-LOW',
 'BOOTING',
 'ETH-NO-TX-TRAFFIC',
 'Restarted',
 'OS_MISMATCH',
 'UNIT-REM',
 'TDM-PW-LOF',
 'FAN-48V-FAIL',
 'FAN-FAIL',
 '48V-FAIL',
 'PortShutdown',
 'QL_FAIL',
 'UNIT-IPC-FAIL',
 'LINK_FAIL_MANUAL',
 'PSU-REM',
 'OPT-MIS']

### Domain B & C

In [24]:
msg_t = test["alarmmsg_original"].unique().tolist()
msg = [s.lower() for s in list(set(msg_d + msg_t))]
msg

['unit_init (by reset)',
 'manual (management) removal',
 'synchronization signal fail raise',
 'ne not reach via primary mng interf',
 'eth-no-tx-traffic',
 'ql_fail',
 'unit_out',
 'unit_init (by inslot)',
 'tdm-pw-rmt_fail',
 'autonego_mismatch',
 'mep_pwe_csf',
 'ether_link_down (llcf)',
 'ioc payload-links with matrix failure',
 'house keeping alarm',
 'ddm_rx_pwr_low',
 'eth-no-rx-traffic',
 'mep_lsp_loc',
 'input power degrade defect',
 'improper removal',
 'oamloss of continuity',
 'fan-fail',
 'loss of connectivity',
 'unit-rem',
 'ether_port_crc',
 'batt-low',
 'ether_tca_crc_15m',
 'ether_tca_crc_1d',
 'procedure error',
 'hardware failure',
 'opt-los',
 'os_mismatch',
 'remote fault indication',
 'ether_link_down (local_fault)',
 'opt-pwr-high',
 'oam loss of continuity',
 'nvram-fail',
 'fan-48v-fail',
 'node isolation',
 'dcn_fail',
 'pw-loc',
 'sys_temp_low',
 'client signal fail',
 'unit_fail',
 'mep_pwe_loc',
 'module_out ( sfp+ )',
 'oam rdi',
 'lsp-loc',
 'server sig

## alarmmsg_original 범주 토큰화

In [25]:
import re

pattern = r'[\s\-_\(\)\/]'
cleaned_msg = []
for m in msg:
    # 기호 제거
    m = re.sub(pattern, ' ', m)
    # 단어 분리
    words = [word for word in re.findall('[A-Za-z0-9]+', m) if len(word) > 1]
    # 분리된 단어들을 리스트에 추가
    cleaned_msg.append(words)
cleaned_msg

[['unit', 'init', 'by', 'reset'],
 ['manual', 'management', 'removal'],
 ['synchronization', 'signal', 'fail', 'raise'],
 ['ne', 'not', 'reach', 'via', 'primary', 'mng', 'interf'],
 ['eth', 'no', 'tx', 'traffic'],
 ['ql', 'fail'],
 ['unit', 'out'],
 ['unit', 'init', 'by', 'inslot'],
 ['tdm', 'pw', 'rmt', 'fail'],
 ['autonego', 'mismatch'],
 ['mep', 'pwe', 'csf'],
 ['ether', 'link', 'down', 'llcf'],
 ['ioc', 'payload', 'links', 'with', 'matrix', 'failure'],
 ['house', 'keeping', 'alarm'],
 ['ddm', 'rx', 'pwr', 'low'],
 ['eth', 'no', 'rx', 'traffic'],
 ['mep', 'lsp', 'loc'],
 ['input', 'power', 'degrade', 'defect'],
 ['improper', 'removal'],
 ['oamloss', 'of', 'continuity'],
 ['fan', 'fail'],
 ['loss', 'of', 'connectivity'],
 ['unit', 'rem'],
 ['ether', 'port', 'crc'],
 ['batt', 'low'],
 ['ether', 'tca', 'crc', '15m'],
 ['ether', 'tca', 'crc', '1d'],
 ['procedure', 'error'],
 ['hardware', 'failure'],
 ['opt', 'los'],
 ['os', 'mismatch'],
 ['remote', 'fault', 'indication'],
 ['ether', 'li

### 축약어 사전 생성
- A 제조사는 축약어를 사용한다. 같은 의미를 가진 서로 다른 단어가 서로 다른 수치 표현으로 변환되는 것(= 서로 다른 의미를 가진 단어로 취급)을 방지하기 위해 통일해야 한다.

In [26]:
full_name =  {
'ais': 'alarm indication signal',
'batt': 'battery',
'cep': 'critical event prevention',
'conn': 'connection',
'crc': 'cyclic redundancy check',
'csf': 'client signal fail',
'dcc': 'data communication channel',
'dcn': 'data communication network',
'dc': 'direct current',
'ddm': 'digital diagnostic monitoring',
'env': 'environment',
'err': 'error',
'eth': 'ethernet',
'fan': 'fan',
'fl': 'frame loss',
'idf': 'in-service data frame',
'init': 'initialization',
'ioc': 'input output controller',
'ipc': 'inter process communication',
'iof': 'ingress only filtering',
'll': 'link-layer',
'llcf': 'link level control field',
'loc': 'loss of communication',
'loc': 'loss of continuity',
'lof': 'loss of frame',
'los': 'loss of signal',
'lsp': 'label switched path',
'meg': 'mega',
'mep': 'maintenance entity group end point',
'mis': 'misalignment',
'ne': 'network element',
'nvram': 'non volatile random access memory',
'oam': 'operations, administration, and maintenance',
'oamloss': 'operations administration and maintenance loss',
'odi': 'optical delay interferometer',
'opt': 'optical',
'pde': 'path delay estimation',
'pdm': 'physical data model',
'poam': 'physical operations, administration, and maintenance',
'ps': 'packet switching',
'psm': 'power save mode',
'psu': 'power supply unit',
'pw': 'pseudowire',
'pwe': 'pseudowire emulation',
'ql': 'queue length',
'rdi': 'remote defect indication',
'rem': 'remote',
'rfa': 'remote failure analysis',
'rmt': 'remote',
'rx': 'receiver',
'rsmt': 'required signal-to-noise ratio margin threshold',
'sfp': 'small form factor pluggable',
'stm': 'synchronous transport module',
'sys': 'system',
'tca': 'threshold crossing alert',
'tdm': 'time division multiplexing',
'trk': 'track',
'tx': 'transmit',
'urc': 'unidirectional remote control',
}

## 축약어로 표현된 토큰들을 풀어내어 통일

In [27]:
fin_msg = []
for sub_list in cleaned_msg:
    temp = []
    for item in sub_list:
        if item in full_name:
            temp.append(full_name[item])
        else:
            temp.append(item)
    fin_msg.append(temp)
fin_msg

[['unit', 'initialization', 'by', 'reset'],
 ['manual', 'management', 'removal'],
 ['synchronization', 'signal', 'fail', 'raise'],
 ['network element', 'not', 'reach', 'via', 'primary', 'mng', 'interf'],
 ['ethernet', 'no', 'transmit', 'traffic'],
 ['queue length', 'fail'],
 ['unit', 'out'],
 ['unit', 'initialization', 'by', 'inslot'],
 ['time division multiplexing', 'pseudowire', 'remote', 'fail'],
 ['autonego', 'mismatch'],
 ['maintenance entity group end point',
  'pseudowire emulation',
  'client signal fail'],
 ['ether', 'link', 'down', 'link level control field'],
 ['input output controller', 'payload', 'links', 'with', 'matrix', 'failure'],
 ['house', 'keeping', 'alarm'],
 ['digital diagnostic monitoring', 'receiver', 'pwr', 'low'],
 ['ethernet', 'no', 'receiver', 'traffic'],
 ['maintenance entity group end point',
  'label switched path',
  'loss of continuity'],
 ['input', 'power', 'degrade', 'defect'],
 ['improper', 'removal'],
 ['operations administration and maintenance los

## 문장형으로 풀어낸 축약어들을 개별 단어들로 토큰화

In [28]:
msg_uni = []
for error in fin_msg:
    new_error = []
    for word in error:
        word_list = re.split(pattern, word)
        new_error += word_list
    msg_uni.append(new_error)
msg_uni

[['unit', 'initialization', 'by', 'reset'],
 ['manual', 'management', 'removal'],
 ['synchronization', 'signal', 'fail', 'raise'],
 ['network', 'element', 'not', 'reach', 'via', 'primary', 'mng', 'interf'],
 ['ethernet', 'no', 'transmit', 'traffic'],
 ['queue', 'length', 'fail'],
 ['unit', 'out'],
 ['unit', 'initialization', 'by', 'inslot'],
 ['time', 'division', 'multiplexing', 'pseudowire', 'remote', 'fail'],
 ['autonego', 'mismatch'],
 ['maintenance',
  'entity',
  'group',
  'end',
  'point',
  'pseudowire',
  'emulation',
  'client',
  'signal',
  'fail'],
 ['ether', 'link', 'down', 'link', 'level', 'control', 'field'],
 ['input',
  'output',
  'controller',
  'payload',
  'links',
  'with',
  'matrix',
  'failure'],
 ['house', 'keeping', 'alarm'],
 ['digital', 'diagnostic', 'monitoring', 'receiver', 'pwr', 'low'],
 ['ethernet', 'no', 'receiver', 'traffic'],
 ['maintenance',
  'entity',
  'group',
  'end',
  'point',
  'label',
  'switched',
  'path',
  'loss',
  'of',
  'continui

## 단어 임베딩 모델 생성

In [30]:
from gensim.models import Word2Vec, FastText

In [32]:
# 학습시킬 전체 말뭉치(= alarmmsg_original의 전체 범주)
sentences = msg_uni

# 모델 생성
if embedding_model == 'w2v':
    model = Word2Vec(sentences, vector_size=word_vec_len, window=window_size, min_count=1, workers=4)
    # 모델 저장
    model.save(w2v_path)
    # 모델 불러오기 예시
    # model = Word2Vec.load(w2v_path)
elif embedding_model == 'fast':
    model = FastText(sentences, vector_size=word_vec_len, window=window_size, min_count=1, workers=4, sg=sg)
    model.save(fast_path)
    # 모델 불러오기 예시
    # model = FastText.load(fast_path)
# 단어를 임베딩한 벡터 값 확인
print(model.wv['communication', 'data', 'network'])

[[ 2.33462139e-04  1.10550527e-03 -8.30097066e-04 -9.65214160e-04
   1.65458478e-03  1.84627905e-04  1.73240493e-03 -2.34158098e-04
  -2.08324310e-03 -1.01679834e-04  2.85268109e-03  7.06844672e-04
   1.54870690e-03  2.28503501e-04  9.95156239e-04  9.44359053e-04
  -1.14503561e-03  4.91483370e-04 -9.21080296e-04  1.61688324e-04
   6.10307325e-04  6.01704989e-04  3.35507822e-04 -9.15470295e-08
  -4.91310318e-04 -8.73949553e-04 -1.35218864e-03  1.55750095e-04
  -1.55129033e-04 -8.36733088e-05  1.01745839e-03  7.60567491e-04
  -2.55621108e-03 -6.10819203e-04  4.00126482e-05 -1.01661379e-03
   2.64662970e-03  1.94886758e-04 -5.50074619e-04  6.32610230e-04
  -9.65014915e-04 -8.58902989e-04 -1.65244425e-03  1.20835635e-03
   1.42975070e-03 -1.72063825e-03  7.63372867e-04 -9.44979955e-04
  -5.51469275e-05 -9.46307147e-04  2.99201114e-03 -7.33090856e-04
  -1.73058652e-04 -7.49957835e-05  2.86441045e-05 -2.30262452e-03
  -3.40513740e-04  5.78511972e-04  1.29883550e-03  2.06581061e-03
   1.45014

# 단어 임베딩 모델을 이용해 alarmmsg_original을 수치 표현

## 임베딩 모델에 대한 input 양식에 맞도록 전처리

In [33]:
temp = data.copy()
# 필요한 변수들만 추출
temp = temp[['alarmtime', 'alarmlevel', 'alarmmsg_original', 'site1', 'sysname1',	'alarmno_log1p', 'ticketno_log1p', target]]
# alarmmsg_original 변수 값들을 모두 소문자로 변경
temp["alarmmsg_original"] = temp["alarmmsg_original"].str.lower()
temp

Unnamed: 0,alarmtime,alarmlevel,alarmmsg_original,site1,sysname1,alarmno_log1p,ticketno_log1p,root_cause_type
0,2022-12-01 00:02:24+09:00,5,eth-err,C,c,28.143737,16.865856,LinkCut
1,2022-12-01 00:17:15+09:00,5,eth-err,C,c,28.143738,16.865856,LinkCut
2,2022-12-01 00:32:11+09:00,5,eth-err,C,c,28.143738,16.865856,LinkCut
3,2022-12-01 00:47:10+09:00,5,eth-err,C,c,28.143739,16.865856,LinkCut
4,2022-12-01 01:02:24+09:00,5,eth-err,C,c,28.143739,16.865856,LinkCut
...,...,...,...,...,...,...,...,...
9317,2022-12-24 23:37:14+09:00,7,opt-los,F,g,28.144977,16.896256,LinkCut
9318,2022-12-25 10:13:46+09:00,7,opt-remove,G,i,20.593099,16.568752,UnitFail
9319,2022-12-25 22:28:14+09:00,7,opt-los,D,e,28.145027,16.897870,LinkCut
9320,2022-12-25 23:25:03+09:00,7,opt-los,B,c,28.145029,16.897935,LinkCut


In [34]:
# 불용어 처리
temp['alarmmsg_original'] = temp['alarmmsg_original'].apply(lambda x : re.sub(pattern, ' ', x))
# alarmmsg_original 1차 토큰화
temp['alarmmsg_original'] = temp['alarmmsg_original'].apply(lambda x : re.findall('[A-Za-z0-9]+', x))
temp

Unnamed: 0,alarmtime,alarmlevel,alarmmsg_original,site1,sysname1,alarmno_log1p,ticketno_log1p,root_cause_type
0,2022-12-01 00:02:24+09:00,5,"[eth, err]",C,c,28.143737,16.865856,LinkCut
1,2022-12-01 00:17:15+09:00,5,"[eth, err]",C,c,28.143738,16.865856,LinkCut
2,2022-12-01 00:32:11+09:00,5,"[eth, err]",C,c,28.143738,16.865856,LinkCut
3,2022-12-01 00:47:10+09:00,5,"[eth, err]",C,c,28.143739,16.865856,LinkCut
4,2022-12-01 01:02:24+09:00,5,"[eth, err]",C,c,28.143739,16.865856,LinkCut
...,...,...,...,...,...,...,...,...
9317,2022-12-24 23:37:14+09:00,7,"[opt, los]",F,g,28.144977,16.896256,LinkCut
9318,2022-12-25 10:13:46+09:00,7,"[opt, remove]",G,i,20.593099,16.568752,UnitFail
9319,2022-12-25 22:28:14+09:00,7,"[opt, los]",D,e,28.145027,16.897870,LinkCut
9320,2022-12-25 23:25:03+09:00,7,"[opt, los]",B,c,28.145029,16.897935,LinkCut


In [35]:
# 축약어로 표현된 단어를 풀어낸 뒤 토큰화하는 함수
def expand_abbr(abbr_lst, abbr_dict):
    temp = []
    for abbr in abbr_lst:
        if abbr in abbr_dict:
            temp.extend(abbr_dict[abbr].split())
        else:
            if len(abbr) > 1:
                temp.append(abbr)
    return temp

In [36]:
# 축약어로 표현된 단어들을 풀어내주고 2차 토큰화
temp['alarmmsg_original'] = temp['alarmmsg_original'].apply(lambda x : expand_abbr(x, full_name))
temp

Unnamed: 0,alarmtime,alarmlevel,alarmmsg_original,site1,sysname1,alarmno_log1p,ticketno_log1p,root_cause_type
0,2022-12-01 00:02:24+09:00,5,"[ethernet, error]",C,c,28.143737,16.865856,LinkCut
1,2022-12-01 00:17:15+09:00,5,"[ethernet, error]",C,c,28.143738,16.865856,LinkCut
2,2022-12-01 00:32:11+09:00,5,"[ethernet, error]",C,c,28.143738,16.865856,LinkCut
3,2022-12-01 00:47:10+09:00,5,"[ethernet, error]",C,c,28.143739,16.865856,LinkCut
4,2022-12-01 01:02:24+09:00,5,"[ethernet, error]",C,c,28.143739,16.865856,LinkCut
...,...,...,...,...,...,...,...,...
9317,2022-12-24 23:37:14+09:00,7,"[optical, loss, of, signal]",F,g,28.144977,16.896256,LinkCut
9318,2022-12-25 10:13:46+09:00,7,"[optical, remove]",G,i,20.593099,16.568752,UnitFail
9319,2022-12-25 22:28:14+09:00,7,"[optical, loss, of, signal]",D,e,28.145027,16.897870,LinkCut
9320,2022-12-25 23:25:03+09:00,7,"[optical, loss, of, signal]",B,c,28.145029,16.897935,LinkCut


In [37]:
# 벡터 값들의 평균을 내주는 함수
def mean_vector(vectors):
    return np.mean(vectors, axis=0)

In [38]:
# 각 단어 토큰을 수치 표현으로 임베딩
temp['alarmmsg_original'] = temp['alarmmsg_original'].apply(lambda x : model.wv[x])
# 여러 개의 단어 토큰으로 이루어진 alarmmsg_original을 벡터 값들의 평균으로 표현
temp['alarmmsg_original'] = temp['alarmmsg_original'].apply(lambda x : mean_vector(x))
temp

Unnamed: 0,alarmtime,alarmlevel,alarmmsg_original,site1,sysname1,alarmno_log1p,ticketno_log1p,root_cause_type
0,2022-12-01 00:02:24+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143737,16.865856,LinkCut
1,2022-12-01 00:17:15+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143738,16.865856,LinkCut
2,2022-12-01 00:32:11+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143738,16.865856,LinkCut
3,2022-12-01 00:47:10+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143739,16.865856,LinkCut
4,2022-12-01 01:02:24+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143739,16.865856,LinkCut
...,...,...,...,...,...,...,...,...
9317,2022-12-24 23:37:14+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",F,g,28.144977,16.896256,LinkCut
9318,2022-12-25 10:13:46+09:00,7,"[-0.0020802815, 0.0030788938, 0.0017263817, 0....",G,i,20.593099,16.568752,UnitFail
9319,2022-12-25 22:28:14+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",D,e,28.145027,16.897870,LinkCut
9320,2022-12-25 23:25:03+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",B,c,28.145029,16.897935,LinkCut


# Autogluon으로 Auto-modeling

### 다차원 벡터의 각 차원을 하나의 변수로 취급하여 모델링

In [39]:
# 다차원 벡터 값으로 표현된 alarmmsg_original을 alarmmsg_vector_{x}로 1차원의 변수들로 풀어 헤치기
for i in range(word_vec_len):
    temp[f'alarmmsg_vector_{i}'] = temp['alarmmsg_original'].apply(lambda x: x[i])
temp

Unnamed: 0,alarmtime,alarmlevel,alarmmsg_original,site1,sysname1,alarmno_log1p,ticketno_log1p,root_cause_type,alarmmsg_vector_0,alarmmsg_vector_1,...,alarmmsg_vector_54,alarmmsg_vector_55,alarmmsg_vector_56,alarmmsg_vector_57,alarmmsg_vector_58,alarmmsg_vector_59,alarmmsg_vector_60,alarmmsg_vector_61,alarmmsg_vector_62,alarmmsg_vector_63
0,2022-12-01 00:02:24+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143737,16.865856,LinkCut,-0.000913,-0.000689,...,0.001279,-0.001462,-0.000386,0.000189,0.001264,0.000108,0.001854,0.002203,-0.000884,-0.001552
1,2022-12-01 00:17:15+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143738,16.865856,LinkCut,-0.000913,-0.000689,...,0.001279,-0.001462,-0.000386,0.000189,0.001264,0.000108,0.001854,0.002203,-0.000884,-0.001552
2,2022-12-01 00:32:11+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143738,16.865856,LinkCut,-0.000913,-0.000689,...,0.001279,-0.001462,-0.000386,0.000189,0.001264,0.000108,0.001854,0.002203,-0.000884,-0.001552
3,2022-12-01 00:47:10+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143739,16.865856,LinkCut,-0.000913,-0.000689,...,0.001279,-0.001462,-0.000386,0.000189,0.001264,0.000108,0.001854,0.002203,-0.000884,-0.001552
4,2022-12-01 01:02:24+09:00,5,"[-0.0009134301, -0.00068933005, -0.00039331298...",C,c,28.143739,16.865856,LinkCut,-0.000913,-0.000689,...,0.001279,-0.001462,-0.000386,0.000189,0.001264,0.000108,0.001854,0.002203,-0.000884,-0.001552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9317,2022-12-24 23:37:14+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",F,g,28.144977,16.896256,LinkCut,-0.000846,0.001316,...,0.003791,0.000627,-0.001644,-0.002288,0.002155,-0.002362,0.001379,0.001334,-0.001510,-0.001159
9318,2022-12-25 10:13:46+09:00,7,"[-0.0020802815, 0.0030788938, 0.0017263817, 0....",G,i,20.593099,16.568752,UnitFail,-0.002080,0.003079,...,0.001429,0.000107,-0.003421,-0.000794,-0.001162,0.000664,-0.000670,-0.000187,-0.001082,-0.000850
9319,2022-12-25 22:28:14+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",D,e,28.145027,16.897870,LinkCut,-0.000846,0.001316,...,0.003791,0.000627,-0.001644,-0.002288,0.002155,-0.002362,0.001379,0.001334,-0.001510,-0.001159
9320,2022-12-25 23:25:03+09:00,7,"[-0.0008455632, 0.0013156387, 0.0017750752, 0....",B,c,28.145029,16.897935,LinkCut,-0.000846,0.001316,...,0.003791,0.000627,-0.001644,-0.002288,0.002155,-0.002362,0.001379,0.001334,-0.001510,-0.001159


In [40]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9322 entries, 0 to 9321
Data columns (total 72 columns):
 #   Column              Non-Null Count  Dtype                     
---  ------              --------------  -----                     
 0   alarmtime           9322 non-null   datetime64[ns, Asia/Seoul]
 1   alarmlevel          9322 non-null   object                    
 2   alarmmsg_original   9322 non-null   object                    
 3   site1               9322 non-null   object                    
 4   sysname1            9322 non-null   object                    
 5   alarmno_log1p       9322 non-null   float64                   
 6   ticketno_log1p      9322 non-null   float64                   
 7   root_cause_type     9322 non-null   object                    
 8   alarmmsg_vector_0   9322 non-null   float32                   
 9   alarmmsg_vector_1   9322 non-null   float32                   
 10  alarmmsg_vector_2   9322 non-null   float32                   
 11  alar

### Train Test split

In [41]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor, TabularDataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [42]:
x = temp.drop(columns=['alarmmsg_original',target])
y = temp[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((7457, 70), (1865, 70), (7457,), (1865,))

### 학습용 데이터셋 생성

In [43]:
train_data = pd.concat([x_train, y_train], axis=1)  # 학습용 x, y 데이터 셋 병합
train_data = TabularDataset(train_data)
train_data

Unnamed: 0,alarmtime,alarmlevel,site1,sysname1,alarmno_log1p,ticketno_log1p,alarmmsg_vector_0,alarmmsg_vector_1,alarmmsg_vector_2,alarmmsg_vector_3,...,alarmmsg_vector_55,alarmmsg_vector_56,alarmmsg_vector_57,alarmmsg_vector_58,alarmmsg_vector_59,alarmmsg_vector_60,alarmmsg_vector_61,alarmmsg_vector_62,alarmmsg_vector_63,root_cause_type
7019,2022-12-19 09:55:04+09:00,5,C,c,28.144689,16.886686,-0.000913,-0.000689,-0.000393,0.003353,...,-0.001462,-0.000386,0.000189,1.264222e-03,0.000108,0.001854,0.002203,-0.000884,-0.001552,LinkCut
1580,2022-12-03 05:47:23+09:00,5,C,c,28.143853,16.868132,-0.000913,-0.000689,-0.000393,0.003353,...,-0.001462,-0.000386,0.000189,1.264222e-03,0.000108,0.001854,0.002203,-0.000884,-0.001552,LinkCut
4841,2022-12-10 19:09:41+09:00,5,D,e,20.612935,16.650187,0.001308,-0.000385,0.001037,0.000290,...,0.001452,0.000737,0.000794,1.466250e-04,0.001374,-0.001111,-0.000146,-0.001877,-0.000941,PowerFail
8335,2022-12-22 07:42:44+09:00,5,C,c,28.144840,16.893222,-0.000913,-0.000689,-0.000393,0.003353,...,-0.001462,-0.000386,0.000189,1.264222e-03,0.000108,0.001854,0.002203,-0.000884,-0.001552,LinkCut
525,2022-12-01 19:23:04+09:00,5,D,e,28.143779,16.865088,0.002821,0.002356,0.002445,-0.001251,...,0.000944,-0.000232,-0.001146,-2.098695e-03,0.002648,0.000347,0.002082,-0.000891,0.002884,LinkCut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3764,2022-12-07 01:07:43+09:00,5,E,f,28.144050,16.871516,-0.001075,0.000416,0.000290,0.001151,...,0.000646,0.000066,-0.000710,-8.881325e-07,-0.002323,0.001164,0.001235,-0.000422,-0.001714,LinkCut
9104,2022-12-24 13:14:27+09:00,7,F,h,28.144955,16.895775,-0.000846,0.001316,0.001775,0.000833,...,0.000627,-0.001644,-0.002288,2.155288e-03,-0.002362,0.001379,0.001334,-0.001510,-0.001159,LinkCut
3019,2022-12-05 19:18:49+09:00,5,E,f,28.143986,16.869298,0.002821,0.002356,0.002445,-0.001251,...,0.000944,-0.000232,-0.001146,-2.098695e-03,0.002648,0.000347,0.002082,-0.000891,0.002884,LinkCut
2580,2022-12-04 23:36:54+09:00,5,E,f,28.143943,16.869401,0.002821,0.002356,0.002445,-0.001251,...,0.000944,-0.000232,-0.001146,-2.098695e-03,0.002648,0.000347,0.002082,-0.000891,0.002884,LinkCut


## Auto-Modeling

In [44]:
# path 인자로 모델 저장 경로 지정 가능
predictor = TabularPredictor(label=target, path=save_path)
predictor.fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "/content/drive/MyDrive/kt_network_competition/AutogluonModels_kt_network/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   7.17 GB / 16.11 GB (44.5%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Train Data Rows:    7457
Train Data Columns: 70
Label Column: root_cause_type
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	3 unique label values:  ['LinkCut', 'PowerFail', 'UnitFail']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 3
Using Feature Generators to preprocess the data ...
Fitting AutoML

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x78fd8f804eb0>

## Evaluation

In [45]:
# 예측 및 평가
predictions = predictor.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification report:\n", classification_report(y_test, predictions))

Accuracy: 0.9983914209115281
Classification report:
               precision    recall  f1-score   support

     LinkCut       1.00      1.00      1.00      1619
   PowerFail       0.99      1.00      0.99       145
    UnitFail       1.00      0.98      0.99       101

    accuracy                           1.00      1865
   macro avg       1.00      0.99      0.99      1865
weighted avg       1.00      1.00      1.00      1865



In [46]:
# Autogluon이 사용한 모델 리스트
predictor.get_model_names()

['KNeighborsUnif',
 'KNeighborsDist',
 'NeuralNetFastAI',
 'LightGBMXT',
 'LightGBM',
 'RandomForestGini',
 'RandomForestEntr',
 'CatBoost',
 'ExtraTreesGini',
 'ExtraTreesEntr',
 'XGBoost',
 'NeuralNetTorch',
 'LightGBMLarge',
 'WeightedEnsemble_L2']

In [47]:
# Autogluon이 사용한 모든 모델에 대하여 평가하기
for model_name in predictor.get_model_names():
    print(f'모델 이름 : {model_name}')
    # test 값 예측
    pred = predictor.predict(x_test, model=model_name)
    # 정확도 계산
    accuracy = accuracy_score(y_test, pred)
    print("Accuracy:", accuracy)
    print("Classification report:\n", classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))

모델 이름 : KNeighborsUnif
Accuracy: 0.9260053619302949
Classification report:
               precision    recall  f1-score   support

     LinkCut       0.96      0.96      0.96      1619
   PowerFail       0.63      0.66      0.65       145
    UnitFail       0.85      0.75      0.80       101

    accuracy                           0.93      1865
   macro avg       0.81      0.79      0.80      1865
weighted avg       0.93      0.93      0.93      1865

[[1555   52   12]
 [  48   96    1]
 [  21    4   76]]
모델 이름 : KNeighborsDist
Accuracy: 0.9485254691689008
Classification report:
               precision    recall  f1-score   support

     LinkCut       0.98      0.97      0.97      1619
   PowerFail       0.75      0.87      0.81       145
    UnitFail       0.84      0.75      0.79       101

    accuracy                           0.95      1865
   macro avg       0.85      0.86      0.86      1865
weighted avg       0.95      0.95      0.95      1865

[[1567   38   14]
 [  18  126  

## 모델 불러오기

In [48]:
predictor = TabularPredictor.load(save_path)

In [49]:
# 예측 및 평가
predictions = predictor.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print("Classification report:\n", classification_report(y_test, predictions))

Accuracy: 0.9983914209115281
Classification report:
               precision    recall  f1-score   support

     LinkCut       1.00      1.00      1.00      1619
   PowerFail       0.99      1.00      0.99       145
    UnitFail       1.00      0.98      0.99       101

    accuracy                           1.00      1865
   macro avg       1.00      0.99      0.99      1865
weighted avg       1.00      1.00      1.00      1865

