# 밑준비


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, FastText

import pandas as pd
import numpy as np


from copy import deepcopy
from collections import Counter
import pytz, re, json

In [3]:
# Mount Google Drive; every data/model path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 전처리 옵션

In [5]:
# Embedding backend to load: 'fast' (FastText) or 'w2v' (Word2Vec).
embedding_model = 'fast'
# Characters replaced by a single space before alarm messages are tokenized:
# whitespace, '-', '_', '(', ')' and '/'.
pattern = r'[\s\-_\(\)\/]'

# Length that each per-ticket sequence is padded/truncated to (one name per model input).
alarmno_len = alarmlv_len = alarmmsg_len = site_len = sys_len = 12

# Abbreviation -> expanded phrase, used to spell out alarm-message tokens before embedding.
# The expansion is later split on whitespace, so punctuation inside a value (',' or '-')
# stays attached to the resulting tokens.
full_name = {
    'ais': 'alarm indication signal',
    'batt': 'battery',
    'cep': 'critical event prevention',
    'conn': 'connection',
    'crc': 'cyclic redundancy check',
    'csf': 'client signal fail',
    'dcc': 'data communication channel',
    'dcn': 'data communication network',
    'dc': 'direct current',
    'ddm': 'digital diagnostic monitoring',
    'env': 'environment',
    'err': 'error',
    'eth': 'ethernet',
    'fan': 'fan',
    'fl': 'frame loss',
    'idf': 'in-service data frame',
    'init': 'initialization',
    'ioc': 'input output controller',
    'ipc': 'inter process communication',
    'iof': 'ingress only filtering',
    'll': 'link-layer',
    'llcf': 'link level control field',
    # 'loc' used to be listed twice ('loss of communication', then 'loss of continuity').
    # A dict literal keeps only the last duplicate, so this is the value that was
    # actually in effect; the shadowed entry has been dropped.
    'loc': 'loss of continuity',
    'lof': 'loss of frame',
    'los': 'loss of signal',
    'lsp': 'label switched path',
    'meg': 'mega',
    'mep': 'maintenance entity group end point',
    'mis': 'misalignment',
    'ne': 'network element',
    'nvram': 'non volatile random access memory',
    'oam': 'operations, administration, and maintenance',
    'oamloss': 'operations administration and maintenance loss',
    'odi': 'optical delay interferometer',
    'opt': 'optical',
    'pde': 'path delay estimation',
    'pdm': 'physical data model',
    'poam': 'physical operations, administration, and maintenance',
    'ps': 'packet switching',
    'psm': 'power save mode',
    'psu': 'power supply unit',
    'pw': 'pseudowire',
    'pwe': 'pseudowire emulation',
    'ql': 'queue length',
    'rdi': 'remote defect indication',
    'rem': 'remote',
    'rfa': 'remote failure analysis',
    'rmt': 'remote',
    'rx': 'receiver',
    'rsmt': 'required signal-to-noise ratio margin threshold',
    'sfp': 'small form factor pluggable',
    'stm': 'synchronous transport module',
    'sys': 'system',
    'tca': 'threshold crossing alert',
    'tdm': 'time division multiplexing',
    'trk': 'track',
    'tx': 'transmit',
    'urc': 'unidirectional remote control',
}

# 데이터 불러오기

In [6]:
# Competition CSVs on the mounted drive.
train_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_train.csv'
test_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_test.csv'
# Sample submission file (note the spelling of this name; it is referenced as-is further down).
submisson_path = '/content/drive/MyDrive/my_data/kt_network/q2/Q2_label_sample.csv'
save_path = '/content/drive/MyDrive/kt_network_competition/AutogluonModels_kt_network'
# Saved gensim embedding models; which one is loaded depends on `embedding_model`.
w2v_path = '/content/drive/MyDrive/kt_network_competition/w2v_model'
fast_path = '/content/drive/MyDrive/kt_network_competition/fasttext_model'
# Prefix for the tokenizer JSON files; '<name>.json' is appended directly, so keep the trailing '/'.
tk_path = '/content/drive/MyDrive/kt_network_competition/'
# Saved Keras model used for prediction.
rnn_path = '/content/drive/MyDrive/my_data/kt_network/model/q2model.h5'

In [7]:
# Read the test alarms from the configured location rather than repeating the path
# literal, so changing `test_path` is enough to point the notebook at another file.
test = pd.read_csv(test_path)
test

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,sva,root_cause_domain
0,21812391.0,1671894138838,2022-12-25 00:02:16+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#15#1,16.0,15.0,NSA,C
1,21775988.0,1671894172511,2022-12-25 00:02:51+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
2,21792259.0,1671894204682,2022-12-25 00:03:22+09:00,4,DDM_RX_PWR_HIGH,AECE,afeg,X2FUA,2.0,1.0,NSA,B
3,21812412.0,1671894215702,2022-12-25 00:03:33+09:00,5,BATT_ENV_FAIL,ACCN,aclp,---,,,SA,B
4,21812417.0,1671894220812,2022-12-25 00:03:39+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C
...,...,...,...,...,...,...,...,...,...,...,...,...
37666,21986223.0,1672412311698,2022-12-30 23:58:30+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,NSA,C
37667,22015278.0,1672412316271,2022-12-30 23:58:33+09:00,4,DDM_RX_PWR_HIGH,AEMD,afsr,G16FU,5.0,6.0,SA,B
37668,21986426.0,1672412317238,2022-12-30 23:58:34+09:00,5,MEP_LSP_RDI,ACMY,acxj,G2FUA,1.0,1.0,NSA,B
37669,22015300.0,1672412373531,2022-12-30 23:59:32+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,NSA,C


# 임베딩 모델 불러오기

In [8]:
# Load the pre-trained embedding model selected by `embedding_model`.
if embedding_model == 'w2v':
    model = Word2Vec.load(w2v_path)
elif embedding_model == 'fast':
    model = FastText.load(fast_path)
else:
    # Fail here instead of leaving `model` unset (or stale from an earlier run),
    # which would otherwise only surface later as an unrelated error.
    raise ValueError(
        f"Unknown embedding_model {embedding_model!r}; expected 'w2v' or 'fast'"
    )
# Sanity check: print the embedding vectors of a few known words.
print(model.wv['communication', 'data', 'network'])

[[ 2.33462139e-04  1.10550527e-03 -8.30097066e-04 -9.65214160e-04
   1.65458478e-03  1.84627905e-04  1.73240493e-03 -2.34158098e-04
  -2.08324310e-03 -1.01679834e-04  2.85268109e-03  7.06844672e-04
   1.54870690e-03  2.28503501e-04  9.95156239e-04  9.44359053e-04
  -1.14503561e-03  4.91483370e-04 -9.21080296e-04  1.61688324e-04
   6.10307325e-04  6.01704989e-04  3.35507822e-04 -9.15470295e-08
  -4.91310318e-04 -8.73949553e-04 -1.35218864e-03  1.55750095e-04
  -1.55129033e-04 -8.36733088e-05  1.01745839e-03  7.60567491e-04
  -2.55621108e-03 -6.10819203e-04  4.00126482e-05 -1.01661379e-03
   2.64662970e-03  1.94886758e-04 -5.50074619e-04  6.32610230e-04
  -9.65014915e-04 -8.58902989e-04 -1.65244425e-03  1.20835635e-03
   1.42975070e-03 -1.72063825e-03  7.63372867e-04 -9.44979955e-04
  -5.51469275e-05 -9.46307147e-04  2.99201114e-03 -7.33090856e-04
  -1.73058652e-04 -7.49957835e-05  2.86441045e-05 -2.30262452e-03
  -3.40513740e-04  5.78511972e-04  1.29883550e-03  2.06581061e-03
   1.45014

# 전처리 및 sequence 데이터화

In [9]:
# Expand abbreviations into their full phrases and tokenize the result.
def expand_abbr(abbr_lst, abbr_dict):
    """Replace known abbreviations with the words of their expansion.

    params
        abbr_lst : list[str] - tokens to process, in order
        abbr_dict : dict[str, str] - abbreviation -> expanded phrase
    return
        list[str] - tokens found in ``abbr_dict`` are replaced by the
        whitespace-split words of their expansion; other tokens are kept only
        when they are longer than one character.
    """
    expanded = []
    for token in abbr_lst:
        if token in abbr_dict:
            expanded += abbr_dict[token].split()
        elif len(token) > 1:
            expanded.append(token)
    return expanded

# Average a stack of vectors.
def mean_vector(vectors):
    """Return the element-wise mean of ``vectors`` taken along the first axis."""
    averaged = np.mean(vectors, axis=0)
    return averaged


def _split_code_chars(codes, prefix, width=4):
    """Explode fixed-width codes into one single-character column per position.

    params
        codes : Series - string codes, each expected to be exactly ``width`` characters
        prefix : str - column name prefix; columns are named ``f'{prefix}{position}'``
        width : int - required code length
    return
        DataFrame indexed like ``codes`` with ``width`` columns
    raises
        ValueError - if any value is not a string of exactly ``width`` characters
    """
    values = codes.tolist()
    malformed = [v for v in values if not isinstance(v, str) or len(v) != width]
    if malformed:
        raise ValueError(
            f"'{prefix}' values must be strings of exactly {width} characters; "
            f"got e.g. {malformed[0]!r}"
        )
    # Reuse the caller's index so a later column-wise concat lines rows up correctly.
    return pd.DataFrame(
        {f'{prefix}{pos}': [v[pos] for v in values] for pos in range(width)},
        index=codes.index,
    )


def feature_engineering(data, pattern, abbr_dict, embedding=None):
    '''
    Build per-alarm features from the raw alarm table.

    params
        data : DataFrame - raw alarms; reads the columns ticketno, alarmno, alarmtime,
            alarmmsg_original, alarmlevel, site and sysname. Not modified.
        pattern : str - regex; every match in the lower-cased alarm message is
            replaced by a space before tokenizing
        abbr_dict : dict - abbreviation -> expanded phrase, passed to expand_abbr
        embedding : object exposing ``.wv[list_of_tokens]`` (optional) - embedding
            model to use; defaults to the notebook-level ``model``
    return
        DataFrame - a copy of ``data`` with ticketno cast to int64, alarmtime converted
        to Asia/Seoul timestamps, alarmlevel cast to object, plus the new columns
        ticketno_log1p, alarmno_log1p, alarmmsg_vector (one array per row with a leading
        axis of length 1), site0..site3 and sysname0..sysname3.
    raises
        ValueError - if a site or sysname value is not a 4-character string
    '''
    if embedding is None:
        # Backward-compatible default: the embedding model loaded earlier in the notebook.
        embedding = model

    temp = data.copy()

    # ticketno as int64.
    temp['ticketno'] = temp['ticketno'].astype('int64')

    # log1p-transformed ticketno and alarmno.
    temp['ticketno_log1p'] = np.log1p(temp['ticketno'])
    temp['alarmno_log1p'] = np.log1p(temp['alarmno'])

    # alarmtime as timezone-aware timestamps expressed in Asia/Seoul.
    temp['alarmtime'] = pd.to_datetime(temp['alarmtime'], unit='ns', utc=True).dt.tz_convert('Asia/Seoul')

    def _embed_message(message):
        # Replace separator characters, tokenize, expand abbreviations, embed each
        # token, then average the token vectors into one message vector.
        cleaned = re.sub(pattern, ' ', message)
        tokens = re.findall('[A-Za-z0-9]+', cleaned)
        tokens = expand_abbr(tokens, abbr_dict)
        token_vectors = embedding.wv[tokens]
        # The leading axis makes the per-alarm vectors easy to concatenate per ticket.
        return np.expand_dims(mean_vector(token_vectors), axis=0)

    temp['alarmmsg_vector'] = temp['alarmmsg_original'].str.lower().apply(_embed_message)

    # alarmlevel is treated as a category, not a number.
    temp['alarmlevel'] = temp['alarmlevel'].astype('object')

    # Split site and sysname into one column per character. Both frames carry
    # temp's index, so the column-wise concat cannot misalign rows even when
    # `data` does not have a default 0..n-1 index.
    site_df = _split_code_chars(temp['site'], 'site')
    sysname_df = _split_code_chars(temp['sysname'], 'sysname')
    temp = pd.concat([temp, site_df, sysname_df], axis=1)

    return temp

In [10]:
def make_seq_data(data, train=True, target_col=None):
    """Collapse per-alarm rows into one sequence row per ticket.

    params
        data : DataFrame - output of feature_engineering; reads the columns ticketno,
            alarmno_log1p, ticketno_log1p, alarmlevel, alarmmsg_vector, site1 and
            sysname1. Not modified.
        train : bool - when true, the first label seen for each ticket is attached
        target_col : str (optional) - name of the label column; when omitted the
            notebook-level ``target`` variable is used (only looked up if ``train``)
    return
        DataFrame with one row per ticketno (ascending) and a default integer index:
            ticketno
            alarmno_log1p, ticketno_log1p : list of float, in original row order
            alarmlevel, site1, sysname1 : values joined into one space-separated string
            alarmmsg_vector : the ticket's per-alarm arrays concatenated along axis 0
            timesteps : number of alarms in the ticket
            <label column> : only when ``train`` is true
    """
    label_col = None
    if train:
        label_col = target if target_col is None else target_col

    def _object_column(cells):
        # Fill an object array cell by cell so that lists/arrays are stored as single
        # values instead of being interpreted as extra dimensions.
        column = np.empty(len(cells), dtype=object)
        for position, cell in enumerate(cells):
            column[position] = cell
        return column

    collected = {
        name: []
        for name in ('ticketno', 'alarmno_log1p', 'ticketno_log1p', 'alarmlevel',
                     'alarmmsg_vector', 'site1', 'sysname1', 'timesteps')
    }
    if label_col is not None:
        collected[label_col] = []

    # A single pass over the groups keeps every derived column aligned by construction.
    for ticket, rows in data.groupby('ticketno', sort=True):
        collected['ticketno'].append(ticket)
        collected['alarmno_log1p'].append([float(v) for v in rows['alarmno_log1p']])
        collected['ticketno_log1p'].append([float(v) for v in rows['ticketno_log1p']])
        collected['alarmlevel'].append(' '.join(map(str, rows['alarmlevel'])))
        collected['alarmmsg_vector'].append(
            np.concatenate(list(rows['alarmmsg_vector']), axis=0)
        )
        collected['site1'].append(' '.join(rows['site1']))
        collected['sysname1'].append(' '.join(rows['sysname1']))
        collected['timesteps'].append(len(rows))
        if label_col is not None:
            # First label in original row order for this ticket.
            collected[label_col].append(rows[label_col].iloc[0])

    object_valued = {'alarmno_log1p', 'ticketno_log1p', 'alarmmsg_vector'}
    temp_seq = pd.DataFrame({
        name: (_object_column(cells) if name in object_valued else cells)
        for name, cells in collected.items()
    })

    return temp_seq

In [11]:
# Build per-alarm features for the test set; `full_name` supplies the abbreviation expansions.
temp2 = feature_engineering(test, pattern, full_name)
temp2

Unnamed: 0,ticketno,alarmno,alarmtime,alarmlevel,alarmmsg_original,site,sysname,unit,slot,port,...,alarmno_log1p,alarmmsg_vector,site0,site1,site2,site3,sysname0,sysname1,sysname2,sysname3
0,21812391,1671894138838,2022-12-25 00:02:16+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#15#1,16.0,15.0,...,28.144978,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",A,E,A,Q,a,f,b,d
1,21775988,1671894172511,2022-12-25 00:02:51+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,...,28.144978,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",A,D,Z,W,a,e,z,n
2,21792259,1671894204682,2022-12-25 00:03:22+09:00,4,DDM_RX_PWR_HIGH,AECE,afeg,X2FUA,2.0,1.0,...,28.144978,"[[0.0005182036, -0.000920635, -0.0012859794, -...",A,E,C,E,a,f,e,g
3,21812412,1671894215702,2022-12-25 00:03:33+09:00,5,BATT_ENV_FAIL,ACCN,aclp,---,,,...,28.144978,"[[-0.0017245616, -0.00029658875, -0.0014544856...",A,C,C,N,a,c,l,p
4,21812417,1671894220812,2022-12-25 00:03:39+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,...,28.144978,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",A,E,A,Q,a,f,b,d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37666,21986223,1672412311698,2022-12-30 23:58:30+09:00,5,Loss Of Signal,ADZW,aezn,r1sr1sl13/ETHLocPort#20#1,13.0,20.0,...,28.145288,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",A,D,Z,W,a,e,z,n
37667,22015278,1672412316271,2022-12-30 23:58:33+09:00,4,DDM_RX_PWR_HIGH,AEMD,afsr,G16FU,5.0,6.0,...,28.145288,"[[0.0005182036, -0.000920635, -0.0012859794, -...",A,E,M,D,a,f,s,r
37668,21986426,1672412317238,2022-12-30 23:58:34+09:00,5,MEP_LSP_RDI,ACMY,acxj,G2FUA,1.0,1.0,...,28.145288,"[[-0.0017422084, -0.00074117514, 0.00022273058...",A,C,M,Y,a,c,x,j
37669,22015300,1672412373531,2022-12-30 23:59:32+09:00,5,Loss Of Signal,AEAQ,afbd,r1sr1sl16/ETHLocPort#16#1,16.0,16.0,...,28.145288,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",A,E,A,Q,a,f,b,d


In [12]:
# Collapse the per-alarm rows into one sequence row per ticket; train=False skips label attachment.
test_seq = make_seq_data(temp2, train=False)
test_seq

Unnamed: 0,ticketno,alarmno_log1p,ticketno_log1p,alarmlevel,alarmmsg_vector,site1,sysname1,timesteps
0,15238899,"[20.58714799033169, 20.587147994914957, 20.587...","[16.539361927140092, 16.539361927140092, 16.53...",4 7 4 7,"[[0.00034126244, -0.000639107, 0.00067938084, ...",B B B B,b b b b,4
1,15712444,"[20.593143995476403, 20.593143994337435]","[16.569963631476742, 16.569963631476742]",4 4,"[[0.00034126244, -0.000639107, 0.00067938084, ...",E E,f f,2
2,15723187,"[20.593261748634625, 20.593265157156104, 20.59...","[16.57064712338659, 16.57064712338659, 16.5706...",5 4 4,"[[-0.0009484015, -0.0005296781, -0.0012801687,...",C C C,d d d,3
3,15737103,"[20.593405691325867, 20.593405694741875, 20.59...","[16.57153179419051, 16.57153179419051, 16.5715...",4 7 7 4,"[[0.00034126244, -0.000639107, 0.00067938084, ...",C C C C,d d d d,4
4,15737132,"[20.59340588603827, 20.59340588376093, 20.5934...","[16.57153363696751, 16.57153363696751, 16.5715...",7 4 7 4,"[[0.0009238396, -0.00025987678, 0.00069495384,...",C C C C,d d d d,4
...,...,...,...,...,...,...,...,...
4322,22015278,[28.145288200831267],[16.907247270269067],4,"[[0.0005182036, -0.000920635, -0.0012859794, -...",E,f,1
4323,22015300,[28.145288235069234],[16.90724826957455],5,"[[-0.00087648333, 0.00029225633, 0.0017456828,...",E,f,1
4324,23818326,"[28.148832739791523, 28.14883271728848, 28.148...","[16.98596588432827, 16.98596588432827, 16.9859...",5 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4,"[[0.0011685278, -0.0014730846, 0.0008064375, 0...",E E E E E E E E E E E E E E E E E,f f f f f f f f f f f f f f f f f,17
4325,23819373,[28.148834578650376],[16.986009841109485],5,"[[0.0011685278, -0.0014730846, 0.0008064375, 0...",E,f,1


# json으로 저장된 tokenizer 불러오기

In [15]:
# Restore the fitted tokenizers, keyed by name, from '<tk_path><name>.json'.
tokenizers = {}
for name in ('alarmlv_tk', 'site1_tk', 'sysname1_tk'):
    with open(f'{tk_path}{name}.json', 'r') as file:
        serialized = json.load(file)
    tokenizers[name] = tokenizer_from_json(serialized)


# tokenizer 및 padding 적용

In [16]:
# Front-pad / back-truncate each ticket's alarm-number sequence to `alarmno_len`,
# then add a trailing axis of length 1.
alarmno_padseq = np.expand_dims(
    pad_sequences(
        list(test_seq['alarmno_log1p']),
        maxlen=alarmno_len,
        dtype='float',
        padding='pre',
        truncating='post',
    ),
    axis=-1,
)
alarmno_padseq[0]

array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [20.58714799],
       [20.58714799],
       [20.58714799],
       [20.58714799]])

In [18]:
# Turn the space-separated alarm levels into index sequences with the restored
# tokenizer, then front-pad / back-truncate them to `alarmlv_len`.
alarmlv_idxseq = tokenizers['alarmlv_tk'].texts_to_sequences(test_seq['alarmlevel'])
alarmlv_padseq = pad_sequences(
    alarmlv_idxseq,
    maxlen=alarmlv_len,
    padding='pre',
    truncating='post',
)
alarmlv_padseq[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 3, 2], dtype=int32)

In [19]:
# Front-pad / back-truncate each ticket's stack of message vectors to `alarmmsg_len` steps.
msgvec_padseq = pad_sequences(
    test_seq['alarmmsg_vector'],
    maxlen=alarmmsg_len,
    dtype='float',
    padding='pre',
    truncating='post',
)
msgvec_padseq[0]

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0

In [20]:
# site1: characters -> indices -> fixed-length sequence.
site1_idxseq = tokenizers['site1_tk'].texts_to_sequences(test_seq['site1'])
site1_padseq = pad_sequences(
    site1_idxseq, maxlen=site_len, padding='pre', truncating='post'
)

# sysname1: same treatment with its own tokenizer and length.
sysname1_idxseq = tokenizers['sysname1_tk'].texts_to_sequences(test_seq['sysname1'])
sysname1_padseq = pad_sequences(
    sysname1_idxseq, maxlen=sys_len, padding='pre', truncating='post'
)
site1_padseq[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int32)

# RNN 모델 불러오기

In [21]:
# Load the saved Keras model from `rnn_path`.
rnn_model = load_model(rnn_path)

# 예측

In [22]:
# The five padded inputs, passed to the model in this order.
model_inputs = [
    alarmno_padseq,
    alarmlv_padseq,
    msgvec_padseq,
    site1_padseq,
    sysname1_padseq,
]
prediction = rnn_model.predict(model_inputs)
prediction



array([[6.6862015e-08, 9.9992830e-01, 7.1461785e-05],
       [2.2456188e-08, 9.9997562e-01, 2.4375906e-05],
       [1.2096277e-10, 9.9999100e-01, 8.9610012e-06],
       ...,
       [1.0000000e+00, 5.8054872e-09, 3.1583427e-11],
       [9.9941027e-01, 5.8879791e-04, 9.3333375e-07],
       [9.9999928e-01, 6.7651513e-07, 6.1541115e-12]], dtype=float32)

In [23]:
submission = pd.read_csv(submisson_path)

# Attach each prediction to its ticket by ticket number instead of by row position,
# so the result does not depend on the sample file being ordered like `test_seq`.
# The lookup index is cast to the submission's ticketno dtype so the keys compare equal.
class_by_ticket = pd.Series(
    prediction.argmax(axis=1),
    index=test_seq['ticketno'].to_numpy().astype(submission['ticketno'].dtype),
)
submission['root_cause_type'] = submission['ticketno'].map(class_by_ticket)

# A ticket without a prediction would silently become NaN; fail loudly instead.
if submission['root_cause_type'].isna().any():
    raise ValueError('Some submission tickets have no prediction in test_seq')
submission['root_cause_type'] = submission['root_cause_type'].astype('int64')
submission

Unnamed: 0,ticketno,root_cause_type
0,15238899.0,1
1,15712444.0,1
2,15723187.0,1
3,15737103.0,1
4,15737132.0,1
...,...,...
4322,22015278.0,1
4323,22015300.0,0
4324,23818326.0,0
4325,23819373.0,0


In [24]:
# Class name -> class index, and the inverse lookup used to decode predictions.
label_dict = {'LinkCut' : 0, 'PowerFail' : 1, 'UnitFail' : 2}
label_dict_r = {index: name for name, index in label_dict.items()}
label_dict, label_dict_r

({'LinkCut': 0, 'PowerFail': 1, 'UnitFail': 2},
 {0: 'LinkCut', 1: 'PowerFail', 2: 'UnitFail'})

In [25]:
# Decode class indices to class names. `replace` leaves values that are not keys of
# `label_dict_r` untouched, so re-running this cell keeps the already-decoded names
# instead of turning them all into NaN (which `map` would do on a second run).
submission['root_cause_type'] = submission['root_cause_type'].replace(label_dict_r)
submission['root_cause_type']

0       PowerFail
1       PowerFail
2       PowerFail
3       PowerFail
4       PowerFail
          ...    
4322    PowerFail
4323      LinkCut
4324      LinkCut
4325      LinkCut
4326      LinkCut
Name: root_cause_type, Length: 4327, dtype: object

In [26]:
# Count how many tickets were assigned to each predicted class.
unique_elements, counts_elements = np.unique(submission["root_cause_type"], return_counts=True)
print("각 유니크 요소의 개수:", unique_elements, counts_elements)

각 유니크 요소의 개수: ['LinkCut' 'PowerFail' 'UnitFail'] [3569  537  221]
