In [5]:
import pandas as pd
import os
from collections import Counter
from glob import glob
import numpy as np
import sys
from pprint import pprint 


def show_progress(i, max_iter):
    sys.stdout.write('\r Progess {}/{}'.format(i, max_iter))
    sys.stdout.flush()
    

# 1. Check Log Data Features 
![Imgur](https://i.imgur.com/XZhqx4f.png)
 - Log Data 의 각 Features 의 특징에 대해 알아봅니다. 
 - Log Structure 

 - 계층 구조 
     - 상 : [Time] [Ip] [Request Type] [Dummy] [Data] 
     - 중 : [payId] [mode] [mileage] [userName] [senderName] [senderPhone] [carrier] [brand] [totalPrice] [payMethod] [retCode] [retMsg] [chargingAmount] [resultMessage] [resultResponse] [pgAgreeNo] [pgChannel] [payDt] [payTm] [userNo]
     [buyTotalCnt] [pgPayResult] [sendType] [earnMileage] [address] [couponCode] [couponDiscountRate] [couponDiscountType] [instDt] [instTm] [updtDt] [updtTm] **[b2cBuyHistVOList]** **[sc]** [nonmemberName] [nonmemberTel]
     [nonmemberPw] [nonmemberEmail] [billingCountry] [authToken] [deposit] [cancelDt] [cancelTm] [priceVat] [schedDay] [schedTime] [ipAddress] [cardNumberLast4] [instId] [updtId] **[isFraudWhite]** **[isFraudBlack]** [comBuyRealTimeSendHistVOList] **[comBuyResSendHistVOList]** [clientIP]
     - 하 : 


 - 계층 구조 
     - 상 : [Time] **[Session id]** [ip] [type] [Request Type] [Dummy] [Data]
     - 중 : [payId] [mode] [mileage] [userName] [senderName] [senderPhone] [carrier] [brand] [totalPrice] [payMethod] [retCode] [retMsg] [chargingAmount] [resultMessage] [resultResponse] [pgAgreeNo] [pgChannel] [payDt] [payTm] [userNo]
     [buyTotalCnt] [pgPayResult] [sendType] [earnMileage] [address] [couponCode] [couponDiscountRate] [couponDiscountType] [instDt] [instTm] [updtDt] [updtTm] **[userAgent]** **[b2cBuyHistVOList]** **['sc]** [nonmemberName] [nonmemberTel]
     [nonmemberPw] [nonmemberEmail] [billingCountry] [authToken] [deposit] [cancelDt] [cancelTm] [priceVat] [schedDay] [schedTime] [ipAddress] [cardNumberLast4] [instId] [updtId] [isFraudWhite] [isFraudBlack] [comBuyRealTimeSendHistVOList] [comBuyResSendHistVOList] [clientIP] **[openMarketOrderId]** **[openMarketItemId]** **[isInstallment]** **[optionalResponse]**
     - 하 : 


**TODO : 각 계층 별 Feature 특징 및 의미 파악하기**

## 2. Log Data 해석
- 결제 시도 FLOW
    - [consent] => [getPaymentInfo] => [paymentComplete]
- consent 만 시도하면 Blacklist 로 넣는다. 
- 브렌드 별로 Fraud 가 많다.

## 2. 계층 구조 상 파악 

### 2.1 [계층 구조 상] 데이터 파악 
 - 데이터 구조 파악 결과 : 
     - 주어진 모든 파일에는 [계층 구조 상] Feature 가 5개 또는 7개가 있음
 

In [6]:
"""
Cell : 각 파일별 [계층 구조 상] Feature 개 수 측정
"""


files = glob('./payment-11-logs/payment-11-logs/*')
n_features = []
total_logs = 0
f5_lines = []
f7_lines = []
for file in files:
    # open file 
    f = open(file, encoding='UTF-8')
    
    # read all lines from file 
    lines = f.readlines()

    # total log line count 
    total_logs += len(lines)
    
    # 각 line 별로 Feature 개 수를 파악함. 참고로 line seperator 는 \t 
    for line in lines:
        features = line.split('\t')
        n_features.append(len(features))

        # 각 Log Data 개 수 별로 따로 로그를 저장함. 
        # 아래 셀에서 Test 코드를 작성 하기 위함. 
        if len(features) == 5:
            f5_lines.append(features)
        elif len(features) == 7:
            f7_lines.append(features)
        else:
            pass;

    print('File Name : {} # Features : {} # Logs : {}'.format(file, set(n_features), len(lines)))

print('Total Log Lines : {}'.format(total_logs))
print('Total n feature : 5 , Log Lines : {}'.format(len(f5_lines)))
print('Total n feature : 7 , Log Lines : {}'.format(len(f7_lines)))
    

File Name : ./payment-11-logs/payment-11-logs/payment-202005.txt # Features : {1, 7} # Logs : 92397
File Name : ./payment-11-logs/payment-11-logs/payment-202004.txt # Features : {1, 7} # Logs : 96514
File Name : ./payment-11-logs/payment-11-logs/payment-201910.txt # Features : {1, 7} # Logs : 107906
File Name : ./payment-11-logs/payment-11-logs/payment-202001.txt # Features : {1, 7} # Logs : 126764
File Name : ./payment-11-logs/payment-11-logs/payment-201911.txt # Features : {1, 7} # Logs : 159671
File Name : ./payment-11-logs/payment-11-logs/payment-201907.txt # Features : {1, 5, 7} # Logs : 62622
File Name : ./payment-11-logs/payment-11-logs/payment-202003.txt # Features : {1, 5, 7} # Logs : 92450
File Name : ./payment-11-logs/payment-11-logs/payment-202002.txt # Features : {1, 5, 7} # Logs : 113469
File Name : ./payment-11-logs/payment-11-logs/payment-201912.txt # Features : {1, 5, 7} # Logs : 167541
File Name : ./payment-11-logs/payment-11-logs/payment-201908.txt # Features : {1, 5

### 2.2 [계층 구조 상][Test] 각 file 별 Feature 종류 보증 확인
 - 각 파일별 Feature가 같은 위치와 같은 종류인지를 확인
 - 결과 :
     - ip : User 식별자로 사용
     - Request Type 이라고 예측되는 Feature 는 아래 3가지 데이터 Type 을 가지고 있음
         - [consent]            
         - [getPaymentInfo]     
         - [paymentComplete]
     - dummy
         - 아무 데이터도 없음

### 2.2.1 [계층 구조 상][Test] 각 file 별 Feature 종류가 5개인 데이터 보증

In [7]:
# [계층 구조 중] 인 Data 제거 : [계층 구조 상] 의 Feature 구조를 파악하기 위해 
df = pd.DataFrame(np.array(f5_lines)[:, :-1], columns=['time', 'ip', 'req_type', 'dummy'])

print('User 수 : ', len(df.ip.value_counts()))
print('------------------------------------')

print('Request Type')
print(df.req_type.value_counts())
print('------------------------------------')

print('Dummy')
print(df.dummy.value_counts())


User 수 :  1092
------------------------------------
Request Type
[consent]            8766
[getPaymentInfo]     5808
[paymentComplete]    5498
Name: req_type, dtype: int64
------------------------------------
Dummy
-    20072
Name: dummy, dtype: int64


#### [계층 구조 중] 2.2.2.1 세부 데이터 확인
 : [계층 구조 중]의 Key는 크게 2가지 Type 으로 나뉘어진다.
  
|       | 33                 | 55                           | data type (example)                                                                                                                                 | Description                                                   | use or not |
|-------|--------------------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------|------------|
|       |                    | 'sc'                         | Json                                                                                                                                                |                                                               | x          |
|       | payId              | 'payId'                      | string (190703000193)                                                                                                                               | ID when  generated on payment                                 |            |
|       |                    | nonmemberName                | [nan]                                                                                                                                               |                                                               |            |
|       |                    | nonmemberTel                 | [nan]                                                                                                                                               |                                                               |            |
|       |                    | nonmemberPw                  | [nan]                                                                                                                                               |                                                               |            |
|       |                    | nonmemberEmail               | [nan]                                                                                                                                               |                                                               |            |
|       | mode               | mode                         | 33: ['SHOP']                                                                                                                                        | # 33 data only have ['SHOP'] feature                          |            |
|       | mileage            | mileage                      | number (3000, 2700)                                                                                                                                 |                                                               |            |
|       | userName           | userName                     | string ('TREATS')                                                                                                                                   | User Name                                                     |            |
|       | senderName         | senderName                   | string ('TEST4')                                                                                                                                    | person to receive a gift                                      |            |
|       | senderPhone        | senderPhone                  | '639565041094'                                                                                                                                      | phonenumber to receive a gift or phone number to give a gift  |            |
|       | carrier            | carrier                      | [None 'GLOBE' 'SMART' 'SUN' 'UNKNOWN']                                                                                                              |                                                               |            |
|       | brand              | brand                        | [None 'PREPAID' 'POSTPAID' 'BUDDY' 'TM' 'TNT' 'SUN-PREPAID' 'SUN-POSTPAID' 'BRO PREPAID' 'SMART_BUDDY' 'UNKNOWN' 'SMART_SMART_POSTPAID' 'SMART_TNT' |                                                               |            |
|       | totalPrice         | totalPrice                   | number  (3.00e+03)                                                                                                                                  |                                                               |            |
|       | payMethod          | payMethod                    | ['9' '7' '1']                                                                                                                                       |                                                               |            |
|       | retCode            | retCode                      | [None '00' '1' '0' '404' '400' '403' '502' '10']                                                                                                    |                                                               |            |
|       | retMsg             | retMsg                       | [None 'SUCCESS' 'FAILURE' 'PENDING']                                                                                                                |                                                               |            |
|       | chargingAmount     | chargingAmount               | number  (nan, 3.50e+0)                                                                                                                              |                                                               |            |
|       |                    | billingCountry               | [nan None 'PH']                                                                                                                                     |                                                               |            |
|       |                    | authToken                    | -                                                                                                                                                   |                                                               |            |
|       | resultMessage      | resultMessage                | message from server                                                                                                                                 |                                                               |            |
|       | resultResponse     | resultResponse               | -string                                                                                                                                             |                                                               |            |
|       | pgAgreeNo          | pgAgreeNo                    | number                                                                                                                                              |                                                               |            |
|       | pgChannel          | pgChannel                    | ['PGC05' 'PGC02' 'PGC01' 'PGC03' 'PGC04' 'PGC12' 'PGC06' 'PGC13']                                                                                   |                                                               |            |
|       | payDt              | payDt                        | [None '20190703']                                                                                                                                   | pay to date                                                   |            |
|       | payTm              | payTm                        | [None '145922']                                                                                                                                     |                                                               |            |
|       | userNo             | userNo                       | User Number                                                                                                                                         | User Number  ? what is different with user id                 |            |
|       |                    | deposit                      | [nan  0.]                                                                                                                                           |                                                               |            |
|       |                    | cancelDt                     | [nan]                                                                                                                                               |                                                               |            |
|       |                    | cancelTm                     | [nan]                                                                                                                                               |                                                               |            |
|       |                    | priceVat                     | [nan                                                                                                                                                | 0.]                                                           |            |
|       | buyTotalCnt        | buyTotalCnt                  | [1]                                                                                                                                                 |                                                               |            |
|       | pgPayResult        | pgPayResult                  | [None '1000' '9900' '2000']                                                                                                                         |                                                               |            |
|       | sendType           | sendType                     | ['W' 'M' 'A']                                                                                                                                       |                                                               |            |
|       | earnMileage        | earnMileage                  | number (0)                                                                                                                                          |                                                               |            |
|       | address            | address                      | [nan]                                                                                                                                               |                                                               |            |
|       | couponCode         | couponCode                   | ['' None nan 'GRABTREATS']                                                                                                                          |                                                               |            |
|       | couponDiscountRate | couponDiscountRate           | [nan 77.]                                                                                                                                           |                                                               |            |
|       | couponDiscountType | couponDiscountType           | [None nan 'CPN_DSCT_BY_ONE_PRICE_SALE']                                                                                                             |                                                               |            |
|       |                    | schedDay                     | [nan]                                                                                                                                               |                                                               |            |
|       |                    | schedTime                    | [nan]                                                                                                                                               |                                                               |            |
|       |                    | ipAddress                    | [nan]                                                                                                                                               |                                                               |            |
|       |                    | cardNumberLast4              | [nan]                                                                                                                                               |                                                               |            |
|       | instDt             | instDt                       | Date : yyyymmdd (20190703)                                                                                                                          |                                                               |            |
|       | instTm             | instTm                       | (145922)                                                                                                                                            |                                                               |            |
|       |                    | instId                       | [nan '2USHOP']                                                                                                                                      |                                                               |            |
|       | updtDt             | updtDt                       | Date : yyyymmdd (20190703)                                                                                                                          |                                                               |            |
|       | updtTm             | updtTm                       | 145922                                                                                                                                              |                                                               |            |
|       |                    | updtId                       | [nan '2USHOP']                                                                                                                                      |                                                               |            |
|       | b2cBuyHistVOList   | b2cBuyHistVOList             | Json                                                                                                                                                |                                                               |            |
|       |                    | isFraudWhite                 | [nan False True]                                                                                                                                    |                                                               |            |
|       |                    | isFraudBlack                 | [nan False]                                                                                                                                         |                                                               |            |
|       |                    | comBuyRealTimeSendHistVOList | Json                                                                                                                                                |                                                               |            |
|       |                    | comBuyResSendHistVOList      | Json                                                                                                                                                |                                                               |            |
|       |                    | clientIP                     | [nan '130.105.186.28']                                                                                                                              |                                                               |            |
| # log |                    | 18601                        | 1471                                                                                                                                                |                                                               |            |
|       |                    |                              |                                                                                                                                                     |                                                               |            |

In [4]:
data = np.array(f5_lines)[:, -1]
null = None
false = False
true = True
total_keys = []
bucket_df = []
for d in data[:]:
    d = eval(d)
    total_keys.extend(d)
    bucket_df.append(d)

    # log 개수 확인 
Counter(total_keys)

line_33 = []
line_55 = []
[line_33.append(l) if len(l)==33 else line_55.append(l) for l in bucket_df ]
len(line_33), len(line_55)

(18601, 1471)

In [53]:
data = np.array(f5_lines)[:, -1]
null = None
false = False
true = True
total_keys = []
bucket_df = []

# 각 중위 계층 Feature를 데이터를 수집한 후 DataFrame 으로 변화한다.
for d in data[:]:
    d = eval(d)
    total_keys.extend(d)
    bucket_df.append(pd.Series(d))

In [52]:
#[중위 계층] Feature 을 분석한다. 
df = pd.DataFrame(bucket_df)
for c_name in df.columns:
    try:
        if not c_name == 'authToken':
            print(c_name, '\n' ,df[c_name].unique())
    except:
        print(c_name)


payId 
 ['190703000193' '190703000194' '190703000195' ... '190715001166'
 '190715001169' '190715001170']
mode 
 ['SHOP']
mileage 
 [3000.   nan 2700. 1350.  750.  400.  700.  300.  500. 1000.  450. 1250.
 6750. 4000. 1200.]
userName 
 ['TREATS' 'Karlo L' 'Trae M' ... 'Atlene' 'Rayle L' 'Janina']
senderName 
 ['TEST4' 'Karlo L' 'Justine' ... 'Rica' 'Kaori J' 'alex']
senderPhone 
 ['639565041094' '639176882243' '639264894644' ... '639260976099'
 '639202081492' '639278274578']
carrier 
 [None 'GLOBE' 'SMART' 'SUN' 'UNKNOWN']
brand 
 [None 'PREPAID' 'POSTPAID' 'BUDDY' 'TM' 'TNT' 'SUN-PREPAID' 'SUN-POSTPAID'
 'BRO PREPAID' 'SMART_BUDDY' 'UNKNOWN' 'SMART_SMART_POSTPAID' 'SMART_TNT']
totalPrice 
 [3.00e+03 3.50e+01 4.80e+02 1.27e+02 1.00e+02 1.10e+03 2.70e+02 4.50e+01
 4.00e+01 3.00e+01 2.00e+02 1.60e+02 7.00e+01 1.20e+02 5.00e+01 7.20e+01
 8.00e+01 5.80e+02 4.00e+02 3.95e+02 2.40e+02 1.35e+02 1.10e+02 5.00e+02
 3.60e+02 2.10e+02 5.50e+01 2.25e+02 9.80e+01 2.90e+01 1.90e+02 6.75e+02
 1.75e+02

In [29]:
def fn(data):
    level_1 = {} 
    level_2 = {} 
    level_3 = [] 

    level_1['top'] = []
    for key in data.keys():
        if type(data[key]) == list or type(data[key]) == dict:
            data_2 = data[key]
            level_2[key] = [] 
            for key_2 in data_2:
                print(key_2)
                if type(data_2[key_2]) == dict or type(data_2[key_2]) == list:
                    pass;
                else:
                    level_2[key].append(key)
        else: 
            level_1['top'].append(key)
            
    return level_1, level_2

In [30]:
# fn(eval(data[1]))
eval(data[1])['b2cBuyHistVOList'][0]

{'buyTotAmt': 3000.0,
 'discountAmt': -2775.0,
 'limitStartDate': '20190703',
 'limitEndDate': '20190901',
 'mmsTitle': None,
 'mmsSendMsg': 'I love you.',
 'goodsDispSeq': 10063,
 'goodsComName': 'Flower Chimp Inc.',
 'goodsSeq': 'G00000004166',
 'goodsName': 'P250 Worth Voucher',
 'saleAmt': 200.89,
 'saleVat': 24.11,
 'realAmt': 2678.6,
 'realVat': 321.4,
 'pgSaleAmt': 2678.6,
 'pgSaleVat': 321.4,
 'goodsTotCnt': 1,
 'resCustName': 'TEST4',
 'resCustPhoneNo': '639565041094',
 'calStd': '11720',
 'b2cBuyUserId': 'ad8adf98-3ebe-4972-8afc-cf2c5d821995',
 'b2cBuyUserPhoneNo': '639565041094',
 'b2cBuyUsername': 'TREATS',
 'payId': '190703000193',
 'userNo': 3303,
 'dispNo': '003007',
 'regDate': '20190703',
 'regTime': '145922',
 'uptDate': '20190703',
 'uptTime': '145922'}

In [8]:
data = np.array(f5_lines)[:10, -1]

### 2.2.2 [계층 구조 상][Test] 각 file 별 Feature 종류가 7개인 데이터 보증

In [8]:
# [계층 구조 중] 인 Data 제거 : [계층 구조 상] 의 Feature 구조를 파악하기 위해 
# 데이터의 갯수 때문에 분할해서 처리

In [9]:
import random

# # random 으로 추출된 데이터를 약 3,000,000 개 추출해 적용
random.shuffle(f7_lines)
n_samples = 10000


In [10]:
df = pd.DataFrame(np.array(f7_lines[:n_samples])[:, :-1], columns=['time', 'dummy1','ip', 'type', 'req_type', 'dummy'])

print('Test Log 수 : ', n_samples)
print('------------------------------------')

print('User 수 : ', len(df.ip.value_counts()))
print('------------------------------------')
print('------------------------------------')

print('dummy1')
print('#', len(df.dummy1.value_counts()))
print(df.dummy1.value_counts())
print('------------------------------------')
print('------------------------------------')

print('Type')
print(df.type.value_counts())
print('------------------------------------')
print('------------------------------------')

print('Request Type')
print(df.req_type.value_counts())
print('------------------------------------')
print('------------------------------------')

print('Dummy')
print(df.dummy.value_counts())




Test Log 수 :  10000
------------------------------------
User 수 :  8037
------------------------------------
------------------------------------
dummy1
# 8422
[FD6543AFF306D4EEEBCBA1307A558018]    18
[6103B06960C2943514485CFFC64EE138]    16
[53FFA6FFAE3B419B7EF5E412F31B45A4]    11
[671655C4EFC5E8E4E4FDE18F33C2F2CA]    11
[4B14C0FDF2256A6931EFFF36DB3FAB58]    10
                                      ..
[4397567DFF798F0A9CCB60A28D654F01]     1
[F52BC88C72C1A9628F45C08DDA9360EA]     1
[A5CB4F8489977964F4A8D680390A59C1]     1
[0ADB6BB2F17A3534586D6B3A7FA28FAB]     1
[58B7E8EB10FA988321EBC1F1169B9B85]     1
Name: dummy1, Length: 8422, dtype: int64
------------------------------------
------------------------------------
Type
INFO     9339
WARN      661
Name: type, dtype: int64
------------------------------------
------------------------------------
Request Type
[consent]            4172
[getPaymentInfo]     2999
[paymentComplete]    2829
Name: req_type, dtype: int64
----------------------

## 3. 계층 구조 중 파악 

- [계층 구조 상] 총 Feature 개 수 가 5개인 데이터의 [계층 구조 중] 파악

In [11]:
df = pd.DataFrame()
null = None
false = False
true = True

data = np.array(f5_lines)[:, -1].tolist()


In [12]:
df = pd.DataFrame()
null = None
false = False
true = True

middle_keys = []  
data = np.array(f5_lines)[:, -1].tolist()
for ind, d in enumerate(data):
    # 진행을 보여줍니다.
    d = eval(d)
    middle_keys.extend(d.keys())
Counter(middle_keys)

Counter({'payId': 20072,
         'mode': 20072,
         'mileage': 19806,
         'userName': 20072,
         'senderName': 20072,
         'senderPhone': 20072,
         'carrier': 20072,
         'brand': 20072,
         'totalPrice': 20072,
         'payMethod': 20072,
         'retCode': 20072,
         'retMsg': 20072,
         'chargingAmount': 20006,
         'resultMessage': 20072,
         'resultResponse': 20072,
         'pgAgreeNo': 19970,
         'pgChannel': 20072,
         'payDt': 20072,
         'payTm': 20072,
         'userNo': 20072,
         'buyTotalCnt': 20072,
         'pgPayResult': 20072,
         'sendType': 20072,
         'earnMileage': 20072,
         'address': 19806,
         'couponCode': 19806,
         'couponDiscountRate': 19806,
         'couponDiscountType': 19806,
         'instDt': 20072,
         'instTm': 20072,
         'updtDt': 20072,
         'updtTm': 20072,
         'b2cBuyHistVOList': 19806,
         'sc': 1205,
         'nonmemberNa

In [235]:
"""
파일 하나를 parsing 함.
:return:
"""
sample_path = os.path.join('.', 'sample', 'payment-201907.txt')
f = open(sample_path, encoding='UTF-8')

# read sample
# Eval function 을 사용하기 위해서 해당 txt 파일에 있지만 eval 할 수 없는 변수들을 지정함
null = None
false = False
true = True

# txt file parsing
buy_hists = []
len_elements= [] 
index_collector = {'len5_nested' : [],'len5_unnested' : [], 'len7_nested' : [], 'len7_unnested' : []} 
for idx, line in enumerate(f.readlines()):
    """
    Log format 은 아래와 같은 종류로 되어 있음
    1. hh:mm:ss \t [ip] \t [type] \t - or WARN \t {json obj} \n
    2. hh:mm:ss \t [ip] \t ip \t [type] \t - or WARN \t {json obj} \n

    WARN 이면 b2cBuyHistVOList 가 없다.   
    """
    
    n_elements = len(line.split('\t'))
    len_elements.append(n_elements)

    try:
        if n_elements == 5:
            time, log_id, cls, state, obj = line.split('\t')[:]
            length_number = 5

        elif n_elements == 7:
            time, log_id, ip, cls, state, _,  obj = line.split('\t')[:]
            length_number = 7

        elif n_elements == 1:
            continue

        else:
            print(n_type, line)
            
 
        obj = eval(obj)
        obj['type'] = cls

        # 해당 obj 는 Nested Dict 구조로 되어 있음
        # Nested 된 column name 은 b2cBuyHistVOList 임.
        # b2cBuyHistVOList 은 [obj] 구조로 되어 있음
        # * Warning, db 구조에 문제가 있는듯 왜 list 로 쌓여 있는거지? => 모든 경우의 수를 check 해야 함.*
        
        if 'b2cBuyHistVOList' in obj.keys():
            nested = 'nested'
        else:
            nested = 'unnested'
        
        key_name = 'len{}_{}'.format(length_number, nested)
        index_collector[key_name] += [(idx, obj)]  

    except IndexError as ie:
        # Error Case
        # Log 가 기록된 날짜를 지정한 Row
        print(line)
    except ValueError as ve:
        print(l## 2. 계층 구조 상 파악 ine)

    except KeyError as ke:
        print(line)
        pass;


### 1.1 [Log 계층 구조 상] Data Sample


In [253]:
sample_path = os.path.join('.', 'sample', 'payment-201907.txt')
f = open(sample_path, encoding='UTF-8')
lines = f.readlines()
index, obj  = index_collector['len5_unnested'][0]
lines[index].split('\t')[:-1]

['18:16:01', '[http-nio-10.0.0.205-443-exec-509]', '[getPaymentInfo]', '-']

In [254]:
sample_path = os.path.join('.', 'sample', 'payment-201907.txt')
f = open(sample_path, encoding='UTF-8')
lines = f.readlines()
index, obj  = index_collector['len5_nested'][0]
lines[index].split('\t')[:-1]

['14:59:22', '[http-nio-10.0.0.205-443-exec-507]', '[consent]', '-']

In [255]:
sample_path = os.path.join('.', 'sample', 'payment-201907.txt')
f = open(sample_path, encoding='UTF-8')
lines = f.readlines()
index, obj  = index_collector['len7_nested'][3]
lines[index].split('\t')[:-1]

['11:57:48',
 '[796B29912C0E82A87D1D7EDB92A19705]',
 '[130.105.10.127 ]',
 'INFO ',
 '[getPaymentInfo]',
 '-']

In [256]:
sample_path = os.path.join('.', 'sample', 'payment-201907.txt')
f = open(sample_path, encoding='UTF-8')
lines = f.readlines()
index, obj  = index_collector['len7_unnested'][3]
lines[index].split('\t')[:-1]

['17:20:09',
 '[9F51B6FE287E1447289E223464BD4890]',
 '[110.54.163.20  ]',
 'WARN ',
 '[getPaymentInfo]',
 '-']

### 1.2 종류별 데이터 개숫 

In [228]:
msgs = [print(' {} :  {}'.format(key, len(value))) for key, value in index_collector.items()]


 len5_nested :  19806
 len5_unnested :  266
 len7_nested :  42100
 len7_unnested :  393


## 2. Log 에서 `b2cBuyHistVOList` Feature 분석
 - (조건) Log 내 상위 Element 개 수 5

In [216]:
# 길이가 5개인 Log의 각 column 별로 고유한 columns 명을 확인합니다. 
keys = []
values = []

for index, obj in index_collector['len5_unnested']:
    keys.append(tuple(obj.keys()))
    values.append(tuple(obj.values()))
    

In [217]:
unique_key_type = set(keys)
unique_ele , counts = np.unique(np.concatenate(list(unique_key_type)), return_counts=True)

df_len5_unnested = pd.DataFrame(columns=unique_ele)

for index, obj in index_collector['len5_unnested']:
    series = pd.Series(obj, name=index)
    df_len5_unnested = df_len5_unnested.append(series)

In [218]:
df_len5_unnested.columns

Index(['authToken', 'billingCountry', 'brand', 'buyTotalCnt', 'carrier',
       'chargingAmount', 'deposit', 'earnMileage', 'instDt', 'instId',
       'instTm', 'mode', 'payDt', 'payId', 'payMethod', 'payTm', 'pgAgreeNo',
       'pgChannel', 'pgPayResult', 'priceVat', 'resultMessage',
       'resultResponse', 'retCode', 'retMsg', 'sendType', 'senderName',
       'senderPhone', 'totalPrice', 'type', 'updtDt', 'updtId', 'updtTm',
       'userName', 'userNo'],
      dtype='object')

In [219]:
print("총 Columns 개 수 : ", len(df_len5_unnested.columns))
for col_name in df_len5_unnested.columns:
    if col_name != 'authToken' and col_name != 'resultResponse' :        
        print('column name : {} \ncategorical {}'.format(col_name, list(df_len5_unnested[col_name].value_counts().index)))
        print('--------------------------------------------------------')

총 Columns 개 수 :  34
column name : billingCountry 
categorical ['PH']
--------------------------------------------------------
column name : brand 
categorical ['PREPAID', 'TM', 'BUDDY', 'TNT', 'POSTPAID', 'SUN-PREPAID', 'BRO PREPAID']
--------------------------------------------------------
column name : buyTotalCnt 
categorical [1]
--------------------------------------------------------
column name : carrier 
categorical ['GLOBE', 'SMART', 'SUN']
--------------------------------------------------------
column name : chargingAmount 
categorical [675.0, 625.0, 50.0, 70.0, 40.0, 400.0, 100.0, 20.0, 135.0, 55.0, 500.0, 480.0, 225.0, 125.0, 1100.0, 80.0, 110.0, 420.0, 45.0, 34.0, 330.0, 210.0, 181.0, 27.0, 200.0, 180.0, 375.0, 160.0, 122.0, 150.0, 380.0, 270.0, 75.0, 395.0, 98.0]
--------------------------------------------------------
column name : deposit 
categorical [0.0]
--------------------------------------------------------
column name : earnMileage 
categorical [67, 5, 62, 110, 5

## Log 내  element 갯수 7 , Unnested
####  Length 5, unnested 에 없는 Feature 
 + couponCode 
 + couponDiscountRate 
 + couponDiscountType

In [229]:
# 길이가 5개인 Log의 각 column 별로 고유한 columns 명을 확인합니다. 

keys = []
values = []
for index, obj in index_collector['len7_unnested']:
    keys.append(tuple(obj.keys()))
    values.append(tuple(obj.values()))
    

In [230]:
unique_key_type = set(keys)
unique_ele , counts = np.unique(np.concatenate(list(unique_key_type)), return_counts=True)

df_len5_unnested = pd.DataFrame(columns=unique_ele)

for index, obj in index_collector['len5_unnested']:
    series = pd.Series(obj, name=index)
    df_len5_unnested = df_len5_unnested.append(series)

In [231]:
df_len5_unnested.columns

Index(['authToken', 'billingCountry', 'brand', 'buyTotalCnt', 'carrier',
       'chargingAmount', 'couponCode', 'couponDiscountRate',
       'couponDiscountType', 'deposit', 'earnMileage', 'instDt', 'instId',
       'instTm', 'mode', 'payDt', 'payId', 'payMethod', 'payTm', 'pgAgreeNo',
       'pgChannel', 'pgPayResult', 'priceVat', 'resultMessage',
       'resultResponse', 'retCode', 'retMsg', 'sendType', 'senderName',
       'senderPhone', 'totalPrice', 'type', 'updtDt', 'updtId', 'updtTm',
       'userName', 'userNo'],
      dtype='object')

In [233]:
print("총 Columns 개 수 : ", len(df_len5_unnested.columns))
for col_name in df_len5_unnested.columns:
    if col_name != 'authToken' and col_name != 'resultResponse' :        
        print('column name : {} \ncategorical {}'.format(col_name, list(df_len5_unnested[col_name].value_counts().index)))
        print('--------------------------------------------------------')

총 Columns 개 수 :  37
column name : billingCountry 
categorical ['PH']
--------------------------------------------------------
column name : brand 
categorical ['PREPAID', 'TM', 'BUDDY', 'TNT', 'POSTPAID', 'SUN-PREPAID', 'BRO PREPAID']
--------------------------------------------------------
column name : buyTotalCnt 
categorical [1]
--------------------------------------------------------
column name : carrier 
categorical ['GLOBE', 'SMART', 'SUN']
--------------------------------------------------------
column name : chargingAmount 
categorical [675.0, 625.0, 50.0, 70.0, 40.0, 400.0, 100.0, 20.0, 135.0, 55.0, 500.0, 480.0, 225.0, 125.0, 1100.0, 80.0, 110.0, 420.0, 45.0, 34.0, 330.0, 210.0, 181.0, 27.0, 200.0, 180.0, 375.0, 160.0, 122.0, 150.0, 380.0, 270.0, 75.0, 395.0, 98.0]
--------------------------------------------------------
column name : couponCode 
categorical []
--------------------------------------------------------
column name : couponDiscountRate 
categorical []
--------

# Log 데이터 내 , resultResponse 분석

In [32]:
'resultResponse': '{"id":"2246cd7e-4d8a-4301-a88a-ac5c5593e78e","items":[{"name":"P1000 '
                   'Worth '
                   'Voucher","code":"G00000003940","description":"PUREGOLD","amount":{"value":"1100","details":{"discount":"1.4210854715202004e-14","subtotal":"982.14"}},"totalAmount":{"value":"1100","details":{"discount":"1.4210854715202004e-14","subtotal":"982.14"}}}],"metadata":{},"requestReferenceNumber":"190703000201","receiptNumber":null,"createdAt":"2019-07-03T07:13:07.000Z","updatedAt":"2019-07-03T07:13:23.000Z","paymentScheme":"master-card","expressCheckout":true,"refundedAmount":"0","canPayPal":false,"expiredAt":"2019-07-03T08:13:07.000Z","status":"COMPLETED","paymentStatus":"PAYMENT_FAILED","paymentDetails":{"responses":{"efs":{"unhandledError":[{"logref":"7a86c703-6507-0307-1323-918407685403","message":"[2005] '
                   'Decline - Do not '
                   'honor","receiptNo":"918407685403","code":"2005","links":[{"rel":"self","href":"https://cdn.paymaya.com/sandbox/payments_api/paymayap3/paymayap3.html"}]}]}},"paymentAt":null,"3ds":true},"buyer":{"contact":{"phone":"639293159012"},"firstName":"Revan '
                   'D","billingAddress":{},"shippingAddress":{}},"merchant":{"currency":"PHP","email":"jed@sharetreats.com","locale":"en","homepageUrl":"http://www.sharetreats.ph","isEmailToMerchantEnabled":true,"isEmailToBuyerEnabled":true,"isPaymentFacilitator":false,"isPageCustomized":true,"supportedSchemes":["Mastercard","Visa","JCB"],"canPayPal":false,"payPalEmail":null,"payPalWebExperienceId":null,"expressCheckout":true,"name":"SHARE '
                   'TREATS"},"totalAmount":{"amount":"1100","currency":"PHP","details":{"subtotal":"982.14"}},"redirectUrl":{"success":"https://www.sharetreats.ph/callback/paymaya_charging/success","failure":"https://www.sharetreats.ph/callback/paymaya_charging/failure","cancel":"https://www.sharetreats.ph/callback/paymaya_charging/cancel"},"transactionReferenceNumber":"7a86c703-6507-0307-1323-918407685403"}'

SyntaxError: invalid syntax (<ipython-input-32-68c533e7f941>, line 1)