In [1]:
import pandas as pd
import json
from collections import Counter

# PJZ Dataset

In [2]:
# Load the PJZ chat spreadsheet. The path is relative to the working directory,
# like every other data path in this notebook, rather than pointing into one
# user's home directory.
pjz_df = pd.read_excel('./K-CAT/lth/data/PJZ_en_ko.xlsx')
pjz_df.head(10)

Unnamed: 0,source,label,message_idx,author,time,en,ko,time.1
0,PJ chats,1,0,decoy,0.611111,Hey Its Mads,"안녕, 나 매즈야","""14:40"""
1,PJ chats,1,0,Billy Joe,0.611111,Hey babes,"안녕, 자기야","""14:40"""
2,PJ chats,1,0,Billy Joe,0.611806,Almost done with work. So glad,거의 일 다 끝났어. 정말 기뻐,"""14:41"""
3,PJ chats,1,0,decoy,0.611806,Hey:) that's good right?,안녕 :) 그거 좋은 거지?,"""14:41"""
4,PJ chats,1,0,Billy Joe,0.6125,Yea,응,"""14:42"""
5,PJ chats,1,0,decoy,0.613194,So what's new lol,그럼 뭐 새로운 일 있어? ㅋㅋ,"""14:43"""
6,PJ chats,1,0,Billy Joe,0.613194,Nothing,없어,"""14:43"""
7,PJ chats,1,0,Billy Joe,0.613194,did u touch it in the shower. Lol,샤워할 때 그거 만졌어? ㅋㅋ,"""14:43"""
8,PJ chats,1,0,decoy,0.613889,No had to hurry up lol,"아니, 빨리 해야 했어 ㅋㅋ","""14:44"""
9,PJ chats,1,0,decoy,0.613889,Hold on a min,잠깐만,"""14:44"""


In [3]:
# Keep only rows whose Korean text is not the literal placeholder 'na', narrow the
# frame to the columns shared with the other sources, and expose the Korean
# message under the common column name 'text'.
pjz_df = (
    pjz_df
    .loc[pjz_df['ko'] != 'na', ['source', 'label', 'message_idx', 'author', 'ko']]
    .rename(columns={'ko': 'text'})
)
pjz_df

Unnamed: 0,source,label,message_idx,author,text
0,PJ chats,1,0,decoy,"안녕, 나 매즈야"
1,PJ chats,1,0,Billy Joe,"안녕, 자기야"
2,PJ chats,1,0,Billy Joe,거의 일 다 끝났어. 정말 기뻐
3,PJ chats,1,0,decoy,안녕 :) 그거 좋은 거지?
4,PJ chats,1,0,Billy Joe,응
...,...,...,...,...,...
9055,PJ chats,1,621,decoy,응.
9056,PJ chats,1,621,jackjohnsons7,그래서 너 하트포드 어디 거리에서 살아?
9057,PJ chats,1,621,decoy,ㅋㅋㅋ
9058,PJ chats,1,621,decoy,너한테 안 알려줘.


# AI Hub Dataset

In [4]:
def convert_chat_df(source, start, end, data_dir='./K-CAT/lth/data/AIHub/labeling', n_files=20):
    """Flatten 1:1 AI Hub chat JSON files into one row per message.

    Files are read from ``{data_dir}/{source}/{SOURCE}_{i}_{jj}.json`` for every
    ``i`` in ``range(start, end)`` and every ``j`` in ``1..n_files``, where
    ``SOURCE`` is ``source.upper()`` and ``jj`` is ``j`` zero-padded to two digits.
    Only ``info[0]['annotations']`` of each file is used, and only when its
    ``speaker_type`` equals ``'1:1'``.

    Each line of the annotation ``text`` is split at its first ``':'`` into an
    author (left part) and a message (right part), both stripped. Blank lines are
    skipped. A non-blank line without any ``':'`` is kept with an empty author and
    the whole line as its text.

    Args:
        source: Sub-directory name under ``data_dir``; also stored in the
            ``source`` column of the result.
        start: First value (inclusive) of the outer file index ``i``.
        end: Last value (exclusive) of the outer file index ``i``.
        data_dir: Directory that contains the per-source sub-directories.
        n_files: Number of files per outer index; ``j`` runs from 1 to ``n_files``.

    Returns:
        A tuple ``(df, subject_counts)``. ``df`` has the columns ``source``,
        ``label`` (always 0), ``message_idx`` (``'{i}_{j}'``, not zero-padded),
        ``author`` and ``text``. ``subject_counts`` is a ``Counter`` of the
        ``subject`` annotation over the files that were kept.

    Raises:
        FileNotFoundError: If any expected JSON file does not exist.
    """
    annotations_subject = []
    message_idx = []
    author = []
    text = []

    for i in range(start, end):
        for j in range(1, n_files + 1):
            path = f'{data_dir}/{source}/{source.upper()}_{i}_{j:02d}.json'
            # Explicit encoding so the Korean text does not depend on the locale.
            with open(path, encoding='utf-8') as f:
                annotations = json.load(f)['info'][0]['annotations']

            if annotations['speaker_type'] != '1:1':
                continue

            annotations_subject.append(annotations['subject'])
            conversation_id = f'{i}_{j}'
            for line in annotations['text'].split('\n'):
                if not line.strip():
                    # e.g. a trailing newline: there is no message to record.
                    continue
                # Split at the first ':' only, so colons inside the message survive.
                speaker, sep, message = line.partition(':')
                if not sep:
                    # No separator: do not guess an author from the message itself.
                    speaker, message = '', line
                message_idx.append(conversation_id)
                author.append(speaker.strip())
                text.append(message.strip())

    df = pd.DataFrame({
        'source': source,
        'label': 0,
        'message_idx': message_idx,
        'author': author,
        'text': text,
    })

    return df, Counter(annotations_subject)

In [5]:
# One call per AI Hub source. Each call covers the half-open index range
# [start, end) and returns the message frame plus a Counter of subjects.
kakao_df, k_category_cnt = convert_chat_df(source='kakao', start=899, end=909)
facebook_df, f_category_cnt = convert_chat_df(source='facebook', start=101, end=111)
instagram_df, i_category_cnt = convert_chat_df(source='instagram', start=61, end=71)
nateon_df, n_category_cnt = convert_chat_df(source='nateon', start=22, end=32)


In [16]:
# Per-source subject counters, in the column order of the table below.
source_counters = (k_category_cnt, f_category_cnt, i_category_cnt, n_category_cnt)

# Every subject that occurs in at least one source.
all_categories = set().union(*source_counters)

# One row per subject (sorted): the subject followed by its count in each source,
# with 0 where a source never had that subject.
category_data = [
    [category, *(counter.get(category, 0) for counter in source_counters)]
    for category in sorted(all_categories)
]

category_df = pd.DataFrame(
    category_data,
    columns=['Category', 'K Count', 'F Count', 'I Count', 'N Count'],
)
category_df

Unnamed: 0,Category,K Count,F Count,I Count,N Count
0,가족,8,10,10,5
1,건강,8,10,10,5
2,게임,8,10,10,5
3,계절/날씨,8,10,10,5
4,교육,8,10,11,5
5,교통,8,10,10,5
6,군대,8,10,10,5
7,미용,8,10,9,5
8,반려동물,8,10,10,5
9,방송/연예,8,10,10,5


# Final Dataset

In [6]:
# Stack the PJZ frame and the four AI Hub frames into one dataset and give the
# result a fresh 0..n-1 index instead of the per-source indexes.
final_df = pd.concat(
    [pjz_df, kakao_df, facebook_df, instagram_df, nateon_df],
    ignore_index=True,
)
final_df

Unnamed: 0,source,label,message_idx,author,text
0,PJ chats,1,0,decoy,"안녕, 나 매즈야"
1,PJ chats,1,0,Billy Joe,"안녕, 자기야"
2,PJ chats,1,0,Billy Joe,거의 일 다 끝났어. 정말 기뻐
3,PJ chats,1,0,decoy,안녕 :) 그거 좋은 거지?
4,PJ chats,1,0,Billy Joe,응
...,...,...,...,...,...
24248,nateon,0,26_20,2,역시 좋은 애니에 좋은 음악까지...
24249,nateon,0,26_20,2,중요한 역할을 하는 거 같아!
24250,nateon,0,26_20,1,맞아! 하울의 움직이는 성도 너무 좋았어!
24251,nateon,0,26_20,2,헉 나 진짜 좋아하는 애니야.


In [7]:
# Persist the combined dataset; index=False leaves the row index out of the file.
final_df.to_csv(path_or_buf='./K-CAT/lth/data/GroomingDataset.csv', index=False)

In [8]:
# Reload the saved CSV to check the round trip. 'message_idx' mixes plain integer
# ids (PJ chats) with '<i>_<j>' strings (AI Hub), and 'author' mixes names with
# numeric speaker ids, so both columns are read as strings instead of being left
# to per-chunk type inference.
final_df = pd.read_csv(
    './K-CAT/lth/data/GroomingDataset.csv',
    dtype={'message_idx': str, 'author': str},
)
final_df

Unnamed: 0,source,label,message_idx,author,text
0,PJ chats,1,0,decoy,"안녕, 나 매즈야"
1,PJ chats,1,0,Billy Joe,"안녕, 자기야"
2,PJ chats,1,0,Billy Joe,거의 일 다 끝났어. 정말 기뻐
3,PJ chats,1,0,decoy,안녕 :) 그거 좋은 거지?
4,PJ chats,1,0,Billy Joe,응
...,...,...,...,...,...
24248,nateon,0,26_20,2,역시 좋은 애니에 좋은 음악까지...
24249,nateon,0,26_20,2,중요한 역할을 하는 거 같아!
24250,nateon,0,26_20,1,맞아! 하울의 움직이는 성도 너무 좋았어!
24251,nateon,0,26_20,2,헉 나 진짜 좋아하는 애니야.
