In [None]:
# Put the parent of the current working directory on the import path so that
# modules located outside this notebook's folder can be imported.

import os
import sys

# Absolute, normalized path of the directory one level above the working directory.
root = os.path.normpath(os.path.join(os.path.abspath(os.getcwd()), '..'))

# Prepend only when missing, so re-running this cell never adds duplicates.
if sys.path.count(root) == 0:
    sys.path.insert(0, root)

In [None]:
# 필요한 라이브러리 로드

import emoji
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import torch
from communav.utils import db
from sklearn.metrics import accuracy_score, fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from soynlp.normalizer import repeat_normalize
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

In [None]:
# Load data: fetch every article together with its comments.

# One result row per article. Comments are concatenated in comment-id order,
# separated by newlines. The LEFT JOIN keeps articles that have no comments;
# for those rows the `comments` column is NULL.
# NOTE(review): on MySQL, GROUP_CONCAT output is capped by the
# `group_concat_max_len` setting (1024 bytes by default), so long comment
# threads may be silently truncated — confirm the server/session value.
ARTICLES_QUERY = '''
    SELECT
        articles.title,
        articles.text,
        GROUP_CONCAT(comments.text ORDER BY comments.id SEPARATOR '\n') AS comments
    FROM everytime_original_articles AS articles
    LEFT JOIN everytime_original_comments AS comments ON articles.id = comments.article_id
    GROUP BY articles.id
'''

connection = db.get_connection()
try:
    cursor = connection.cursor()
    try:
        cursor.execute(ARTICLES_QUERY)
        original_articles = cursor.fetchall()
    finally:
        # Close the cursor even when the query or the fetch raises.
        cursor.close()
finally:
    # Always release the connection so a failed query does not leak it.
    db.close_connection()

print(f'Loaded {len(original_articles)} articles.')

In [None]:
# Print a sample of the loaded data (the first 20 rows) as a quick sanity check.

preview_rows = original_articles[:20]
for row in preview_rows:
    print(row)

In [None]:
# Define the data preprocessing function

def preprocess(value) -> str:
    """Normalize a raw text field into a trimmed string without blank lines.

    Args:
        value: Raw field value. Any falsy value (``None``, ``''``, ``0``)
            is treated as missing; anything else is converted with ``str()``.

    Returns:
        The text with line endings normalized to ``\\n``, every run of blank
        or whitespace-only lines collapsed into a single line break, and
        leading/trailing whitespace removed. Returns ``''`` for missing input.
    """
    if not value:
        return ''

    # Normalize CRLF / lone CR first; otherwise "\r\n\r\n" would survive the
    # collapse below and still show up as a blank line.
    text = str(value).replace('\r\n', '\n').replace('\r', '\n')

    # Collapse consecutive line breaks, including lines that hold only
    # whitespace, so that no blank line remains inside the text. This keeps
    # a blank line usable as an unambiguous separator between documents.
    text = re.sub(r'\n\s*\n', '\n', text)

    return text.strip()

In [None]:
# Build the corpus: one text document per article.

corpora = []

for article in original_articles:
    # Clean the three fields in a fixed order: title, body text, joined comments.
    title, text, comments = (
        preprocess(article[field]) for field in ('title', 'text', 'comments')
    )

    # Keep only articles that have both a title and a body; comments are optional.
    if title and text:
        # strip() removes the trailing newline left over when there are no comments.
        corpora.append(f'{title}\n{text}\n{comments}'.strip())

In [None]:
# Print a sample of the corpus: the first 20 documents, each followed by an empty line.

for document in corpora[:20]:
    print(document, end='\n\n')

In [None]:
# Save the corpus: documents are written to a single file, separated by a blank line.

# Write UTF-8 explicitly. Without `encoding`, open() falls back to the
# platform's locale encoding, which may be unable to represent the text and
# would make the output file differ between machines.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(corpora))