In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import urllib.request
import mecab
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings(action='ignore')

# 데이터 로드

In [2]:
path = '../데이터/기업별_뉴스/~0-126(2018-2022)'

In [3]:
import os
filelist = os.listdir(path)

In [4]:
filelist[:5]

['DeepSearch-CMG제약-news-2018-01-01-2018-12-31-20230601-191902.xlsx',
 'DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20230601-192349.xlsx',
 'DeepSearch-CMG제약-news-2020-01-01-2020-12-31-20230601-192831.xlsx',
 'DeepSearch-CMG제약-news-2021-01-01-2021-12-31-20230601-193311.xlsx',
 'DeepSearch-CMG제약-news-2022-01-01-2022-12-31-20230601-193756.xlsx']

In [5]:
filelist1 = []
for file in filelist:
    if '2018-01-01-2018-12-31' not in file:
        filelist1.append(file)

In [6]:
filelist1[:5]

['DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20230601-192349.xlsx',
 'DeepSearch-CMG제약-news-2020-01-01-2020-12-31-20230601-192831.xlsx',
 'DeepSearch-CMG제약-news-2021-01-01-2021-12-31-20230601-193311.xlsx',
 'DeepSearch-CMG제약-news-2022-01-01-2022-12-31-20230601-193756.xlsx',
 'DeepSearch-HLB-news-2019-01-01-2019-12-31-20230601-194725.xlsx']

In [7]:
df22 = []
for name in filelist1:
    df = pd.read_excel(f'../데이터/기업별_뉴스/~0-126(2018-2022)/{name}',sheet_name = '#02 Documents')
    df['corp'] = f'{name}'
    df22.append(df)

In [8]:
df22[0].head(2)

Unnamed: 0,date,category,section,publisher,author,title,content_url,attachment,industry.label,industry.score,industry.name,polarity.label,polarity.score,polarity.name,corp
0,2019-12-31,news,economy,머니S,한아름,"""내가 제일 잘나가""… 브랜드 평판 가장 높은 제약사는?",,,C21,1.0,의료용 물질 및 의약품 제조업,,,,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...
1,2019-12-30,news,economy,한국경제TV,라이온봇,"YG PLUS, 주가 반등 현재는 +9.3%... 이 시각 50만3383주 거래",,,M71,1.0,전문 서비스업,1.0,0.9,긍정,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...


In [9]:
label_O=[]
label_X=[]
for df in df22:
    label_O.append(df[df['polarity.label'].notna()])
    label_X.append(df[df['polarity.label'].isna()])
    

In [10]:
label_O = pd.concat(label_O)

In [11]:
label_X = pd.concat(label_X)

In [12]:
label_X[:2]

Unnamed: 0,date,category,section,publisher,author,title,content_url,attachment,industry.label,industry.score,industry.name,polarity.label,polarity.score,polarity.name,corp
0,2019-12-31,news,economy,머니S,한아름,"""내가 제일 잘나가""… 브랜드 평판 가장 높은 제약사는?",,,C21,1.0,의료용 물질 및 의약품 제조업,,,,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...
2,2019-12-18,news,economy,한국경제,,"18일, 코스닥 외국인 순매도상위에 반도체 업종 5종목",,,C21,0.757,의료용 물질 및 의약품 제조업,,,,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...


# 전처리

In [13]:
mecab =mecab.MeCab()

In [14]:
stop_word = pd.read_csv('./stop_word_2.csv')

In [15]:
label_X['tokenized'] = label_X['title'].apply(mecab.morphs)

In [16]:
del_stopword = []
for row in label_X['tokenized']:
    tmp =[]
    for word in row:
        if word not in stop_word:
            tmp.append(word)
    del_stopword.append(tmp)

In [17]:
label_X['del_stopword'] = del_stopword

In [18]:
X_train=pd.read_csv('./final_X_train.csv')

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [20]:
x=label_X['del_stopword']

In [21]:
X_encoded = tokenizer.texts_to_sequences(x)

In [22]:
max_len=77

In [23]:
X_encoded = pad_sequences(X_encoded, maxlen=max_len)

In [24]:
# label_X[:2]

# 모델

In [25]:
from tensorflow.keras.models import load_model

In [27]:
model=tf.keras.models.load_model(
    './NLP/best_model_1018_labelbal.h5')

In [28]:
pred = model.predict(X_encoded)



In [29]:
label = np.argmax(pred,axis=1)

In [30]:
label_X['polarity.label'] = label

In [33]:
label_X[:2]

Unnamed: 0,date,category,section,publisher,author,title,content_url,attachment,industry.label,industry.score,industry.name,polarity.label,polarity.score,polarity.name,corp,tokenized,del_stopword
0,2019-12-31,news,economy,머니S,한아름,"""내가 제일 잘나가""… 브랜드 평판 가장 높은 제약사는?",,,C21,1.0,의료용 물질 및 의약품 제조업,2,,,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...,"["", 내, 가, 제일, 잘, 나, 가, ""…, 브랜드, 평판, 가장, 높, 은, ...","["", 내, 가, 제일, 잘, 나, 가, ""…, 브랜드, 평판, 가장, 높, 은, ..."
2,2019-12-18,news,economy,한국경제,,"18일, 코스닥 외국인 순매도상위에 반도체 업종 5종목",,,C21,0.757,의료용 물질 및 의약품 제조업,2,,,DeepSearch-CMG제약-news-2019-01-01-2019-12-31-20...,"[18, 일, ,, 코스닥, 외국인, 순매도, 상, 위, 에, 반도체, 업종, 5,...","[18, 일, ,, 코스닥, 외국인, 순매도, 상, 위, 에, 반도체, 업종, 5,..."


In [None]:
ㄹㅇㄹ

In [34]:
df1922 = pd.concat([label_O,label_X])

In [35]:
df1922.to_csv('./labelO_1922.csv', index=False)