In [1]:
# We will use the official tokenization script created by the Google team
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
!pip install sentencepiece



In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

# Helper Functions

In [4]:
def bert_encode(texts, tokenizer, max_len=128):
    """Encode raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id ``0`` up to ``max_len``.

    Args:
        texts: Iterable of texts; each item is passed to ``tokenizer.tokenize``.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row; must be at least 2 so that the
            ``[CLS]`` and ``[SEP]`` markers fit.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number_of_texts, max_len)``. ``masks`` is 1 on real tokens and
        0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    content_len = max_len - 2  # room left once [CLS] and [SEP] are added
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        pieces = tokenizer.tokenize(text)[:content_len]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # Reshape explicitly so an empty `texts` still yields (0, max_len) arrays
    # rather than 1-D arrays of shape (0,).
    n_rows = len(all_tokens)
    return (
        np.array(all_tokens, dtype=int).reshape(n_rows, max_len),
        np.array(all_masks, dtype=int).reshape(n_rows, max_len),
        np.array(all_segments, dtype=int).reshape(n_rows, max_len),
    )

In [5]:
def build_model(bert_layer, max_len=128, learning_rate=1e-5):
    """Build and compile a binary classifier on top of a BERT layer.

    Args:
        bert_layer: Callable layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is indexed here as
            ``[batch, position, hidden]``.
        max_len: Sequence length of the three integer inputs.
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Use the vector at position 0 (where bert_encode places "[CLS]") as the
    # representation of the whole sequence.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Load and Preprocess

- Load BERT from TensorFlow Hub
- Load CSV files containing training data
- Load tokenizer from the bert layer
- Encode the text into tokens, masks, and segment flags

In [6]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 11.5 s, sys: 2.34 s, total: 13.8 s
Wall time: 13.6 s


In [7]:
from google.colab import drive

# Mount Google Drive so the project files are reachable under /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# List the project folder on the mounted Drive to check which input files are available.
!ls '/content/drive/My Drive/프로젝트'

'교육 데이터'		        data17_final.csv
'데청 플젝 데이터'	        data18_final.csv
'사회보호 데이터'	       '지방소멸화 논문.pdf'
'07.13 회의.hwp'	       'permutation importance best.PNG'
'데청프로젝트 baseline.ipynb'  'permutation importance worst.PNG'
 중앙일보서천군.csv	        result_news_1.csv
 중앙일보고성군.csv	        result_news_2.csv
 경향신문서천군.csv	        score.PNG
 중앙일보곡성군.csv	        test.csv
 경향신문고성군.csv	        train.csv
 중앙일보평창군.csv	        train_utf.csv
 경향신문곡성군.csv	       '데청프로젝트 voting.ipynb'
 경향신문평창군.csv	       '데청 플젝 데이터.zip'


In [9]:
# Labelled news data used for training; the file is stored in cp949 encoding.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/My Drive/프로젝트/result_news_2.csv",
    encoding="cp949",
)

In [10]:
# Per-region article files, one group per file-name prefix (중앙일보 / 경향신문).
중앙_곡성, 중앙_고성, 중앙_서천, 중앙_평창 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/프로젝트/중앙일보곡성군.csv",
        "/content/drive/My Drive/프로젝트/중앙일보고성군.csv",
        "/content/drive/My Drive/프로젝트/중앙일보서천군.csv",
        "/content/drive/My Drive/프로젝트/중앙일보평창군.csv",
    )
)

경향_곡성, 경향_고성, 경향_서천, 경향_평창 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/프로젝트/경향신문곡성군.csv",
        "/content/drive/My Drive/프로젝트/경향신문고성군.csv",
        "/content/drive/My Drive/프로젝트/경향신문서천군.csv",
        "/content/drive/My Drive/프로젝트/경향신문평창군.csv",
    )
)

In [11]:
# Stack the two sources for each region into one frame.
# ignore_index=True gives every combined frame a fresh 0..n-1 index; keeping
# each source's own row labels would leave duplicate labels in the result,
# which breaks any later index-aligned assignment onto these frames.
test_곡성 = pd.concat([중앙_곡성, 경향_곡성], axis=0, ignore_index=True)
test_고성 = pd.concat([중앙_고성, 경향_고성], axis=0, ignore_index=True)
test_서천 = pd.concat([중앙_서천, 경향_서천], axis=0, ignore_index=True)
test_평창 = pd.concat([중앙_평창, 경향_평창], axis=0, ignore_index=True)

In [35]:
# Number of rows in the combined 고성 frame.
test_고성.shape[0]

244

In [12]:
# Missing-value count per column of the training data.
data.isna().sum()

score      0
text       0
class_2    0
class_3    0
dtype: int64

In [13]:
# Preview the first five training rows.
data.head(n=5)

Unnamed: 0,score,text,class_2,class_3
0,-1,기고 치수의 백년대계 하천관리 일원화부터홍수와 같은 재해가 발생하면 통상적으로 천...,0,긍정
1,-3,김종인 혼자 짊어진 통합당 개혁 어디까지5 18 유공자 예우 강화 법안 발표서...,0,긍정
2,-1,단독 값없는 노동 에 막막 2030 연구자들에 주거 연구 공간 준다정부 비정규직...,0,긍정
3,0,경북서 21일 광화문 집회 관련 4명 등 6명 코로나19 추가 확진21일 경북 지역...,0,긍정
4,0,원주 체조교실발 코로나19 확진자 급증 4명 추가 12명으로 늘어강원 원주시의 ...,0,긍정


In [14]:
# Add a placeholder label column (all zeros) to every regional test frame.
for _region_frame in (test_곡성, test_고성, test_서천, test_평창):
    _region_frame["class_2"] = 0

In [15]:
# Row labels of all examples whose score is exactly 0.
drop_list = data[data["score"] == 0].index.tolist()

In [16]:
# Remove the zero-score rows from the training data in place.
data.drop(index=drop_list, inplace=True)

In [17]:
# Renumber rows after the drop; the previous row labels are kept as a new column.
data.reset_index(drop=False, inplace=True)

In [18]:
# Number of training rows left after filtering.
data.shape[0]

15570

In [19]:
# Preview the filtered, re-indexed training data.
data.head(n=5)

Unnamed: 0,index,score,text,class_2,class_3
0,0,-1,기고 치수의 백년대계 하천관리 일원화부터홍수와 같은 재해가 발생하면 통상적으로 천...,0,긍정
1,1,-3,김종인 혼자 짊어진 통합당 개혁 어디까지5 18 유공자 예우 강화 법안 발표서...,0,긍정
2,2,-1,단독 값없는 노동 에 막막 2030 연구자들에 주거 연구 공간 준다정부 비정규직...,0,긍정
3,7,-1,경기지역 일일 신규 확진 코로나 사태 후 최다 삼성전자 기흥캠퍼스 확진자 나와...,0,긍정
4,9,-1,서울 경찰 간부 자택서 숨진 채 발견서울 지역의 경찰서 소속 간부가 자택에서 숨진 ...,0,긍정


In [20]:
# Store the training text with pandas' dedicated string dtype.
data["text"] = data["text"].astype(dtype="string")

In [21]:
# Same string-dtype conversion for the article body column of each test frame.
for _region_frame in (test_곡성, test_고성, test_서천, test_평창):
    _region_frame["기사 내용"] = _region_frame["기사 내용"].astype("string")

In [22]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Separate the binary target from the remaining columns.
train_labels = data.loc[:, "class_2"]
train = data.drop(columns="class_2")

In [23]:
# Build the tokenizer from the vocabulary file and casing flag bundled with the hub layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [24]:
# Encode the training text and each region's article bodies into
# (token ids, masks, segment ids) triples of length 128.
train_input = bert_encode(train["text"].to_numpy(), tokenizer, max_len=128)

곡성_input = bert_encode(test_곡성["기사 내용"].to_numpy(), tokenizer, max_len=128)
고성_input = bert_encode(test_고성["기사 내용"].to_numpy(), tokenizer, max_len=128)
서천_input = bert_encode(test_서천["기사 내용"].to_numpy(), tokenizer, max_len=128)
평창_input = bert_encode(test_평창["기사 내용"].to_numpy(), tokenizer, max_len=128)

# Model: Build, Train, Predict, Save

In [25]:
# Assemble the classifier around the hub layer and print its layer summary.
model = build_model(bert_layer=bert_layer, max_len=128)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [27]:
# Keep only the checkpoint with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(
    filepath='model.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Fine-tune for 5 epochs, holding out 30% of the training data for validation.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=5,
    validation_split=0.3,
    callbacks=[checkpoint],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
# Restore the checkpointed weights before scoring the regional articles.
model.load_weights('model.h5')

# For each region: predict probabilities, then round them to 0/1 labels.
곡성_pred = model.predict(곡성_input)
곡성_label = np.round(곡성_pred).astype('int')

고성_pred = model.predict(고성_input)
고성_label = np.round(고성_pred).astype('int')

서천_pred = model.predict(서천_input)
서천_label = np.round(서천_pred).astype('int')

평창_pred = model.predict(평창_input)
평창_label = np.round(평창_pred).astype('int')

In [29]:
# Wrap each label array in a one-column DataFrame named "class_2".
곡성_label = pd.DataFrame(data=곡성_label, columns=["class_2"])
고성_label = pd.DataFrame(data=고성_label, columns=["class_2"])
서천_label = pd.DataFrame(data=서천_label, columns=["class_2"])
평창_label = pd.DataFrame(data=평창_label, columns=["class_2"])

In [46]:
# Write the predicted labels back by POSITION, not by index alignment.
# Assigning a DataFrame/Series aligns on row labels, so predictions would be
# paired with the wrong articles whenever the test frame's index is not a
# clean 0..n-1 range. The predictions are in the same row order as the frame
# they were computed from, so a plain array is the correct thing to assign.
test_곡성["class_2"] = 곡성_label["class_2"].to_numpy()
test_고성["class_2"] = 고성_label["class_2"].to_numpy()
test_서천["class_2"] = 서천_label["class_2"].to_numpy()
test_평창["class_2"] = 평창_label["class_2"].to_numpy()

In [52]:
# Renumber the rows of every test frame in place; the old row labels become a column.
test_곡성.reset_index(drop=False, inplace=True)
test_고성.reset_index(drop=False, inplace=True)
test_서천.reset_index(drop=False, inplace=True)
test_평창.reset_index(drop=False, inplace=True)

In [51]:
# Remove leftover index columns before export.
# errors="ignore" skips names that are absent: which of these columns exist
# depends on how many times reset_index was run, so a strict drop raises
# KeyError on a fresh top-to-bottom run.
for _region_frame in (test_곡성, test_고성, test_서천, test_평창):
    _region_frame.drop(
        columns=["level_0", "Unnamed: 0", "index"],
        errors="ignore",
        inplace=True,
    )

In [55]:
# Save each region's labelled articles as a CSV in the project folder on Drive.
test_곡성.to_csv(path_or_buf="/content/drive/My Drive/프로젝트/곡성감성분석.csv")
test_고성.to_csv(path_or_buf="/content/drive/My Drive/프로젝트/고성감성분석.csv")
test_서천.to_csv(path_or_buf="/content/drive/My Drive/프로젝트/서천감성분석.csv")
test_평창.to_csv(path_or_buf="/content/drive/My Drive/프로젝트/평창감성분석.csv")