# Section4. Project - FinBERT를 활용한 투자 의사 결정 보조 모델 😺

# **1. 드라이브 연동 및 라이브러리 불러오기**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install keybert

In [None]:
!pip install transformers

In [None]:
!pip install sentence_transformers

In [6]:
import time
import random
import warnings
import datetime
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore') # to avoid warnings
from sentence_transformers import SentenceTransformer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [7]:
"""
Sklearn Libraries
"""
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

"""
Transformer Libraries
"""
from transformers import BertTokenizer,  AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

"""
Pytorch Libraries
"""
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler

# **2. 데이터 전처리**

## 데이터셋 로드

In [8]:
data = pd.read_csv("/content/drive/MyDrive/Section4_Project/all-data.csv", encoding='latin-1', names=['sentiment', 'NewsHeadline'])

In [9]:
# label 추가
def encode_sentiments_values(df):
    
    possible_sentiments = df.sentiment.unique()
    sentiment_dict = {}
    
    for index, possible_sentiment in enumerate(possible_sentiments):
        sentiment_dict[possible_sentiment] = index
    
    # Encode all the sentiment values
    df['label'] = df.sentiment.replace(sentiment_dict)
    
    return df, sentiment_dict
 
# Perform the encoding task on the data set
financial_data, sentiment_dict = encode_sentiments_values(data)

In [10]:
financial_data

Unnamed: 0,sentiment,NewsHeadline,label
0,neutral,"According to Gran , the company has no plans t...",0
1,neutral,Technopolis plans to develop in stages an area...,0
2,negative,The international electronic industry company ...,1
3,positive,With the new production plant the company woul...,2
4,positive,According to the company 's updated strategy f...,2
...,...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...,1
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,0
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...,1
4844,negative,Net sales of the Paper segment decreased to EU...,1


## 데이터셋 split
- X_train : 0.9
  - X_train : 0.85
  - X_val : 0.15
- X_test : 0.1

In [11]:
X_data = financial_data['NewsHeadline']
y_data = financial_data['label']
print('본문의 개수: {}'.format(len(X_data)))
print('레이블의 개수: {}'.format(len(y_data)))

본문의 개수: 4846
레이블의 개수: 4846


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=0, stratify=y_data)

In [13]:
# CLS, SEP 붙이기 (문장의 시작, 끝)
sentences = ["[CLS] " + str(s) + " [SEP]" for s in X_train]

In [14]:
sentences[:15]

["[CLS] Last year 's third quarter result had been burdened by costs stemming from restructuring in the US . [SEP]",
 '[CLS] In addition , nine fixed-term employment contracts will not be extended and two people will leave the company under pension arrangements . [SEP]',
 "[CLS] Hobby Hall 's sales decrease 26 pct due to implementing a new information system that involved changing in the principal of posting sales . [SEP]",
 '[CLS] The Tecnomen Convergent Charging solution includes functionality for prepaid and post-paid billing , charging and rating of voice calls , video calls , raw data traffic and any type of content services in both mobile and fixed networks . [SEP]',
 "[CLS] That 's what I go to bed worrying about every night , ' he said . [SEP]",
 "[CLS] shock phase ' , consumers have once again started to plan and implement building projects . [SEP]",
 '[CLS] ALEXANDRIA , Va. , June 7 -- Michael G. Williams of Newbury Park , Calif. , has developed a network device . [SEP]',
 "[

## Tokenizer 정의

In [15]:
finbert_tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert", do_lower_case=True)
tokenized_texts = [finbert_tokenizer.tokenize(s) for s in sentences]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

## 정수 인코딩 & 제로 패딩

In [16]:
MAX_LEN = 150 #최대 시퀀스 길이 설정
input_ids = [finbert_tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [17]:
input_ids[0]

array([  101,  2197,  2095,  1005,  1055,  2353,  4284,  2765,  2018,
        2042, 10859,  2098,  2011,  5366, 29217,  2013, 18322,  1999,
        1996,  2149,  1012,   102,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

## Attention Mask

In [18]:
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [19]:
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


## Train set을 훈련셋과 검증셋으로 분리
- X_train : 0.85
- X_val : 0.15

In [20]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    np.array(y_train), 
                                                                                    random_state=2000, 
                                                                                    test_size=0.15)
                                                
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2000, 
                                                       test_size=0.15)     
                                                       
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [21]:
batch_size = 5

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

## test set 전처리

In [22]:
# [CLS] + 문장 + [SEP]
sentences = ["[CLS] " + str(s) + " [SEP]" for s in X_test]

# 라벨 데이터
labels = np.array(y_test)

# Word 토크나이저 토큰화
finbert_tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert", do_lower_case=True)
tokenized_texts = [finbert_tokenizer.tokenize(s) for s in sentences]

# 시퀀스 설정 및 정수 인덱스 변환 & 패딩
MAX_LEN = 150
input_ids = [finbert_tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# 어텐션 마스크
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
    
# 파이토치 텐서로 변환
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

# 배치 사이즈 설정 및 데이터 설정
batch_size = 5
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# **3. 딥러닝 모델링**

## FinBERT 모델 불러오기
- pretrained 모델인 FinBERT 불러오기

In [23]:
# GPU 설정을 위한 디바이스 설정

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [24]:
# pretrained된 FinBERT 모델 불러오기
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert",
                                                           num_labels=3,
                                                           output_attentions=False,
                                                           output_hidden_states=False)
model.cuda()

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [25]:
# 옵티마이저
optimizer = AdamW(model.parameters(),
                  lr=1e-5, # 학습률(learning rate)
                  eps = 1e-8 
                )

# epoch 수
epochs = 4

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

## 모델 학습

In [26]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
    
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [27]:
# random seed 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 학습
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터 로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가 모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터 로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of    742.    Elapsed: 0:01:26.

  Average training loss: 0.51
  Training epcoh took: 0:02:06

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:07

Training...
  Batch   500  of    742.    Elapsed: 0:01:24.

  Average training loss: 0.26
  Training epcoh took: 0:02:05

Running Validation...
  Accuracy: 0.90
  Validation took: 0:00:07

Training...
  Batch   500  of    742.    Elapsed: 0:01:24.

  Average training loss: 0.13
  Training epcoh took: 0:02:05

Running Validation...
  Accuracy: 0.89
  Validation took: 0:00:07

Training...
  Batch   500  of    742.    Elapsed: 0:01:24.

  Average training loss: 0.07
  Training epcoh took: 0:02:05

Running Validation...
  Accuracy: 0.89
  Validation took: 0:00:07

Training complete!


# **4. 모델 검증**

## 테스트셋 평가
- 일반화 성능 평가

In [61]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Test set Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))


Test set Accuracy: 0.90
Test took: 0:00:05


- **Accuracy = 0.9**

# **5. 새로운 문장 테스트**

In [29]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [finbert_tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 150

    # 토큰을 숫자 인덱스로 변환
    input_ids = [finbert_tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [30]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [31]:
logits = test_sentences(['growth is strong and we have plenty of liquidity'])
print(logits)

if np.argmax(logits) == 2 :
    print("positive")
elif np.argmax(logits) == 1 :
    print("negative")
elif np.argmax(logits) == 0 :
    print("neutral")

[[ 1.8999511  -4.385992    0.64329886]]
neutral


In [32]:
# 여러 문장 테스트
sentences = ["there is a shortage of capital, and we need extra financing", 
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]

labels = {0:'neutral', 1:'negative',2:'positive'}
for idx, sent in enumerate(sentences):
    print(sent, '----', labels[np.argmax(test_sentences([sentences[idx]]))])

there is a shortage of capital, and we need extra financing ---- negative
growth is strong and we have plenty of liquidity ---- neutral
there are doubts about our finances ---- neutral
profits are flat ---- neutral


# **6. KeyBERT를 이용한 키워드 추출**

In [33]:
def BERT(sent):

    array_text = pd.DataFrame(df[df['title'] == title]['text']).to_numpy()

    bow = []
    from keybert import KeyBERT
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    for j in range(len(array_text)):
        keywords = kw_extractor.extract_keywords(array_text[j][0])
        bow.append(keywords)
    
    new_bow = []
    for i in range(0, len(bow)):
        for j in range(len(bow[i])):
            new_bow.append(bow[i][j])
            
    keyword = pd.DataFrame(new_bow, columns=['keyword', 'weight'])
    print(keyword.groupby('keyword').agg('sum').sort_values('weight', ascending=False).head(20))

In [34]:
from keybert import KeyBERT

doc = "there is a shortage of capital, and we need extra financing"
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [36]:
sentences = ["there is a shortage of capital, and we need extra financing", 
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]
for s in sentences:
  kw_model = KeyBERT()
  keywords = kw_model.extract_keywords(s, top_n=3)
  print(keywords)

[('financing', 0.6011), ('capital', 0.5458), ('shortage', 0.3663)]
[('liquidity', 0.6635), ('growth', 0.5825), ('strong', 0.2882)]
[('finances', 0.6445), ('doubts', 0.4613)]
[('profits', 0.6755), ('flat', 0.5144)]


# **7. 구글 Economy 기사 크롤링**
- test 구현을 위한 구글 뉴스 기사 크롤링

In [37]:
#step1.프로젝트에 필요한 패키지 불러온다.
from bs4 import BeautifulSoup as bs
import requests

#step2.크롤링할 url 주소를 입력한다. (네이버에서 코로나 검색 후, 뉴스 탭 클릭)
url = 'https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB/sections/CAQiSENCQVNNQW9JTDIwdk1EbHpNV1lTQW1WdUdnSlZVeUlQQ0FRYUN3b0pMMjB2TUdkbWNITXpLZ3NTQ1M5dEx6Qm5abkJ6TXlnQSoqCAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVABUAE?hl=en-US&gl=US&ceid=US%3Aen'

# #step2-1.만약 다른 키워드를 매번 다르게 입력하고 싶다면 아래와 같이 하셔도 됩니다.
# query = input('검색할 키워드를 입력하세요: ')
# url = 'https://search.naver.com/search.naver?where=news&sm=tab_jum&query='+'%s'%query

#step3.requests 패키지의 함수를 이용해 url의 html 문서를 가져온다.
response = requests.get(url)
html_text=response.text

#step4.bs4 패키지의 함수를 이용해서 html 문서를 파싱한다.
soup = bs(html_text, 'html.parser')

#step5.bs4 패키지의 select_one 함수와 선택자 개념을 이용해서 뉴스기사 제목을 하나 가져온다.
print(soup.select_one('a.DY5T1d.RZIKme').get_text())

#step6.bs4 패키지의 select 함수와 선택자 개념을 이용해서 뉴스기사 제목을 모두 가져온다.
titles = soup.select('a.DY5T1d.RZIKme')

for i in titles:
    title = i.get_text()
    print(title)


World Bank warns of recession and stagflation: American and Chinese economies on the edge?
World Bank warns of recession and stagflation: American and Chinese economies on the edge?
The Fed's impact hasn't been felt in the economy yet, says Short Hills' Steve Weiss
Gravitas: India could escape a global recession wave
CNBC Fed Survey: Fed's efforts to slow inflation will likely cause a recession
US Inflation to Stay Well Above Fed's Target
Investors skeptical Fed could curb inflation without causing recession: Bloomberg
US Inflation to Stay Well Above Fed’s Target
Pain from inflation is more broad-based than recession, says former FDIC Chair Sheila Bair
The US Inflation Spiral Now Has Global Proportions
Evercore's Emanuel Sees Signs of Downturn in Inflation
Consumers Feeling Pressure From Inflation, Fear Recession
IMF Cuts World GDP Outlook, Warns of Global Recession
'Edge of a global recession': IMF slashes world economic forecast again
IMF downgrades global economic outlook
Losing mom

In [38]:
# 데이터프레임으로 정리

df = pd.DataFrame({'title':titles})
df

Unnamed: 0,title
0,[World Bank warns of recession and stagflation...
1,[The Fed's impact hasn't been felt in the econ...
2,[Gravitas: India could escape a global recessi...
3,[CNBC Fed Survey: Fed's efforts to slow inflat...
4,[US Inflation to Stay Well Above Fed's Target]
...,...
207,"[The EU has agreed to ration gas, but some cou..."
208,[Europe agrees weakened gas curbs plan]
209,[Winter Is Coming in Europe. Russian Gas Isn’t.]
210,[Larry Kudlow: Where's the domestic spending f...


In [39]:
# label 컬럼 추가
df['label'] = df['title'].apply(test_sentences).apply(np.argmax) 

In [40]:
# title 컬럼에 대한 감성 분석 컬럼(sentiment) 추가
def sent(x):
    if x == 0 :
        return 'neutral'
    elif x == 1 :
        return 'negative'
    else :
        return 'positive'

df['sentiment'] = df['label'].apply(sent)

In [41]:
# 중요 키워드 추출 열(keywords) 추가
def keyword(title):
  kw_model = KeyBERT()
  return kw_model.extract_keywords(title.text, top_n=3)

df['keywords'] = df['title'].apply(keyword)

In [42]:
# title 컬럼의 대괄호([]) 제거
def regex(title):
  return title.text
df['title'] = df['title'].apply(regex)

In [43]:
# label 컬럼 제거, 순서 변경
df.drop('label', axis=1, inplace=True)
df = df[['title', 'keywords', 'sentiment']]
df

Unnamed: 0,title,keywords,sentiment
0,World Bank warns of recession and stagflation:...,"[(recession, 0.5877), (stagflation, 0.3829), (...",neutral
1,The Fed's impact hasn't been felt in the econo...,"[(fed, 0.3964), (economy, 0.2943), (weiss, 0.2...",neutral
2,Gravitas: India could escape a global recessio...,"[(gravitas, 0.5669), (recession, 0.4255), (ind...",positive
3,CNBC Fed Survey: Fed's efforts to slow inflati...,"[(recession, 0.4964), (inflation, 0.4915), (fe...",negative
4,US Inflation to Stay Well Above Fed's Target,"[(inflation, 0.5703), (fed, 0.4165), (target, ...",neutral
...,...,...,...
207,"The EU has agreed to ration gas, but some coun...","[(eu, 0.3973), (gas, 0.3831), (ration, 0.3535)]",neutral
208,Europe agrees weakened gas curbs plan,"[(curbs, 0.3362), (europe, 0.3), (gas, 0.2907)]",neutral
209,Winter Is Coming in Europe. Russian Gas Isn’t.,"[(winter, 0.4355), (russian, 0.4023), (gas, 0....",neutral
210,Larry Kudlow: Where's the domestic spending fr...,"[(spending, 0.5171), (kudlow, 0.3987), (larry,...",neutral


In [60]:
df.loc[[6,43, 47, 54, 139, 149, 155, 171, 182, 193]]

Unnamed: 0,title,keywords,sentiment
6,US Inflation to Stay Well Above Fed’s Target,"[(inflation, 0.5782), (fed, 0.4455), (target, ...",neutral
43,Watch China's Property Crisis Burns Middle Class,"[(china, 0.3371), (property, 0.3252), (burns, ...",neutral
47,Walmart Rings More Alarm Bells for the US Economy,"[(walmart, 0.5714), (economy, 0.4186), (alarm,...",neutral
54,US new home sales drop to lowest level in 2 years,"[(sales, 0.4259), (lowest, 0.3081), (home, 0.2...",negative
139,Oil Extends Gain With Tight Supply Offsetting ...,"[(gain, 0.4148), (oil, 0.3757), (offsetting, 0...",positive
149,GM earnings fall short of estimates amid suppl...,"[(earnings, 0.4709), (gm, 0.4187), (estimates,...",negative
155,Google gives jittery stocks a lift ahead of Fed,"[(jittery, 0.5463), (stocks, 0.3969), (fed, 0....",neutral
171,Amazon hikes Prime membership prices by up to ...,"[(inflation, 0.4315), (prices, 0.3715), (amazo...",negative
182,"Bitcoin (BTC), Ethereum (ETC) Stay Firm Ahead ...","[(btc, 0.3712), (ethereum, 0.3446), (fed, 0.33...",positive
193,Breaking: Aussie CPI comes in line with expect...,"[(cpi, 0.5327), (aussie, 0.4345), (aud, 0.431)]",positive


In [45]:
# sentiment 분포 확인
df['sentiment'].value_counts()

neutral     188
negative     16
positive      8
Name: sentiment, dtype: int64

- neutral이 압도적으로 많은 것을 알 수 있다!