# Prepare the model

In [10]:
from transformers import pipeline
# Build a text-classification pipeline from the model at ./roberta-base_twitter
# (a path relative to the notebook's working directory). The labelling cell
# further down treats "LABEL_1" as bullish and "LABEL_0" as bearish.
sentiment_model = pipeline(task="text-classification", model="./roberta-base_twitter")

In [24]:
# Quick sanity check on a single sentence. For one input string the pipeline
# returns a list holding one dict with 'label' and 'score' keys.
sentiment_model("I love this movie")

[{'label': 'LABEL_1', 'score': 0.5284806489944458}]

In [25]:
def analysis_text(text):
    """Classify ``text`` with the notebook-level ``sentiment_model`` pipeline.

    Args:
        text: Input forwarded to the pipeline untouched. In this notebook it
            is called with a list of text chunks.

    Returns:
        Whatever ``sentiment_model`` returns for ``text``, unmodified. The
        caller iterates over it and reads each item's ``"label"`` key.
    """
    predictions = sentiment_model(text)
    return predictions

# Prepare the document

In [61]:
import re

# Regex character class of split points: full-width comma, ASCII space,
# full-width period.
delimiters = "[， 。]"


def _append_chunk(parts, chunks):
    """Join ``parts`` with single spaces and append to ``chunks`` unless blank."""
    chunk = ' '.join(parts)
    # A chunk with no visible characters carries nothing to classify, so it
    # must not reach the model and cast a vote.
    if chunk.strip():
        chunks.append(chunk)


def split_text_by_length(text, max_length=500):
    """Split ``text`` into chunks no longer than ``max_length`` characters.

    ``text`` is cut at every character matched by ``delimiters``. The
    resulting fragments are packed greedily, in order, into chunks and joined
    with a single space. Each fragment is budgeted as ``len(fragment) + 1``
    (the fragment plus one separator), so a chunk built from several
    fragments holds at most ``max_length - 1`` characters.

    Edge cases:
        * A fragment longer than ``max_length`` (no delimiter inside it) is
          hard-split into ``max_length``-sized pieces so that no returned
          chunk exceeds the limit.
        * Chunks that are empty or whitespace-only are dropped; an empty
          ``text`` therefore yields ``[]``.

    Args:
        text: The string to split.
        max_length: Upper bound on the length of each returned chunk.

    Returns:
        A list of non-blank strings, in original order.
    """
    # Slice width for oversize fragments; clamped so the range step is never
    # zero or negative.
    piece_width = max(1, max_length)

    split_texts = []
    current_part = []
    current_length = 0

    for fragment in re.split(delimiters, text):
        # Break up fragments that could never fit in one chunk. An empty
        # fragment (adjacent delimiters) is kept as a single empty piece so
        # the joined spacing matches the plain greedy packing.
        pieces = [
            fragment[start:start + piece_width]
            for start in range(0, len(fragment), piece_width)
        ] or [fragment]

        for piece in pieces:
            if current_length + len(piece) + 1 <= max_length:
                current_part.append(piece)
                current_length += len(piece) + 1
            else:
                # Close the chunk in progress (skipped when there is nothing
                # in it yet) and start a new one with this piece.
                _append_chunk(current_part, split_texts)
                current_part = [piece]
                current_length = len(piece) + 1

    _append_chunk(current_part, split_texts)
    return split_texts

In [70]:
import json
import pandas as pd
from dateutil import parser

file_path = './news_data/cd_large.json'

# Read the whole news collection into memory up front.
with open(file_path, 'r', encoding='utf-8') as file:
    news_collection = json.load(file)

format_str = "%Y-%m-%d"
news_record = []

print(f"All news data count : {len(news_collection)}")

for index, news in enumerate(news_collection):
    # Reduce the article timestamp to a plain calendar date string.
    date = parser.parse(news["time"]).strftime(format_str)

    # Classify the article chunk by chunk.
    content = news["content"]
    res = analysis_text(split_text_by_length(content))

    # Decide directly which sentiment shows up more often in the article:
    # tally one vote per chunk, ignoring any label other than the two below.
    votes = {"LABEL_1": 0, "LABEL_0": 0}
    for data in res:
        label = data["label"]
        if label in votes:
            votes[label] += 1
    bullish = votes["LABEL_1"]
    bearish = votes["LABEL_0"]

    # A tie (including zero votes on both sides) is recorded as neutral.
    if bullish == bearish:
        overall = "neutral"
    elif bullish > bearish:
        overall = "bullish"
    else:
        overall = "bearish"

    news_detail = {"date": date, "label": overall}

    # Report progress every 50 articles.
    processed = index + 1
    if processed % 50 == 0:
        print(f"process data : {processed}")
    news_record.append(news_detail)

# One row per article with its date and majority label, written as JSON records.
df = pd.DataFrame(news_record)
df.to_json("cd_large.json", orient="records")

        


All news data count : 12083
process data : 50
process data : 100
process data : 150
process data : 200
process data : 250
process data : 300
process data : 350
process data : 400
process data : 450
process data : 500
process data : 550
process data : 600
process data : 650
process data : 700
process data : 750
process data : 800
process data : 850
process data : 900
process data : 950
process data : 1000
process data : 1050
process data : 1100
process data : 1150
process data : 1200
process data : 1250
process data : 1300
process data : 1350
process data : 1400
process data : 1450
process data : 1500
process data : 1550
process data : 1600
process data : 1650
process data : 1700
process data : 1750
process data : 1800
process data : 1850
process data : 1900
process data : 1950
process data : 2000
process data : 2050
process data : 2100
process data : 2150
process data : 2200
process data : 2250
process data : 2300
process data : 2350
process data : 2400
process data : 2450
process data