# Prepare the models

In [1]:
from transformers import pipeline
# Load two text-classification pipelines from local checkpoint directories.
# Both are created once here and reused by every later cell.
tweet_model = pipeline(
    task="text-classification",
    model="./roberta-base_twitter",
)
imdb_model = pipeline(
    task="text-classification",
    model="./roberta-base_imdb",
)

  from .autonotebook import tqdm as notebook_tqdm


In [80]:
# Sanity check: classify one clearly positive sentence with the IMDB pipeline.
# The recorded output below shows it is returned as LABEL_1, which the later
# cells count as "bullish".
imdb_model("I love this movie")

[{'label': 'LABEL_1', 'score': 0.9983030557632446}]

In [2]:
def analysis_text(text, model="tweet"):
    """Classify ``text`` with one of the two sentiment pipelines loaded above.

    Args:
        text: Input forwarded unchanged to the selected pipeline. The notebook
            passes either a single string or a list of strings.
        model: ``"tweet"`` (default) selects ``tweet_model``; ``"imdb"``
            selects ``imdb_model``.

    Returns:
        Whatever the selected pipeline returns. For a single string the
        recorded output is a list holding one ``{'label': ..., 'score': ...}``
        dict.

    Raises:
        ValueError: If ``model`` is not ``"tweet"`` or ``"imdb"``. Previously
            an unknown name silently returned ``None``, which only failed later
            when the caller tried to iterate over the result.
    """
    if model == "tweet":
        return tweet_model(text)
    if model == "imdb":
        return imdb_model(text)
    raise ValueError(
        f"Unknown model {model!r}; expected 'tweet' or 'imdb'"
    )

# Prepare news documents

In [3]:
import re

# Regex character class used to cut text into fragments: full-width comma,
# ASCII space, or ideographic full stop.
delimiters = "[， 。]"


def split_text_by_length(text, max_length=500):
    """Split ``text`` at delimiters and regroup the fragments into chunks.

    The text is split on ``delimiters``; consecutive fragments are then joined
    with a single space into chunks. Each fragment is budgeted as
    ``len(fragment) + 1`` characters (the extra one accounts for the joining
    space), and a new chunk is started when adding the next fragment would
    push the budget past ``max_length``.

    Args:
        text: The string to split.
        max_length: Character budget per chunk (default 500).

    Returns:
        A list of chunk strings in original order. A single fragment that is
        longer than ``max_length`` is not cut further; it is returned as its
        own over-long chunk.
    """
    words = re.split(delimiters, text)
    split_texts = []
    current_part = []
    current_length = 0

    for word in words:
        cost = len(word) + 1  # +1 for the space inserted by ' '.join
        if current_length + cost <= max_length:
            current_part.append(word)
            current_length += cost
            continue

        # Only flush a chunk that actually holds fragments. Without this
        # guard an over-long first fragment caused an empty '' chunk to be
        # emitted ahead of it.
        if current_part:
            split_texts.append(' '.join(current_part))
        current_part = [word]
        current_length = cost

    if current_part:
        split_texts.append(' '.join(current_part))

    return split_texts

In [4]:
import json
import pandas as pd
from dateutil import parser

# Label every news article in tb.json as bullish / bearish / neutral by a
# majority vote over the per-chunk model predictions, then write one
# {date, label} record per article to the news_with_label directory.
file_path = './news_data/origin/tb.json'

with open(file_path, 'r', encoding='utf-8') as file:
    news_collection = json.load(file)

format_str = "%Y-%m-%d"
tweets_record = []

print(f"All news data count : {len(news_collection)}")

for index, tweet in enumerate(news_collection):
    # Normalise the article timestamp to a plain YYYY-MM-DD date string.
    date = parser.parse(tweet["time"]).strftime(format_str)
    news_detail = {
        "date":date,
    }
    content = tweet["content"]

    # Drop blank chunks before classification: an empty or whitespace-only
    # chunk carries no sentiment but would still receive a label and cast a
    # vote. If nothing is left, skip the model and fall through to "neutral".
    chunks = [chunk for chunk in split_text_by_length(content) if chunk.strip()]
    # NOTE(review): this uses analysis_text's default model ("tweet") for news
    # text, while the Twitter cell passes model="imdb" — confirm this pairing
    # is intended and not swapped.
    res = analysis_text(chunks) if chunks else []

    # Decide the article label by which sentiment shows up in more chunks.
    labels = [data["label"] for data in res]
    bullish = labels.count("LABEL_1")
    bearish = labels.count("LABEL_0")
    if bullish > bearish:
        news_detail["label"]="bullish"
    elif bearish > bullish:
        news_detail["label"]="bearish"
    else:
        # Tie (including "no usable chunks at all").
        news_detail["label"]="neutral"
    if (index+1)%50==0:
        print(f"process data : {index+1}")
    tweets_record.append(news_detail)

df = pd.DataFrame(tweets_record)
df.to_json("./news_data/news_with_label/tb.json", orient="records", indent=4)


All news data count : 5495
process data : 50
process data : 100
process data : 150
process data : 200
process data : 250
process data : 300
process data : 350
process data : 400
process data : 450
process data : 500
process data : 550
process data : 600
process data : 650
process data : 700
process data : 750
process data : 800
process data : 850
process data : 900
process data : 950
process data : 1000
process data : 1050
process data : 1100
process data : 1150
process data : 1200
process data : 1250
process data : 1300
process data : 1350
process data : 1400
process data : 1450
process data : 1500
process data : 1550
process data : 1600
process data : 1650
process data : 1700
process data : 1750
process data : 1800
process data : 1850
process data : 1900
process data : 1950
process data : 2000
process data : 2050
process data : 2100
process data : 2150
process data : 2200
process data : 2250
process data : 2300
process data : 2350
process data : 2400
process data : 2450
process data 

# Prepare Twitter documents

In [86]:
import json
import pandas as pd
from dateutil import parser

# Label every tweet in VitalikButerin.json as bullish / bearish / neutral and
# write one {date, label} record per tweet to the tweeter_with_label directory.
file_path = './tweeter_data/filter/VitalikButerin.json'

with open(file_path, 'r', encoding='utf-8') as file:
    news_collection = json.load(file)

format_str = "%Y-%m-%d"
tweets_record = []

print(f"All tweet data count : {len(news_collection)}")

for index, tweet in enumerate(news_collection):
    # Normalise the tweet timestamp to a plain YYYY-MM-DD date string.
    date = parser.parse(tweet["date"]).strftime(format_str)
    news_detail = {
        "date":date,
    }

    # Classify the tweet's own text together with any reposted text; either
    # field may be None, in which case it is left out.
    tweet_text = tweet.get("tweet_text", "")
    repost_text = tweet.get("repost_text", "")
    content = ""
    if tweet_text is not None:
        content += f"{tweet_text}\n"
    if repost_text is not None:
        content += f"{repost_text}\n"

    # A tweet with no text at all has no sentiment to measure. Previously the
    # whitespace-only content was still classified and received an arbitrary
    # bullish/bearish label; now it falls through to "neutral".
    # NOTE(review): tweets are scored with the "imdb" model while the news
    # cell uses the "tweet" model — confirm this pairing is intended.
    res = analysis_text(text=content, model="imdb") if content.strip() else []

    # Count the predicted labels and pick the majority.
    labels = [data["label"] for data in res]
    bullish = labels.count("LABEL_1")
    bearish = labels.count("LABEL_0")
    if bullish > bearish:
        news_detail["label"]="bullish"
    elif bearish > bullish:
        news_detail["label"]="bearish"
    else:
        news_detail["label"]="neutral"
    if (index+1)%50==0:
        print(f"process data : {index+1}")
    tweets_record.append(news_detail)

df = pd.DataFrame(tweets_record)
df.to_json("./tweeter_data/tweeter_with_label/VitalikButerin.json", orient="records", indent=4)

All tweet data count : 73
process data : 50
