In [16]:
import os
import json
import pandas as pd

In [17]:
def load_suno_files(filename: str, directory: str = './results') -> pd.DataFrame:
    """
    Load one Suno Research JSON news file into a date-indexed DataFrame.

    The ticker is derived from the file name by removing the ``suno-`` and
    ``.json`` substrings and upper-casing the remainder
    (e.g. ``suno-petr4.json`` -> ``PETR4``).

    Parameters
    ----------
    filename : str
        Name of the JSON file inside ``directory``.
    directory : str, default './results'
        Folder that contains the file.

    Returns
    -------
    pd.DataFrame
        The records of the file, de-duplicated on (date, title) keeping the
        first occurrence, indexed and sorted by the parsed ``date`` column,
        with an additional ``ticker`` column.

    Raises
    ------
    KeyError
        If the records have no ``date`` or ``title`` field.
    ValueError
        If a date string does not match the ``%d/%m/%Y %H:%M`` format.
    """
    ticker = filename.replace('suno-', '').replace('.json', '').upper()

    with open(os.path.join(directory, filename), encoding='utf8') as json_file:
        records = json.load(json_file)

    df_news = pd.DataFrame(records)

    # Dates arrive as day-first strings; parse them before de-duplicating so
    # that identical timestamps compare equal.
    df_news['date'] = pd.to_datetime(df_news['date'], format='%d/%m/%Y %H:%M')

    # Chain the remaining steps so no frame is mutated in place.
    return (
        df_news
        .drop_duplicates(subset=['date', 'title'], keep='first')
        .set_index('date')
        .sort_index()
        .assign(ticker=ticker)
    )

In [18]:
directory = 'results'

# Date window used only for the per-file row-count report printed below.
report_start, report_end = '2018-01-02', '2022-07-02'

frames = list()

# Collect the JSON files of the directory. The scandir iterator is closed by
# the `with` block, non-JSON entries (e.g. checkpoint folders) are skipped,
# and the entries are sorted by name so the load order does not depend on the OS.
with os.scandir(directory) as entries:
    json_entries = sorted(
        (entry for entry in entries
         if entry.is_file() and entry.name.endswith('.json')),
        key=lambda entry: entry.name,
    )

for filename in json_entries:
    df_news = load_suno_files(filename.name)

    print (filename.name, df_news.loc[report_start:report_end].shape)
    frames.append(df_news)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and copying the
# accumulated frame on every iteration is quadratic. pd.concat raises on an
# empty list, so fall back to an empty frame when nothing was loaded.
df_news_final = pd.concat(frames) if frames else pd.DataFrame()

suno-b3sa3.json (605, 6)
suno-eqtl3.json (43, 6)
suno-itub4.json (588, 6)
suno-petr4.json (1344, 6)
suno-vale3.json (1012, 6)


In [19]:
# Summarise the combined frame: index range, column names, non-null counts and dtypes.
df_news_final.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3738 entries, 2018-08-29 11:52:00 to 2022-08-03 10:49:00
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   topic        3738 non-null   object
 1   title        3738 non-null   object
 2   search_date  3738 non-null   object
 3   url          3738 non-null   object
 4   tags         3738 non-null   object
 5   ticker       3738 non-null   object
dtypes: object(6)
memory usage: 204.4+ KB


In [20]:
# Display 10 randomly chosen rows as a spot check; no random_state is passed,
# so the rows shown differ from run to run.
df_news_final.sample(10)

Unnamed: 0_level_0,topic,title,search_date,url,tags,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-04-06 11:01:00,Negócios,Acionistas minoritários da Petrobras (PETR4) a...,2022-05-03 16:12:00,https://www.suno.com.br/noticias/acionistas-mi...,[Petrobras (PETR4)],PETR4
2020-09-23 17:01:00,Negócios,Ações da Localiza (RENT3) e Unidas disparam ma...,2022-05-23 23:04:33,https://www.suno.com.br/noticias/acoes-da-loca...,"[B3 (B3SA3), Localiza (RENT3), Unidas (LCAM3)]",B3SA3
2021-10-04 21:11:00,Negócios,Radar: Petrobras (PETR4) vai se desfazer de fa...,2022-05-03 16:30:17,https://www.suno.com.br/noticias/radar-petrobr...,"[3R Petroleum (RRRP3), B3 (B3SA3), Banco Inter...",VALE3
2020-08-24 21:01:00,Negócios,Petrobras (PETR4) inicia processo para venda d...,2022-05-03 16:21:51,https://www.suno.com.br/noticias/petrobras-pet...,"[PETR3, Petrobras (PETR4)]",PETR4
2021-03-13 12:00:00,Entrevistas,Capitânia: Indústria de fundos imobiliários po...,2022-08-04 00:04:27,https://www.suno.com.br/noticias/capitania-ind...,"[Fundos Imobiliários (FIIs), B3 (B3SA3), Bolsa...",B3SA3
2022-05-16 10:39:00,Negócios,Índice do agronegócio estreia na B3 (B3SA3) co...,2022-08-04 00:00:54,https://www.suno.com.br/noticias/indice-bolsa-...,"[B3 (B3SA3), Agências, Agronegócio, B3 (B3SA3)]",B3SA3
2019-12-16 09:48:00,Negócios,Agenda do Dia: Gol; Vale; Petrobras; Engie; Sa...,2022-05-03 16:44:41,https://www.suno.com.br/noticias/agenda-do-dia...,"[agenda do dia, Gol Linhas Aéreas (GOLL11/GOLL...",VALE3
2021-05-03 17:27:00,Negócios,Petrobras (PETR4): Empresas devem participar d...,2022-05-03 16:12:03,https://www.suno.com.br/noticias/leilao-parcer...,"[PETR3, Petrobras (PETR4)]",PETR4
2021-11-03 09:40:00,Negócios,Fique de olho nos resultados do Itaú (ITUB4) e...,2022-08-04 01:19:24,https://www.suno.com.br/noticias/fique-de-olho...,"[Itaú Unibanco (ITUB3/ITUB4), Allied (ALLD3), ...",ITUB4
2020-09-28 17:27:00,Mercado,"Ibovespa fecha em queda de 2,41%, a 94.666,37 ...",2022-05-03 16:20:54,https://www.suno.com.br/noticias/ibovespa-fech...,"[Compass (PASS3), Cosan (CSAN3), Ibovespa, PET...",PETR4


In [21]:
# Serialize the combined news DataFrame to a pickle file in the current working directory.
df_news_final.to_pickle('./all_news_suno.pkl')

In [22]:
# Export the combined news frame as a ';'-separated UTF-8 CSV.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs('./final', exist_ok=True)
df_news_final.to_csv('./final/df_suno_with_mentions.csv', sep=';', encoding='utf8')