In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset from a CSV path relative to the working directory.
# Later cells read its 'date' column, the NLP score columns and 'ticker'.
data = pd.read_csv('./data/no_transcript_final_dataset.csv')

In [33]:
def add_to_dict(asset_df:pd.DataFrame, portfolio_dict:dict, date:str, nlp:str):
    """Store one portfolio snapshot in ``portfolio_dict`` under the following month.

    Args:
        asset_df: Frame providing the 'ticker', ``nlp``, 'normalized_weights'
            and 'decile' columns.
        portfolio_dict: Mapping that is updated in place.
        date: Date string understood by ``pd.to_datetime``.
        nlp: Name of the column whose values are stored under 'score'.

    Returns:
        The same ``portfolio_dict`` object, with a new entry keyed by ``date``
        shifted forward one calendar month and formatted as YYYY-MM-DD.
    """
    # The entry is keyed by the date one month after the one passed in.
    shifted = pd.to_datetime(date) + pd.DateOffset(months=1)
    key = shifted.strftime('%Y-%m-%d')

    # Output field -> source column, in the order the fields are stored.
    field_to_column = {
        'ticker': 'ticker',
        'score': nlp,
        'weight': 'normalized_weights',
        'decile': 'decile',
    }
    portfolio_dict[key] = {
        field: asset_df[column].tolist()
        for field, column in field_to_column.items()
    }
    return portfolio_dict

def get_decile(sub_data:pd.DataFrame, nlp:str)->pd.DataFrame:
    """Rank rows by ``nlp`` and split them into ten equally sized deciles.

    Rows are sorted by the ``nlp`` column in descending order, so decile 1
    holds the highest scores and decile 10 the lowest of the kept rows. Each
    decile contains ``len(sub_data) // 10`` rows; when the row count is not a
    multiple of ten, the remaining lowest-ranked rows are left out of the
    result.

    Args:
        sub_data: Frame containing the ``nlp`` column. It is not modified.
        nlp: Name of the column to rank by.

    Returns:
        A new frame with the kept rows in ranked order plus two columns:
        'normalized_weights' (``1 / decile_size``, i.e. equal weights within
        a decile) and 'decile' (1 to 10). If ``sub_data`` has fewer than ten
        rows, no decile can be formed and an empty frame with those two
        columns is returned.
    """
    decile_size = len(sub_data) // 10
    sorted_data = sub_data.sort_values(nlp, ascending=False)

    if decile_size == 0:
        # Fewer than ten rows: return an empty result rather than dividing by zero.
        empty = sorted_data.iloc[:0].copy()
        empty["normalized_weights"] = pd.Series(dtype="float64")
        empty["decile"] = pd.Series(dtype="int64")
        return empty

    # Take an explicit copy so the new columns are written to our own frame
    # and not to a slice of the caller's data (no SettingWithCopyWarning).
    ranked = sorted_data.iloc[:decile_size * 10].copy()
    ranked["normalized_weights"] = 1 / decile_size
    # Positions 0..decile_size-1 -> decile 1, the next decile_size -> 2, etc.
    ranked["decile"] = [pos // decile_size + 1 for pos in range(len(ranked))]
    return ranked

In [36]:
from tqdm import tqdm
import pickle
import warnings
import os
from pandas.errors import SettingWithCopyWarning
# Silence pandas' SettingWithCopyWarning from here on; the filter is active
# while get_decile is called in the loop below.
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))

# Create the directory the pickles are actually written to. Previously a
# different directory ('./data/portfolio_decile_value') was created, so a
# fresh run failed with FileNotFoundError on the first open() below.
output_dir = './data/portfolio_decile'
os.makedirs(output_dir, exist_ok=True)

nlp_columns = ['GHR-bigram-tf-idf','GHR-bigram-unigram-tf-idf','GHR-bigram-LM-unigram-tf-idf','enhanced-bow','vader-standardized']
min_rows_per_month = 100  # months with fewer rows are skipped
data['date'] = pd.to_datetime(data['date'])

# Year/month of every row, computed once instead of on every loop iteration.
row_years = data['date'].dt.year
row_months = data['date'].dt.month

start_yr, end_yr = data['date'].min().year,data['date'].max().year
for nlp in tqdm(nlp_columns):
    # One dictionary (and one pickle file) per NLP score column.
    portfolio_dict = {}
    for yr in range(start_yr,end_yr+1):
        for mh in range(1,13):
            sub_data = data[(row_years == yr) & (row_months == mh)]
            if len(sub_data) < min_rows_per_month:
                continue
            decile_df = get_decile(sub_data, nlp) # Get the decile portfolio
            # add_to_dict stores the entry under the month after yr-mh.
            portfolio_dict = add_to_dict(decile_df, portfolio_dict, f'{yr}-{mh}-01', nlp)
    with open(f'{output_dir}/portfolio_dict_{nlp}.pkl', 'wb') as handle:
        pickle.dump(portfolio_dict, handle)

100%|██████████| 5/5 [00:02<00:00,  2.25it/s]
