In [1]:
import os
import random

import pandas as pd

In [2]:
# Load the per-ticker metadata table (later cells read its Ticker, Sector and SubIndustry columns).
# A forward-slash relative path is used so the cell runs on Windows and POSIX alike;
# the previous backslash path only resolved on Windows.
stocks_info = pd.read_csv("data/sp500-with-gics.csv")
# stocks_info

In [3]:
# Distinct values of the Sector and SubIndustry columns, each as a plain Python list
# in order of first appearance in stocks_info.
unique_sectors_list, unique_sub_industries_list = (
    stocks_info[column_name].unique().tolist()
    for column_name in ("Sector", "SubIndustry")
)
# unique_sectors_list


In [4]:
# Load the wide quarterly-returns table: a "Date" column plus one column per ticker.
# Forward slashes keep the relative path portable (the backslash form only worked on Windows).
positive_quarterly_returns = pd.read_csv("data/positive_quarterly_return_rates.csv")
tickers = positive_quarterly_returns.columns.to_list()
tickers.remove("Date")  # every remaining column name is treated as a ticker

# Metadata rows restricted to the tickers that have a returns column.
positive_stocks_info = stocks_info[stocks_info['Ticker'].isin(tickers)]

# Reshape wide -> long: one row per (Date, Ticker) pair with the value stored in "Return".
melted_df = pd.melt(positive_quarterly_returns, id_vars=['Date'], var_name='Ticker', value_name='Return')

# melted_df

In [5]:
# Attach each ticker's sector to the long-format returns. The left join keeps every
# returns row, so a ticker missing from stocks_info ends up with a NaN sector.
ticker_and_sector_df = stocks_info.loc[:, ["Ticker", "Sector"]]
merged_df = pd.merge(melted_df, ticker_and_sector_df, how="left", on="Ticker")
merged_df

Unnamed: 0,Date,Ticker,Return,Sector
0,2000-03-31 00:00:00+00:00,A,0.676857,Health Care
1,2000-06-30 00:00:00+00:00,A,0.361111,Health Care
2,2000-09-30 00:00:00+00:00,A,-0.248087,Health Care
3,2000-12-31 00:00:00+00:00,A,-0.313825,Health Care
4,2001-03-31 00:00:00+00:00,A,0.006180,Health Care
...,...,...,...,...
21307,2022-12-31 00:00:00+00:00,YUM,-0.060535,Consumer Discretionary
21308,2023-03-31 00:00:00+00:00,YUM,0.168929,Consumer Discretionary
21309,2023-06-30 00:00:00+00:00,YUM,0.054058,Consumer Discretionary
21310,2023-09-30 00:00:00+00:00,YUM,0.035287,Consumer Discretionary


In [6]:

# Split the long-format returns by sector. For every sector this writes a wide
# Date x Ticker CSV and records which tickers (and how many) belong to it.
sector_output_dir = "data/positive_returns_per_sector"
# DataFrame.to_csv does not create missing directories, so make sure the target
# exists; otherwise the first write fails on a fresh checkout.
os.makedirs(sector_output_dir, exist_ok=True)

sector_count = {}  # sector name -> number of tickers with return data
sector_dict = {}   # sector name -> list of those tickers
for sector in unique_sectors_list:
    sector_df = merged_df[merged_df["Sector"] == sector]
    # Back to wide form: one row per Date, one column per ticker of this sector.
    positive_returns_pivot = sector_df.pivot(index='Date', columns='Ticker', values='Return').reset_index()

    # Everything except the restored "Date" column is a ticker.
    stock_names_per_sector = [
        column_name for column_name in positive_returns_pivot.columns.to_list() if column_name != "Date"
    ]
    sector_count[sector] = len(stock_names_per_sector)
    sector_dict[sector] = stock_names_per_sector

    csv_filename = f"{sector}_positive_returns.csv"
    positive_returns_pivot.to_csv(f"{sector_output_dir}/{csv_filename}", index=False)

In [55]:
# sector_count

In [56]:
# sector_dict

In [8]:
def get_n_random_stock_per_sector(stocks_num_per_sector, sector_dict, rng=None):
    """Draw up to ``stocks_num_per_sector`` random tickers from every sector.

    Parameters
    ----------
    stocks_num_per_sector : int
        Number of tickers to sample from each sector. A sector holding fewer
        tickers than this contributes all of them (in random order).
    sector_dict : dict
        Maps a sector name to the list of tickers belonging to it.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator; pass ``random.Random(seed)`` for a reproducible draw.

    Returns
    -------
    list
        Flat list of the sampled tickers, grouped in ``sector_dict``
        iteration order.

    Raises
    ------
    ValueError
        If ``stocks_num_per_sector`` is negative (raised by ``sample``).
    """
    sampler = random if rng is None else rng
    selected_stocks = []
    for sector_stocks in sector_dict.values():
        # Never ask for more tickers than the sector actually has.
        sample_size = min(stocks_num_per_sector, len(sector_stocks))
        selected_stocks.extend(sampler.sample(sector_stocks, sample_size))
    return selected_stocks

In [None]:
# Sample tickers from every sector and save their return columns as a smaller dataset.
# NOTE(review): no random seed is set in the visible cells, so each run overwrites the
# CSV with a different selection.
stocks_num_per_sector = 3
n_stocks_per_sector = get_n_random_stock_per_sector(stocks_num_per_sector, sector_dict)
columns_to_keep = ["Date"] + n_stocks_per_sector
output_df = positive_quarterly_returns[columns_to_keep]

# to_csv does not create missing directories, so make sure the target exists.
random_data_dir = "data/random_data"
os.makedirs(random_data_dir, exist_ok=True)
# The file name is derived from stocks_num_per_sector so it always matches the sample size.
output_df.to_csv(f"{random_data_dir}/{stocks_num_per_sector}_stocks_per_sector.csv", index=False)


In [13]:
# output_df

In [10]:
# Reload the sampled dataset from disk and display it. Forward slashes keep the
# relative path portable (the backslash form only resolved on Windows).
df = pd.read_csv("data/random_data/3_stocks_per_sector.csv")
df

Unnamed: 0,Date,GD,CHRW,ALK,VRTX,STE,UHS,MSFT,JKHY,AAPL,...,PPG,ESS,PSA,O,MKC,PG,TSN,CVX,VLO,COP
0,2000-03-31 00:00:00+00:00,-0.033251,0.104629,-0.055649,0.404206,-0.233333,0.130930,0.250839,0.295181,0.086113,...,0.037838,0.032249,-0.003586,-0.034981,-0.085548,-0.045100,-0.082143,-0.095947,-0.083833,-0.068123
1,2000-06-30 00:00:00+00:00,0.041488,-0.103987,-0.171429,0.214642,0.031056,0.315436,-0.220375,0.309123,0.190954,...,-0.109925,0.108994,-0.036610,-0.017299,0.116813,-0.443375,-0.359525,0.113223,0.672422,0.035410
2,2000-09-30 00:00:00+00:00,0.035898,0.357779,-0.066810,1.367123,-0.126506,0.359693,-0.119669,0.391418,-0.200187,...,-0.126920,0.179584,0.120254,0.226798,0.072460,-0.056549,-0.130625,-0.062673,-0.018908,0.104360
3,2000-12-31 00:00:00+00:00,0.202535,0.130671,-0.136259,0.518519,0.462069,0.281426,-0.260938,-0.095002,-0.545134,...,-0.133279,0.318996,-0.001500,0.002311,-0.086771,0.225485,0.103330,0.016826,0.131385,0.252793
4,2001-03-31 00:00:00+00:00,0.201208,0.117370,0.264706,-0.230183,0.235849,0.179356,-0.266385,0.379496,-0.386599,...,0.162415,-0.004906,0.050370,0.072452,0.236058,0.158884,0.215190,-0.000944,0.018296,-0.095588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,2022-12-31 00:00:00+00:00,-0.014875,-0.028043,-0.030605,0.035187,-0.191921,-0.123657,-0.070610,0.015988,0.026761,...,-0.026434,-0.066325,-0.030485,-0.146242,-0.120170,-0.114747,-0.212641,0.045039,0.049200,0.231622
92,2023-03-31 00:00:00+00:00,0.133585,-0.079894,0.056377,-0.038362,0.094296,0.541096,-0.002014,-0.054536,-0.120551,...,0.116725,-0.134918,-0.061026,0.090206,0.164114,0.187812,-0.039827,0.155578,0.076324,0.039017
93,2023-06-30 00:00:00+00:00,-0.060330,0.081182,-0.001660,0.105972,0.029230,-0.103393,0.201893,-0.142153,0.330645,...,0.109317,-0.002780,0.134003,-0.002972,0.010183,-0.007402,-0.058399,-0.014585,0.162457,-0.031319
94,2023-09-30 00:00:00+00:00,-0.065977,-0.008965,0.263721,0.098758,0.152827,0.218944,0.179295,0.121194,0.159814,...,0.069262,0.147103,-0.037772,-0.030231,0.050462,0.026385,-0.119859,-0.065905,-0.144471,-0.027884
