In [1]:
import os
import random

import pandas as pd

In [2]:
# Load the per-ticker metadata table (presumably the S&P 500 constituents with
# their GICS classification, judging by the file name). Later cells read its
# 'Ticker', 'Sector' and 'SubIndustry' columns.
# The path uses forward slashes so it resolves on Linux/macOS as well as on
# Windows; a backslash-separated relative path only works on Windows.
stocks_info = pd.read_csv('data/sp500-with-gics.csv')

In [3]:
# Distinct values of the 'Sector' and 'SubIndustry' columns of stocks_info,
# each converted to a plain Python list.
unique_sectors_list, unique_sub_industries_list = (
    stocks_info[column_name].unique().tolist()
    for column_name in ('Sector', 'SubIndustry')
)


In [4]:
# Load the wide quarterly-returns table: a "Date" column plus one column per ticker.
# Forward slashes keep the relative path valid outside Windows as well.
positive_quarterly_returns = pd.read_csv("data/positive_quarterly_return_rates.csv")

# Every column other than "Date" is treated as a ticker symbol.
tickers = positive_quarterly_returns.columns.to_list()
tickers.remove("Date")

# Metadata rows restricted to tickers that actually appear in the returns table.
positive_stocks_info = stocks_info[stocks_info['Ticker'].isin(tickers)]

# Reshape wide -> long: one row per (Date, Ticker) pair, value stored in 'Return'.
melted_df = pd.melt(
    positive_quarterly_returns,
    id_vars=['Date'],
    var_name='Ticker',
    value_name='Return',
)

In [5]:
# Lookup table holding only the two columns needed for the join.
ticker_and_sector_df = stocks_info.loc[:, ['Ticker', 'Sector']]

# Attach the sector to every long-format return row. A left join keeps each
# return row even when its ticker has no match in the lookup table.
merged_df = pd.merge(
    melted_df,
    ticker_and_sector_df,
    how='left',
    on='Ticker',
)
merged_df

Unnamed: 0,Date,Ticker,Return,Sector
0,2000-03-31 00:00:00+00:00,A,0.676857,Health Care
1,2000-06-30 00:00:00+00:00,A,0.361111,Health Care
2,2000-09-30 00:00:00+00:00,A,-0.248087,Health Care
3,2000-12-31 00:00:00+00:00,A,-0.313825,Health Care
4,2001-03-31 00:00:00+00:00,A,0.006180,Health Care
...,...,...,...,...
21307,2022-12-31 00:00:00+00:00,YUM,-0.060535,Consumer Discretionary
21308,2023-03-31 00:00:00+00:00,YUM,0.168929,Consumer Discretionary
21309,2023-06-30 00:00:00+00:00,YUM,0.054058,Consumer Discretionary
21310,2023-09-30 00:00:00+00:00,YUM,0.035287,Consumer Discretionary


In [6]:

# Split the long returns table by sector, pivot each slice back to wide form
# (one row per Date, one column per ticker) and write it to its own CSV file.
# While doing so, record which tickers belong to each sector.
sector_output_dir = 'data/positive_returns_per_sector'
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(sector_output_dir, exist_ok=True)

sector_count = {}  # sector name -> number of tickers found for that sector
sector_dict = {}   # sector name -> list of those tickers
for sector in unique_sectors_list:
    sector_df = merged_df[merged_df["Sector"] == sector]
    positive_returns_pivot = sector_df.pivot(index='Date', columns='Ticker', values='Return').reset_index()

    # After reset_index the columns are "Date" followed by the sector's tickers.
    stock_names_per_sector = positive_returns_pivot.columns.to_list()
    stock_names_per_sector.remove("Date")
    sector_count[sector] = len(stock_names_per_sector)
    sector_dict[sector] = stock_names_per_sector

    csv_filename = f"{sector}_positive_returns.csv"
    positive_returns_pivot.to_csv(f"{sector_output_dir}/{csv_filename}", index=False)

In [55]:
# sector_count

In [56]:
# sector_dict

In [7]:
def get_n_random_stock_per_sector(stocks_num_per_sector, sector_dict, rng=None):
    """Randomly pick up to ``stocks_num_per_sector`` tickers from every sector.

    Args:
        stocks_num_per_sector: Number of tickers to draw from each sector. If a
            sector holds fewer tickers than this, all of its tickers are
            returned (in random order).
        sector_dict: Mapping of sector name -> list of tickers in that sector.
        rng: Optional ``random.Random`` instance used for the draws. Pass a
            seeded instance to get reproducible selections. When omitted, the
            module-level ``random`` generator is used.

    Returns:
        A flat list of the selected tickers, grouped sector by sector in the
        iteration order of ``sector_dict``.

    Raises:
        ValueError: If ``stocks_num_per_sector`` is negative (raised by
            ``random.sample``).
    """
    sampler = random if rng is None else rng
    selected_stocks = []
    for sector_stocks in sector_dict.values():
        # Cap the request at the sector's size; sample() rejects oversized draws.
        sample_size = min(stocks_num_per_sector, len(sector_stocks))
        selected_stocks.extend(sampler.sample(sector_stocks, sample_size))
    return selected_stocks

In [None]:
# Disabled one-off export: draws one random ticker per sector and saves the
# matching columns of positive_quarterly_returns. If re-enabled, its last line
# writes data/random_data/1_stocks_per_sector.csv; the final cell of this
# notebook reads a file with that name, so that file must already exist on disk.
# stocks_num_per_sector = 1
# n_stocks_per_sector = get_n_random_stock_per_sector(stocks_num_per_sector, sector_dict)
# columns_to_keep = ["Date"] + n_stocks_per_sector
# output_df = positive_quarterly_returns[columns_to_keep]
# # output_df.to_csv(r"data/random_data/n_stocks_per_sector.csv", index=False)
# output_df.to_csv(r"data/random_data/1_stocks_per_sector.csv", index=False)


In [9]:
# Write nine random selections (iterations 2 through 10). Each file holds the
# "Date" column plus the returns of the tickers drawn for that iteration.
# NOTE: the draws are not seeded, so the selected tickers differ between runs.
stocks_num_per_sector = 1  # tickers drawn per sector; constant across iterations
random_data_dir = "data/random_data"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(random_data_dir, exist_ok=True)

# The loop variable is named `iteration` so the builtin `iter` is not shadowed
# for the rest of the kernel session.
for iteration in range(2, 11):
    n_stocks_per_sector = get_n_random_stock_per_sector(stocks_num_per_sector, sector_dict)
    columns_to_keep = ["Date"] + n_stocks_per_sector
    output_df = positive_quarterly_returns[columns_to_keep]
    output_df.to_csv(
        f"{random_data_dir}/{stocks_num_per_sector}_stocks_per_sector_{iteration}_iter.csv",
        index=False,
    )

In [13]:
# output_df

In [9]:
# Reload a previously exported single-sample file for inspection.
# Forward slashes keep the relative path valid on Linux/macOS as well as Windows.
df = pd.read_csv('data/random_data/1_stocks_per_sector.csv')
df

Unnamed: 0,Date,MMM,A,SNPS,OMC,ROST,EIX,GS,NUE,PSA,GIS,WMB
0,2000-03-31 00:00:00+00:00,-0.000663,0.676857,-0.085440,0.088923,-0.123028,-0.039864,0.159146,0.086062,-0.003586,-0.086236,-0.059709
1,2000-06-30 00:00:00+00:00,-0.020183,0.361111,-0.305052,-0.016579,0.291153,-0.315104,0.196261,-0.086919,-0.036610,0.048503,0.411607
2,2000-09-30 00:00:00+00:00,-0.079104,-0.248087,-0.197531,-0.046827,-0.224283,0.231182,-0.098965,-0.269955,0.120254,0.074413,-0.033879
3,2000-12-31 00:00:00+00:00,0.097314,-0.313825,0.011966,-0.184021,-0.160446,-0.024390,0.216722,-0.161039,-0.001500,-0.060425,0.030177
4,2001-03-31 00:00:00+00:00,0.307949,0.006180,0.194257,0.070974,0.162345,-0.239351,-0.130029,0.286328,0.050370,0.244497,-0.116739
...,...,...,...,...,...,...,...,...,...,...,...,...
91,2022-12-31 00:00:00+00:00,-0.109799,0.061981,0.052286,-0.001225,0.208492,-0.057474,0.007280,0.087345,-0.030485,0.042881,-0.047281
92,2023-03-31 00:00:00+00:00,0.094365,0.188984,0.008518,0.310116,0.351007,0.094579,0.164936,0.150704,-0.061026,0.067820,0.106338
93,2023-06-30 00:00:00+00:00,-0.134853,-0.079682,0.216654,0.162556,-0.059134,0.109113,-0.050147,0.182491,0.134003,0.038971,-0.057412
94,2023-09-30 00:00:00+00:00,-0.018099,-0.132831,0.115879,0.011249,0.026342,-0.010036,0.007892,0.075661,-0.037772,-0.096819,0.111861
