# MOCK Get Data

## Setup

In [32]:
import pandas as pd
import numpy as np

In [78]:
input_dir = 'inputs/'

company_info_path = input_dir + 'company_info.csv'
monthly_returns_path = input_dir + 'monthly_returns.csv'
company_themes_path = input_dir + 'company_themes.csv'

## Create Mock Company Information

In [34]:
# Randomly generate 25 unique CIKs and put them in a dataframe
company_info = pd.DataFrame(np.random.randint(1000000, 9999999, 25), columns=['CIK'])

# Randomly assign eight industry sectors to each CIK
company_info['sector'] = np.random.choice(
    ['Energy', 'Materials', 'Industrials', 'Consumer Discretionary', 'Consumer Staples', 'Health Care', 'Financials', 'Information Technology', 'Communication Services', 'Utilities'],
    25)

# Randomly assign a market cap from 100M to 100B for each CIK
company_info['market_cap'] = np.random.randint(100, 100000, 25)

# Randomly generate three ESG scores of varying ranges/distriubtions for each
company_info['esg1'] = np.random.normal(50, 10, 25)
company_info['esg2'] = np.random.uniform(0, 100, 25)
company_info['esg3'] = np.random.standard_gamma(1, 25)

In [35]:
company_info

Unnamed: 0,CIK,sector,market_cap,esg1,esg2,esg3
0,5220704,Health Care,39611,54.703106,48.160889,0.220305
1,2332044,Utilities,82878,41.660446,97.178136,1.947844
2,9637119,Financials,98896,34.81785,46.831737,0.389516
3,3355642,Materials,23420,63.150812,81.00476,0.34054
4,1883457,Financials,37909,36.628471,1.818304,2.353531
5,2069450,Information Technology,93710,59.247939,88.082833,0.00917
6,3542095,Health Care,73549,54.320938,23.435024,0.213535
7,6649678,Health Care,68854,64.127276,34.15951,2.127681
8,9763004,Health Care,80415,44.937475,52.142473,2.905877
9,9970121,Communication Services,94857,51.042855,74.201731,0.253233


In [36]:
# Save mock data to a csv file
company_info.to_csv(company_info_path, index=False)

## Create Mock Monthly Returns Data

In [72]:
# For each month from 2013-01 to 2020-01, randomly generate a return for each CIK
monthly_returns = pd.DataFrame(columns=['CIK', 'date', 'ret'])

dates = pd.date_range(start='2013-01-01', end='2020-01-01', freq='MS')

for cik in company_info['CIK']:
  # Generate a set of monthly returns for each CIK
  curr_monthly_returns = pd.DataFrame({
      'CIK': [cik for _ in range(len(dates))],
      'date': dates,
      'ret': np.random.normal(0, 1, len(dates))})
  
  # Append the current CIK's monthly returns to the dataframe
  monthly_returns = pd.concat([monthly_returns, curr_monthly_returns])

# Rescale the returns to be between -1 and 1
monthly_returns['ret'] = monthly_returns['ret'] / monthly_returns['ret'].abs().max()

# Add predicted returns for each CIK
monthly_returns['pred_ret1'] = monthly_returns['ret'] + np.random.normal(0, 0.2, len(monthly_returns)) - 0.1
monthly_returns['pred_ret2'] = monthly_returns['ret'] + np.random.normal(0, 0.2, len(monthly_returns)) - 0.1
monthly_returns['pred_ret3'] = monthly_returns['ret'] + np.random.normal(0, 0.2, len(monthly_returns)) - 0.1


In [73]:
monthly_returns

Unnamed: 0,CIK,date,ret,pred_ret1,pred_ret2,pred_ret3
0,5220704,2013-01-01,0.042038,-0.000309,-0.047066,-0.413661
1,5220704,2013-02-01,-0.098542,-0.394995,0.163334,-0.315712
2,5220704,2013-03-01,-0.210259,-0.364391,-0.291918,-0.145896
3,5220704,2013-04-01,0.038368,-0.005837,-0.150492,0.006581
4,5220704,2013-05-01,0.151490,0.122270,-0.050050,0.602736
...,...,...,...,...,...,...
80,1051688,2019-09-01,0.820370,1.100433,0.532494,0.940478
81,1051688,2019-10-01,0.243218,0.135231,0.458307,0.240781
82,1051688,2019-11-01,-0.221654,-0.303337,-0.308688,-0.138715
83,1051688,2019-12-01,-0.032243,-0.045724,-0.448300,0.201187


In [74]:
monthly_returns.to_csv(monthly_returns_path, index=False)

## Create Mock Themes

In [75]:
# Randomly generate pairs of CIKs and investing themes
company_themes = pd.DataFrame(columns=['CIK', 'theme'])

themes = ['environment', 'social', 'governance', 'energy', 'ethics']

for cik in company_info['CIK']:
  # Select a random number of themes for each CIK
  num_themes = np.random.randint(0, 10)

  # Generate a set of themes for each CIK
  curr_company_themes = pd.DataFrame({
      'CIK': [cik for _ in range(num_themes)],
      'theme': np.random.choice(themes, num_themes)})
  
  # Append the current CIK's themes to the dataframe
  company_themes = pd.concat([company_themes, curr_company_themes])

In [79]:
company_themes

Unnamed: 0,CIK,theme
0,5220704,energy
0,2332044,social
1,2332044,ethics
2,2332044,governance
3,2332044,social
...,...,...
0,1051688,social
1,1051688,energy
2,1051688,ethics
3,1051688,social


In [80]:
company_themes.to_csv(company_themes_path, index=False)