In [71]:
import os
from bs4 import BeautifulSoup

In [72]:
# Collect only the saved HTML dumps, in sorted order. os.listdir returns
# entries in arbitrary order and may include non-HTML entries, which would
# make the row order of the scraped data (and the parse itself) unreliable.
files = sorted(
    name for name in os.listdir("data/dumps") if name.endswith(".html")
)

In [73]:
# Build one record per table row across every dump file listed in `files`.
data = []
for file in files:

    # Read raw bytes and let BeautifulSoup detect the document encoding
    # instead of relying on the platform's default text encoding. The
    # context manager closes the handle, which the bare open() never did.
    with open(f"data/dumps/{file}", "rb") as dump_file:
        soup = BeautifulSoup(dump_file.read(), 'html.parser')

    tbody = soup.find("tbody")
    if tbody is None:
        # Without a <tbody> there is nothing to extract; report the file and
        # continue rather than failing with AttributeError on None.
        print(f"skipping {file}: no <tbody> found")
        continue

    # The file name minus ".html" is attached to every row from this file.
    category = file.replace(".html","")

    for row in tbody.find_all('tr'):
        td = row.find_all("td")
        if len(td) < 4:
            # The four cells read below are required; shorter rows would
            # raise IndexError, so they are skipped.
            continue

        index = td[0].get_text()
        name = td[1].get_text()
        symbol = td[2].get_text()
        # Keep only the text before the first "%" sign.
        percentage = td[3].get_text().split("%")[0]

        record = dict(index=index, name=name, symbol=symbol, percentage=percentage, category=category)

        data.append(record)

In [74]:
import pandas as pd

In [75]:
# Turn the scraped records into a frame and persist it as CSV, leaving the
# positional index out of the file.
scraped_frame = pd.DataFrame(data)
scraped_frame.to_csv("data/stocks.csv", index=False)

In [76]:
# Load the CSV written above back into the working frame `df`.
stocks_csv_path = "data/stocks.csv"
df = pd.read_csv(stocks_csv_path)

In [77]:
# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,index,name,symbol,percentage,category
0,1,Unitedhealth Group Inc,UNH,8.902782,DIA
1,2,Microsoft Corp,MSFT,6.882032,DIA
2,3,Goldman Sachs Group Inc,GS,6.569872,DIA
3,4,Home Depot Inc,HD,6.194837,DIA
4,5,Caterpillar Inc,CAT,5.360532,DIA


In [78]:
# Number of rows in the frame.
df.shape[0]

734

In [79]:
# Count distinct category values; dropna=False keeps a missing value counted
# as one entry, matching len(unique()).
df["category"].nunique(dropna=False)

4

In [80]:
# Count distinct symbol values; dropna=False keeps a missing value counted
# as one entry, matching len(unique()).
df["symbol"].nunique(dropna=False)

519

In [81]:
import json

In [82]:
# Write the distinct symbols to JSON. The context manager flushes and closes
# the file deterministically; the bare open() handle was never closed.
with open('data/symbols.json', 'w') as symbols_file:
    json.dump(df['symbol'].unique().tolist(), symbols_file)

In [83]:
# Write the distinct categories to JSON. The context manager flushes and
# closes the file deterministically; the bare open() handle was never closed.
with open('data/mutual_funds.json', 'w') as funds_file:
    json.dump(df['category'].unique().tolist(), funds_file)

### Downloading data from Yahoo

Yahoo Finance has a publicly available REST-API with limited functionality.
The REST-API requires the following arguments to download data: "symbol", "period1", "period2", "interval".

period1 = 1641016800 (2022-01-01 06:00:00 UTC)
period2 = 1708495200 (2024-02-21 06:00:00 UTC)
interval = 1d 

Using `jq`, a command-line utility, I can create a script to download data from Yahoo's REST-API.

```bash 
go-stock-price-prediction -symbol $symbol -period1 $period1 -period2 $period2 -interval 1d
```



In [84]:
# Distinct category values in order of first appearance.
indexFunds = df['category'].drop_duplicates().tolist()

In [85]:
## Visualizations 

In [86]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Save one pie chart per category to data/plots/<category>-piechart.png.
# Ensure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs('data/plots', exist_ok=True)

for indexFund in indexFunds:
    fig = None
    try:
        # NOTE(review): a 10 x 500 inch canvas is extremely tall for a pie
        # chart; kept as in the original, but confirm it is intentional.
        fig, ax = plt.subplots(figsize=(10, 500))

        # Filter the frame once and reuse it for both the wedge sizes and labels.
        holdings = df[df['category']==indexFund]
        x = holdings['percentage'].tolist()
        labels = holdings['symbol'].tolist()

        ax.pie(x, labels=labels, labeldistance=1.1)

        ax.set_title(f"{indexFund} Index Fund made of {len(labels)} securities.")
        fig.savefig(f'data/plots/{indexFund}-piechart.png')
    except Exception as err:
        # Best effort: report the failure and carry on with the next category.
        print(f'error: {err}')
    finally:
        # Release the figure; otherwise every iteration leaves one open and
        # memory grows with the number of categories.
        if fig is not None:
            plt.close(fig)

In [None]:
# Preview the first five rows (the default for head()).
df.head(n=5)

In [None]:
# Pivot to one row per symbol and one column per category, holding the
# percentage values, then keep only rows with a value in every column.
percentage_by_symbol = df.pivot_table(index='symbol', columns='category', values='percentage')
matrix = percentage_by_symbol.dropna(axis='index', how='any')

In [None]:
# Display the pivoted symbol-by-category frame.
matrix

In [None]:
# Summary statistics for each column of the pivoted frame.
matrix.describe()

In [None]:
# Column labels of the pivoted frame as a plain list.
matrix.columns.to_list()

In [None]:
# Row labels of the pivoted frame as a plain list.
matrix.index.to_list()

In [None]:
# Column totals over the rows kept in the pivoted frame.
matrix.sum(axis=0)

In [None]:
# Total percentage per category over the full frame.
df.groupby('category')['percentage'].agg('sum')

In [None]:
# Per-category total over the full frame minus the total over the rows that
# survived the pivot's dropna.
df.groupby('category')['percentage'].sum().sub(matrix.sum())