In [17]:
!pip install lxml

Collecting lxml
  Downloading lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl.metadata (6.6 kB)
Downloading lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-6.0.0


In [16]:
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
from io import StringIO

In [17]:
os.makedirs("data/raw", exist_ok=True)


In [18]:
ticker = 'ULTRACEMCO.NS'
start_date = '2024-01-01'
end_date = '2024-12-31'

try:
    data = yf.download(ticker, interval="1d")
    data.reset_index(inplace=True)
except Exception as e:
    print("error fetching data:", e)
    data = pd.DataFrame()

if not data.empty:
    data["Date"] = pd.to_datetime(data["Date"])
    numeric_cols = ["Open", "High", "Low", "Close", "Volume"]
    data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors="coerce")

required_cols = ["Date", "Open", "High", "Low", "Close", "Volume"]
missing_cols = [c for c in required_cols if c not in data.columns]
na_counts = data[required_cols].isna().sum()

print("Missing columns:", missing_cols)
print("NA counts:\n", na_counts)
print("Shape:", data.shape)

  data = yf.download(ticker, interval="1d")
[*********************100%***********************]  1 of 1 completed

Missing columns: []
NA counts:
 Price   Ticker       
Date                     0
Open    ULTRACEMCO.NS    0
High    ULTRACEMCO.NS    0
Low     ULTRACEMCO.NS    0
Close   ULTRACEMCO.NS    0
Volume  ULTRACEMCO.NS    0
dtype: int64
Shape: (21, 6)





In [20]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
api_file = f"/Users/paramshah/Desktop/bootcamp/homework/homework4/data/raw/api_yfinance_ULTRACEMCO.NS_{timestamp}.csv"
data.to_csv(api_file, index=False)
print(f"saved api data to {api_file}")

saved api data to /Users/paramshah/Desktop/bootcamp/homework/homework4/data/raw/api_yfinance_ULTRACEMCO.NS_20250818-1629.csv


In [6]:
data.head()

Price,Date,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,ULTRACEMCO.NS,ULTRACEMCO.NS,ULTRACEMCO.NS,ULTRACEMCO.NS,ULTRACEMCO.NS
0,2025-07-18,12419.27832,12478.900396,12362.637349,12441.139748,98339
1,2025-07-21,12497.780273,12633.917341,12296.058925,12480.887353,512521
2,2025-07-22,12373.567383,12619.01158,12217.556294,12569.32652,533617
3,2025-07-23,12294.071289,12436.170561,12280.159472,12381.516995,189617
4,2025-07-24,12226.5,12320.901617,12152.966109,12294.071684,180080


In [7]:
url = "https://www.pmsaifworld.com/top-best-30-pmss-in-india-ranked-as-per-information-ratio/"
resp = requests.get(url)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

table = soup.find("table")
if not table:
    raise ValueError("No table found on the PMSAIFWorld page.")

data=[]

In [8]:
print(table)

<table class="table table-qrc" id="strategyTable" width="100%"> <thead>
<tr>
<th>Strategy</th>
<th>AUM</th>
<th>Relative Aplha (1Y)</th>
<th>% of +ve months (Fund Data)</th>
<th>Alpha (1Y)</th>
<th>Beta (1Y)</th>
<th>SD (1Y)</th>
<th>Sharpe Ratio (1Y)</th>
<th>Alpha (SI)</th>
<th>Info Ratio (SI)</th>
<th>Consistency Ratio</th>
</tr>
</thead><tbody><tr><td>Buoyant Capital Opportunities Multi-cap</td><td>5,435.66</td><td class="red">-0.5%</td><td>60.91</td><td>9.87%</td><td>0.93</td><td>14.90%</td><td>0.73</td><td>7.92%</td><td>2.51</td><td>61.82</td></tr><tr><td>Sameeksha Capital Equity Fund</td><td>1114</td><td>3.1%</td><td>58.93</td><td>13.47%</td><td>0.73</td><td>13.20%</td><td>1.10</td><td>8.30%</td><td>2.22</td><td>63.39</td></tr><tr><td>Green Lantern Capital LLP Growth Fund</td><td>1075</td><td>1.83%</td><td>54.35</td><td class="red">-3.50%</td><td>0.56</td><td>13.09%</td><td>0.00</td><td>9.57%</td><td>2.16</td><td>55.43</td></tr><tr><td>Stallion Asset Core Fund</td><td>5568.37</t

In [9]:
headers = [header.text for header in soup.find_all('th')]

for row in soup.find_all('tr')[1:]: #skip header row
    cols = row.find_all('td')
    data.append([col.text.strip() for col in cols])

df = pd.DataFrame(data, columns=headers)

numeric_columns = ['AUM', 'Relative Aplha (1Y)', '% of +ve months (Fund Data)', 'Alpha (1Y)', 'Beta (1Y)', 'SD (1Y)', 'Sharpe Ratio (1Y)', 'Alpha (SI)', 'Info Ratio (SI)', 'Consistency Ratio']

for col in numeric_columns:
    df[col] = df[col].str.replace('%', '').str.replace(',', '').replace('', pd.NA)
    df[col] = pd.to_numeric(df[col], errors='coerce') 

In [14]:
output_dir = '/Users/paramshah/Desktop/bootcamp/homework/homework4/data/raw'
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
output_file = os.path.join(output_dir, f"scrape_pmsaifworld.com_top30PMS_{timestamp}.csv")
df.to_csv(output_file, index=False)
print(f"CSV file has been created successfully at {output_file}.")

CSV file has been created successfully at /Users/paramshah/Desktop/bootcamp/homework/homework4/data/raw/scrape_pmsaifworld.com_top30PMS_20250818-1628.csv.
