Disclaimer: The copyright of data from www.tanarang.com belongs to Tanarang.
This is being used purely for academic, non-commercial purposes.

In [1]:
import os
import sys
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import dill

from dataclasses import dataclass
from enum import Enum
from typing import List, Dict

In [2]:
# Make the parent of the current working directory importable, so that
# packages living one level above this notebook can be imported below.
# Note: os.path.abspath("") resolves to the working directory itself, so
# SCRIPT_DIR ends up being its *parent*.
_notebook_dir = os.path.abspath("")
SCRIPT_DIR = os.path.dirname(_notebook_dir)
sys.path.append(SCRIPT_DIR)

In [3]:
import mogra
from conversion_utils import TanarangParsedRaag
from mogra.datatypes import SSwar

In [None]:
# User-Agent header value sent with every request made by this notebook.
ua = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

## Scrape List of Raags

In [None]:
# Download the raag index page and parse it into `index_soup`.
index_url = "https://www.tanarang.com/english/raagIndex_eng.htm"
resp = requests.get(index_url, headers={"User-Agent": ua})

# Fail loudly on anything other than HTTP 200. Silently skipping the parse
# would leave `index_soup` undefined (or stale from an earlier run of this
# cell), and the failure would only surface later in an unrelated cell.
if resp.status_code != 200:
    raise RuntimeError(
        f"Could not fetch raag index (HTTP {resp.status_code}): {index_url}"
    )
index_soup = BeautifulSoup(resp.text, "html.parser")

In [None]:
# Find the first table on the index page
table = index_soup.find("table")
if table is None:
    raise ValueError("No <table> element found on the raag index page")

# Two parallel lists: raag_names[i] is the link text, refs[i] its href
raag_names = []
refs = []

# Walk every cell of every row and collect the (name, link) of its anchor
for row in table.find_all('tr'):
    for td in row.find_all('td'):
        a_tag = td.find('a')
        # Cells without a link (e.g. padding cells) have no anchor, and an
        # anchor without an href cannot be scraped later; skip both instead
        # of crashing on None or recording an unusable entry.
        if a_tag is None or not a_tag.get('href'):
            continue
        raag_names.append(a_tag.text.strip())
        refs.append(a_tag.get('href'))

In [None]:
# Cache the scraped (name, link) pairs so later sections can run offline.
# newline="" is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" end up with blank rows in the file.
with open("raaglist.csv", "w", newline="") as fp:
    wr = csv.writer(fp, quoting=csv.QUOTE_ALL)
    wr.writerows(zip(raag_names, refs))

## Scrape List of Raags [Read Copy]

In [4]:
# Reload the cached (name, link) pairs from raaglist.csv into two parallel lists.
raag_names = []
refs = []
# newline="" lets the csv module do its own line-ending handling.
with open("raaglist.csv", "r", newline="") as fp:
    reader = csv.reader(fp, delimiter=",")
    for line in reader:
        # A file written without newline="" on Windows contains empty rows
        # between records; skip them rather than failing on line[0].
        if not line:
            continue
        raag_names.append(line[0])
        refs.append(line[1])

## Scrape Raag Infos

In [None]:
# with open("bhoop_soup.html", "w", encoding="utf-8") as fp:
#     fp.write(str(soup))
# with open("bhoop_soup.html", "r") as fp:
#     bhoop_soup = BeautifulSoup(fp.read())

In [None]:
def infotable_from_soup(raag_soup) -> pd.DataFrame:
    """Extract the first HTML table of a raag page as a two-column DataFrame.

    Parameters
    ----------
    raag_soup
        Parsed page. Only ``find("table")`` is called on it; the returned
        table must provide ``find_all('tr')`` and each row
        ``find_all(['td', 'th'])``, with cells exposing ``.text``.

    Returns
    -------
    pd.DataFrame
        One row per ``<tr>`` of the table, in page order, with the columns
        ``info_type`` and ``info`` holding the whitespace-stripped cell text.

    Raises
    ------
    ValueError
        If the page contains no ``<table>`` element.
    """
    table = raag_soup.find("table")
    if table is None:
        # Name the real problem instead of failing on `None.find_all` below.
        raise ValueError("raag page contains no <table> element")

    # Every row is kept (no header row is dropped), so row positions in the
    # DataFrame match row positions in the page. Both 'td' and 'th' cells are
    # collected in case 'th' is used inside the table body.
    data = [
        [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]

    return pd.DataFrame(data, columns=["info_type", "info"])

In [None]:
# Download every raag page and cache its info table as infotables/<name>.pkl.
# Make sure the output directory exists before the first to_pickle call.
os.makedirs("infotables", exist_ok=True)

for name, ref in zip(raag_names, refs):
    url = f"https://www.tanarang.com/english/{ref}"
    resp = requests.get(url, headers={"User-Agent": ua})
    if resp.status_code != 200:
        # Do not fall through: `raag_soup` would still hold the previous
        # raag's page (or be undefined on the first iteration), and that
        # table would be saved under this raag's name.
        print(f"Skipping {name}: HTTP {resp.status_code} for {url}")
        continue
    raag_soup = BeautifulSoup(resp.text, "html.parser")
    df = infotable_from_soup(raag_soup)
    df.to_pickle(f"infotables/{name}.pkl")

## Scrape Raag Info [Read Copy]

In [None]:
# Reload each cached info table and print it beneath its raag name.
for name in raag_names:
    df = pd.read_pickle(f"infotables/{name}.pkl")
    print(name, df, sep="\n")

## Deconstruct Info Table

In [None]:
# Parse every cached info table (in sorted name order) and stream the parsed
# objects into raags_temp.pkl, one dill record per raag.
#
# The file is opened with "wb", not "ab": this loop always restarts from the
# first raag, so appending would stack a second copy of the records behind the
# first on every re-run, and a reader that pairs records with sorted names by
# position would then attach the wrong object to a name.
with open("raags_temp.pkl", "wb") as fp:
    for name in sorted(raag_names):
        df = pd.read_pickle(f"infotables/{name}.pkl")
        try:
            parsed_raag = TanarangParsedRaag(df, name, verbose=False)
        except Exception as exc:
            # Stop at the first failure; `name` and `df` stay in scope so the
            # offending table can be inspected and edited by hand.
            print("PROBLEM at", name, "->", repr(exc))
            break

        dill.dump(parsed_raag, fp)

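If parsing stops with a "PROBLEM at ..." message, manually edit the `df` of the failing raag (see the commented example below), save it, and re-run the parsing cell above.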

In [58]:
# df.loc[5][1] = "R P N S'- S' P R"

In [None]:
# # Save and Reload
# df.to_pickle(f"infotables/{name}.pkl")
# df = pd.read_pickle(f"infotables/{name}.pkl")

## Manual Edits

In [None]:
# TODO:
# Get alt names + alt spellings + devanagari

In [None]:
# TODO: some additions
# Abhogi
# Amritavarshini
# Husseini Kanada
# Din Ki Puriya
# Marukauns
# Shuddha Baradi
# Mangal Bhairav
# Shobhawari
# Sundarkali
# Tilang Bahar

In [49]:
# Rebuild the name -> parsed raag mapping from raags_temp.pkl.
# Records are matched to names purely by position: the i-th record in the
# file is stored under the i-th name of the sorted name list.
raag_db = dict()
ordered_names = sorted(raag_names)
with open("raags_temp.pkl", "rb") as temp_file:
    for raag_name in ordered_names:
        raag_db[raag_name] = dill.load(temp_file)

In [50]:
# Remove four entries from the database entirely.
for dropped_name in ("Shobhawari", "Suha Sughrai", "Sundarkali", "Tilang Bahar"):
    del raag_db[dropped_name]

# Hand-written overrides of individual attributes of parsed raags.
raag_db["Basant"].vaadi = SSwar("`", "S")

sundarkauns = raag_db["Sundarkauns"]
sundarkauns.vaadi = SSwar("", "m")
sundarkauns.samvaadi = SSwar("", "S")
sundarkauns.prahar = "night 2nd"
sundarkauns.thaat = "Not Defined"

yaman = raag_db["Yaman"]
yaman.aaroha = TanarangParsedRaag.string_to_swars(",N R G M D N S'")
yaman.avaroha = TanarangParsedRaag.string_to_swars("S' N D P M G R S ,N R S")
yaman.vaadi = SSwar("", "G")
yaman.samvaadi = SSwar("", "N")

In [51]:
# Write the edited database to raags_new.pkl as a stream of dill records.
# Each object's `df` and `verbose` attributes are cleared (set to None)
# before it is dumped.
with open("raags_new.pkl", "wb") as out_file:
    for record in raag_db.values():
        record.df, record.verbose = None, None
        dill.dump(record, out_file)

## Reading the DB

In [52]:
import dill

In [156]:
# Read dill records from raags_new.pkl until the stream is exhausted and
# index each one by its own `name` attribute.
with open("raags_new.pkl", "rb") as fp:
    raag_db = {}
    while True:
        try:
            rd = dill.load(fp)
        except EOFError:
            # dill.load signals the end of the record stream with EOFError.
            break
        else:
            raag_db[rd.name] = rd

In [None]:
# Spot check: show every attribute stored on the Yaman record.
vars(raag_db["Yaman"])

Converting the db to a portable pickle

In [158]:
def make_str(ss):
    """Return the string form of ``ss`` as a plain built-in ``str``.

    ``ss.__str__()`` is tried first; if that call raises for any reason
    (for example when ``ss`` is a class rather than an instance), the result
    of ``str(ss)`` is used instead. The result must be exactly ``str`` (not a
    subclass), otherwise an ``AssertionError`` is raised.
    """
    try:
        rendered = ss.__str__()
    except BaseException:
        rendered = str(ss)
    assert type(rendered) == str
    return rendered

# Build a name -> attribute-dict mapping in which the swar-valued fields are
# replaced by their string forms (via make_str).
raag_dict_s = {}
for name, rd in raag_db.items():
    # Work on a shallow copy. Assigning into `rd.__dict__` itself would
    # overwrite the attributes of the objects held in `raag_db`, silently
    # turning their swar fields into strings as a side effect of this cell.
    rd_dict = dict(rd.__dict__)

    # Flat sequences of swars
    for key in ("aaroha", "avaroha", "aarohi_nyas", "avarohi_nyas"):
        rd_dict[key] = [make_str(ss) for ss in rd_dict[key]]

    # Sequence of sequences of swars
    rd_dict["mukhyanga"] = [[make_str(ss) for ss in mm] for mm in rd_dict["mukhyanga"]]

    # Single swars
    rd_dict["vaadi"] = make_str(rd_dict["vaadi"])
    rd_dict["samvaadi"] = make_str(rd_dict["samvaadi"])

    raag_dict_s[name] = rd_dict

In [None]:
# Spot check: display the converted attribute dict stored for Yaman.
raag_dict_s["Yaman"]

In [160]:
import pickle

# Write the converted dictionary with the standard pickle module.
# The `with` block guarantees the file is flushed and closed; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
with open("raag_dicts_2.pkl", "wb") as fp:
    pickle.dump(raag_dict_s, fp)

In [None]:
# Spot check: show the type of the first avaroha element of the converted
# Yaman record (presumably `str` once the conversion cell has run).
type(raag_dict_s["Yaman"]["avaroha"][0])