In [1]:
import json
import re
import pandas as pd
import os
import asyncio
import aiohttp
from tqdm import tqdm

In [2]:
print(os.getcwd())

D:\Projects\extract-mounts


In [3]:
async def get_json_from_javascript(text: str):
    var_graph = re.search(r"(?:^|\s|;)var\s+graph\s*=\s*([^']+})", text)
    string_graph = var_graph.group(1)
    json_graph = json.loads(string_graph)
    return json_graph

In [4]:
async def get_so2_values(graph_json):
    so2_values = {
        'Date time': graph_json['data'][2]['x'],
        'Value': graph_json['data'][2]['y'],
        'Graph': graph_json['data'][2]['text'],
    }
    so2_df = pd.DataFrame.from_dict(so2_values)
    so2_df['Type'] = 'SO2'
    return so2_df

In [5]:
async def get_thermal_values(graph_json):
    thermal_values = {
        'Date time': graph_json['data'][0]['x'],
        'Value': graph_json['data'][0]['y'],
        'Graph': graph_json['data'][0]['text'],
    }
    thermal_df = pd.DataFrame.from_dict(thermal_values)
    thermal_df['Type'] = 'Thermal'
    return thermal_df

In [6]:
def convert_to_date(date_time):
    return date_time.strftime("%Y-%m-%d")

In [7]:
def convert_to_time(date_time):
    return date_time.strftime("%H:%M:%S")

In [8]:
async def run(text: str, volcano_code: str, volcano_name: str, json_file: str, 
              filter_value: float|int = 0) -> (pd.DataFrame, str, str) :
    
    graph_json = await get_json_from_javascript(text)

    with open(json_file, "w") as write_file:
        json.dump(graph_json['data'], write_file, indent=2)
        
    so2 = await get_so2_values(graph_json)
    thermal = await get_thermal_values(graph_json)
    
    df = pd.concat([
        so2,
        thermal
    ])
    
    df['Date time'] = pd.to_datetime(df['Date time'])
    df['Date'] = df['Date time'].apply(convert_to_date)
    df['Time'] = df['Date time'].apply(convert_to_time)
    df['Code'] = volcano_code
    df['Volcano Name'] = volcano_name
    df.set_index('Date time', inplace=True)
    
    filtered_df = df[df['Value'] > filter_value]      
    return filtered_df, volcano_name, volcano_code

In [9]:
async def fetch(session, url):
    async with session.get(url) as response:
        assert response.status == 200
        return await response.text()

In [10]:
async def main() -> dict[str, pd.DataFrame]:
    print('🏃‍ Extracting....')
    print('==================')
    
    mounts_url: str = 'http://mounts-project.com/timeseries/'
    filter_value: float = 0.1
    
    output_directory = os.path.join(os.getcwd(), 'output')
    os.makedirs(output_directory, exist_ok=True)
    
    json_dir = os.path.join(output_directory, 'json')
    os.makedirs(json_dir, exist_ok = True)
    
    print(json_dir)
    
    volcanoes = [
    {
        "name" : "Lewotobi Laki-laki",
        "code" : 264180,
    },
    {
        "name" : "Marapi",
        "code" : 261140,
    },
    {
        "name" : "Anak Krakatau",
        "code" : 262000,
    },
    {
        "name" : "Kerinci",
        "code" : 261170,
    },
    {
        "name" : "Karangetang",
        "code" : 267020,
    },
    {
        "name" : "Dukono",
        "code" : 268010,
    },
    {
        "name" : "Ili Lewotolok",
        "code" : 264230,
    },
    {
        "name" : "Ibu",
        "code" : 268030,
    },
    {
        "name" : "Semeru",
        "code" : 263300,
    },
    {
        "name" : "Raung",
        "code" : 263340,
    },
    {
        "name" : "Ijen",
        "code" : 263350,
    },
    {
        "name" : "Slamet",
        "code" : 263180
    }
]
    
    dfs: dict[str, pd.DataFrame] = {}
    
    async with aiohttp.ClientSession() as session:
        tasks = set()
        for volcano in volcanoes:
            response_text = await fetch(session,'{}{}'.format(mounts_url, volcano['code']))
            json_file = os.path.join(json_dir, '{}.json'.format(volcano['code']))
            
            task = asyncio.create_task(run(
                text = response_text,
                volcano_code = volcano['code'],
                volcano_name = volcano['name'],
                json_file = json_file,
                filter_value = filter_value)
            )
            tasks.add(task)
            
        for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            df, name, code = await t
            dfs['{} - {}'.format(name, code)] = df
            print('👌 {} Extracted!'.format(name))
    
    return dfs

In [11]:
dataframes = await main()

🏃‍ Extracting....
D:\Projects\extract-mounts\output\json


100%|██████████| 12/12 [00:00<00:00, 1560.48it/s]

👌 Kerinci Extracted!
👌 Ibu Extracted!
👌 Ijen Extracted!
👌 Ili Lewotolok Extracted!
👌 Raung Extracted!
👌 Lewotobi Laki-laki Extracted!
👌 Marapi Extracted!
👌 Karangetang Extracted!
👌 Semeru Extracted!
👌 Dukono Extracted!
👌 Anak Krakatau Extracted!
👌 Slamet Extracted!





# Save to excel and CSV

In [12]:
dataframes.keys()

dict_keys(['Kerinci - 261170', 'Ibu - 268030', 'Ijen - 263350', 'Ili Lewotolok - 264230', 'Raung - 263340', 'Lewotobi Laki-laki - 264180', 'Marapi - 261140', 'Karangetang - 267020', 'Semeru - 263300', 'Dukono - 268010', 'Anak Krakatau - 262000', 'Slamet - 263180'])

In [13]:
async def export_to_excel(excel_directory: str, csv_directory: str, 
                          df: pd.DataFrame, filename: str) -> (str, str, str, any):
    path_excel = os.path.join(excel_directory, '{}.xlsx'.format(filename))
    path_csv = os.path.join(csv_directory, '{}.csv'.format(filename))
    df.to_csv(path_csv)
    df.to_excel(path_excel, sheet_name='Join Data')
    return filename, path_excel, path_csv, df.index.max()

In [14]:
async def save(dfs: dict[str, pd.DataFrame], concated_dfs: list[pd.DataFrame] = None, 
               df_csv: pd.DataFrame = None):
    if concated_dfs is None:
        concated_dfs: list[pd.DataFrame] = []
        
    if df_csv is None:
        df_csv = pd.DataFrame()
    
    output_directory = os.path.join(os.getcwd(), 'output')
    
    excel_directory = os.path.join(output_directory, 'excel')
    os.makedirs(excel_directory, exist_ok=True)
    
    csv_directory = os.path.join(output_directory, 'csv')
    os.makedirs(csv_directory, exist_ok=True)
    
    tasks = set()
    for filename, df in dfs.items():
        concated_dfs.append(df)
        
        task = asyncio.create_task(export_to_excel(
                excel_directory = excel_directory,
                csv_directory =  csv_directory,
                filename = filename,
                df = df)
        )
        tasks.add(task)
        
    for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        filename, excel_file, csv_file, last_update = await t
        print('💾 {} saved to: {}'.format(filename, excel_file))
        
        df_csv = pd.concat([
            df_csv, pd.DataFrame([
                {
                    "code" : filename.split(' - ')[1],
                    "volcano_name" : filename.split(' - ')[0],
                    "filename" : excel_file,
                    "csv": csv_file,
                    "updated_at" : last_update
                }]
        )], ignore_index=True)
    
    all_volcano_excel = os.path.join(excel_directory, 'All Volcano.xlsx')
    merged = pd.concat(concated_dfs)
    merged.to_excel(all_volcano_excel, sheet_name='Join Data')
    print('💾 All Volcano saved into: {}'.format(all_volcano_excel))
    
    df_csv.to_csv('output.csv', index=False)

In [15]:
saved = await save(dataframes)

100%|██████████| 12/12 [00:01<00:00, 11.01it/s]

💾 Kerinci - 261170 saved to: D:\Projects\extract-mounts\output\excel\Kerinci - 261170.xlsx
💾 Ibu - 268030 saved to: D:\Projects\extract-mounts\output\excel\Ibu - 268030.xlsx
💾 Ijen - 263350 saved to: D:\Projects\extract-mounts\output\excel\Ijen - 263350.xlsx
💾 Ili Lewotolok - 264230 saved to: D:\Projects\extract-mounts\output\excel\Ili Lewotolok - 264230.xlsx
💾 Raung - 263340 saved to: D:\Projects\extract-mounts\output\excel\Raung - 263340.xlsx
💾 Lewotobi Laki-laki - 264180 saved to: D:\Projects\extract-mounts\output\excel\Lewotobi Laki-laki - 264180.xlsx
💾 Marapi - 261140 saved to: D:\Projects\extract-mounts\output\excel\Marapi - 261140.xlsx
💾 Karangetang - 267020 saved to: D:\Projects\extract-mounts\output\excel\Karangetang - 267020.xlsx
💾 Semeru - 263300 saved to: D:\Projects\extract-mounts\output\excel\Semeru - 263300.xlsx
💾 Dukono - 268010 saved to: D:\Projects\extract-mounts\output\excel\Dukono - 268010.xlsx
💾 Anak Krakatau - 262000 saved to: D:\Projects\extract-mounts\output\exc




💾 All Volcano saved into: D:\Projects\extract-mounts\output\excel\All Volcano.xlsx
