# Divergence Analysis of PDF Yearbooks and Web Portal data

**PDF Yearbooks** -- streamflow data parsed with LlamaParse from the annual PDF yearbooks.\
**Web Portal** -- streamflow data parsed from the Web Portal with our [parser](./kazhydromet_parser/web_parser_hydro.py).

NOTE: Both data sets originate from KazHydroMet.

In [1]:
import pandas as pd
import os 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Input directories. The cells below expect both to hold CSV files with
# matching file names.

# Web Portal data (columns from this source get the `_khm` suffix).
path1 = "./streamflow_default_units/"

# PDF Yearbooks parsed by LlamaParse (columns get the `_llama` suffix).
path2 = "./new/streamflow/"

In [3]:
# For every LlamaParse file that also exists in the Web Portal directory,
# merge the two sources on `date` and save the rows whose discharge values
# disagree to new/analysis/<name>/non-matching_<name>.csv.

# List the Web Portal directory once; it does not change during the loop.
khm_files = set(os.listdir(path1))

for file in sorted(os.listdir(path2)):
    name = file.split('.')[0]

    if file not in khm_files:
        print("This is not in KazHydromet's website: " + file)
        continue

    df1 = pd.read_csv(os.path.join(path1, file))
    df2 = pd.read_csv(os.path.join(path2, file))

    # Outer merge keeps dates present in only one source; columns shared by
    # both frames receive the _khm / _llama suffixes.
    merged = df1.merge(df2, on=['date'], how='outer', suffixes=('_khm', '_llama'))

    output_dir = os.path.join('new/analysis', name)
    os.makedirs(output_dir, exist_ok=True)

    # Compare on numeric values so that a column read as text (e.g. "1.5")
    # still matches the float 1.5 from the other source. The coerced series
    # are used only for the comparison; the saved rows keep the raw values.
    khm_numeric = pd.to_numeric(merged['discharge_khm'], errors='coerce')
    llama_numeric = pd.to_numeric(merged['discharge_llama'], errors='coerce')
    same_value = khm_numeric == llama_numeric

    # NaN != NaN is True, so a plain `!=` would report dates that are empty
    # in BOTH sources as divergent. Those are treated as matching. Rows that
    # are missing on only one side are still reported.
    both_missing = merged['discharge_khm'].isna() & merged['discharge_llama'].isna()

    non_matching = merged[~(same_value | both_missing)]
    non_matching.to_csv(os.path.join(output_dir, f'non-matching_{name}.csv'), index=False)

This is not in KazHydromet's website: 11242.csv
This is not in KazHydromet's website: 19013.csv
This is not in KazHydromet's website: 77895.csv


In [4]:
# For every file present in both sources, draw the two discharge series on
# one figure and save it to new/analysis/<name>/plot_<name>.png.

# List the Web Portal directory once; it does not change during the loop.
khm_files = set(os.listdir(path1))

for file in sorted(os.listdir(path2)):
    name = file.split('.')[0]

    if file not in khm_files:
        print("This is not in KazHydromet's website: " + file)
        continue

    df1 = pd.read_csv(os.path.join(path1, file))
    df2 = pd.read_csv(os.path.join(path2, file))

    # Outer merge aligns both series on the union of dates.
    merged = df1.merge(df2, on=['date'], how='outer', suffixes=('_khm', '_llama'))
    # Non-numeric entries become NaN and show up as gaps in the lines.
    merged['discharge_llama'] = pd.to_numeric(merged['discharge_llama'], errors='coerce')
    merged['discharge_khm'] = pd.to_numeric(merged['discharge_khm'], errors='coerce')

    output_dir = os.path.join('new/analysis', name)
    os.makedirs(output_dir, exist_ok=True)

    # Plot against real dates so the 'Date' axis label is truthful; the series
    # were previously drawn against the merged frame's integer row index.
    # NOTE(review): assumes `date` parses unambiguously (e.g. ISO YYYY-MM-DD) -- confirm.
    dates = pd.to_datetime(merged['date'], errors='coerce')
    if dates.notna().any():
        merged = merged.assign(date=dates).sort_values('date')
        x_values = merged['date']
    else:
        # No date could be parsed: fall back to row order, as before.
        x_values = merged.index

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(x_values, merged['discharge_llama'], label='llamaparse', linestyle='--', marker='d')
    ax.plot(x_values, merged['discharge_khm'], label='khm', linestyle='-', marker='d')
    ax.set_title('Llamaparse and Khm comparison')
    ax.set_xlabel('Date')
    ax.set_ylabel('Discharge Values')
    ax.legend()
    ax.grid(True)

    fig.savefig(os.path.join(output_dir, f'plot_{name}.png'))
    # Close explicitly so figures do not accumulate in memory across files.
    plt.close(fig)

This is not in KazHydromet's website: 11242.csv
This is not in KazHydromet's website: 19013.csv
This is not in KazHydromet's website: 77895.csv


In [5]:
# For every file present in both sources, scatter the LlamaParse discharge
# against the Web Portal discharge and save the figure to
# new/analysis/<name>/scatterplot_<name>.png.

# List the Web Portal directory once; it does not change during the loop.
khm_files = set(os.listdir(path1))

for file in sorted(os.listdir(path2)):
    name = file.split('.')[0]

    if file not in khm_files:
        print("This is not in KazHydromet's website: " + file)
        continue

    df1 = pd.read_csv(os.path.join(path1, file))
    df2 = pd.read_csv(os.path.join(path2, file))

    # Outer merge puts both discharge columns on the same rows (one per date),
    # which is what the scatterplot needs.
    merged = df1.merge(df2, on=['date'], how='outer', suffixes=('_khm', '_llama'))
    # Non-numeric entries become NaN.
    merged['discharge_llama'] = pd.to_numeric(merged['discharge_llama'], errors='coerce')
    merged['discharge_khm'] = pd.to_numeric(merged['discharge_khm'], errors='coerce')

    output_dir = os.path.join('new/analysis', name)
    os.makedirs(output_dir, exist_ok=True)

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots()
    ax.scatter(merged['discharge_llama'], merged['discharge_khm'])
    ax.set_xlabel('LlamaParse result')
    ax.set_ylabel('KazHydroMet')
    ax.set_title('Scatterplot for ' + name)

    fig.savefig(os.path.join(output_dir, f'scatterplot_{name}.png'))
    # Close explicitly so figures do not accumulate in memory across files.
    plt.close(fig)

This is not in KazHydromet's website: 11242.csv
This is not in KazHydromet's website: 19013.csv
This is not in KazHydromet's website: 77895.csv


In [7]:
# For every file present in both sources, save the rows where BOTH sources
# have a numeric discharge value and the two values differ, to
# new/divergence_no_nans/no_nans_<name>.csv.

# Loop-invariant setup: list the Web Portal directory and create the single
# shared output directory once instead of on every iteration.
khm_files = set(os.listdir(path1))
output_dir = 'new/divergence_no_nans/'
os.makedirs(output_dir, exist_ok=True)

for file in sorted(os.listdir(path2)):
    name = file.split('.')[0]

    if file not in khm_files:
        print("This is not in KazHydromet's website: " + file)
        continue

    df1 = pd.read_csv(os.path.join(path1, file))
    df2 = pd.read_csv(os.path.join(path2, file))

    # Outer merge aligns both sources on the union of dates.
    merged = df1.merge(df2, on=['date'], how='outer', suffixes=('_khm', '_llama'))
    # Non-numeric entries become NaN and are excluded by the filter below.
    merged['discharge_llama'] = pd.to_numeric(merged['discharge_llama'], errors='coerce')
    merged['discharge_khm'] = pd.to_numeric(merged['discharge_khm'], errors='coerce')

    # Keep only rows with a value on both sides that disagree.
    has_both = merged['discharge_khm'].notna() & merged['discharge_llama'].notna()
    differs = merged['discharge_khm'] != merged['discharge_llama']
    non_matching = merged[has_both & differs]

    # os.path.join avoids the double slash that `output_dir + '/...'` produced.
    non_matching.to_csv(os.path.join(output_dir, f'no_nans_{name}.csv'), index=False)

This is not in KazHydromet's website: 11242.csv
This is not in KazHydromet's website: 19013.csv
This is not in KazHydromet's website: 77895.csv
