# Part I: Importing Libraries and retrieval/download of files from GovCanada webpage

* *Part I running time: ~20 mins*

##  Import libraries

In [None]:
# Import timeit, time libraries and start clock 
import timeit
start_time = timeit.default_timer()

# datetime timer
import datetime
import sys

# Data analysis 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Web scraping and file retrieval
import os
import glob
import urllib.request
import json
import re
import time
import datetime
import wget

# Database processing
import sqlite3
import dask.dataframe as dd

# Visualization and geo-data imports
import geopandas as gpd
import chart_studio.plotly as py
import plotly.graph_objects as go
#import plotly.edf_masterpress as pdf_master

# Offline mode
# NOTE: this import rebinds the `py` alias (bound to chart_studio.plotly by the
# import a few lines above) to plotly.offline.
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Hyperlink / Web display
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Formatting options: three decimals with thousands separators for pandas
# floats, two decimals when printing numpy arrays
pd.options.display.float_format = '{:,.3f}'.format
np.set_printoptions(precision=2)

print('Library imports complete!')

## Download Hydrometric Station Location data from Hydat.sqlite3 database

* *Select STATIONS where HYD_STATUS = A (Active)*
* *Ignore STATIONS where HYD_STATUS = D (Deactivated/Inactive)*
* *Export the database table to a .csv file*
* *Inspect the resulting DataFrame to confirm the query worked!*
* *Close the connection*

In [None]:
os.chdir(r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/sqlite3')

# Create connection to Hydat database
con = sqlite3.connect("Hydat.sqlite3")
try:
    # Load only the active stations. The status is bound as a query parameter:
    # in SQLite a double-quoted "A" is an identifier, not a string literal.
    df_stations = pd.read_sql_query(
        'SELECT * FROM STATIONS WHERE HYD_STATUS = ?', con, params=('A',))  # A = active stations
finally:
    # Close the connection even when the query fails
    con.close()

# Rename three columns by position so the rest of the notebook can rely on the
# short names (assumes the STATIONS table keeps its current column order —
# TODO confirm against the HYDAT schema when the database is updated)
positional_names = {0: "ID", 2: "PROV_TERR", 8: "DRAINAGE_AREA_GROSS_KM2"}
df_stations.rename(
    columns={df_stations.columns[pos]: name for pos, name in positional_names.items()},
    inplace=True)

# Province/territory code followed by the station ID
df_stations['PROV_TERR_ID'] = df_stations['PROV_TERR'] + df_stations['ID']

# Drop extra columns
df_stations.drop(labels=['DRAINAGE_AREA_EFFECT', 'DATUM_ID', 'SED_STATUS'], inplace=True, axis=1)

# info() prints its report itself and returns None, so it is not wrapped in print()
df_stations.info()

df_stations.to_csv('HYDAT.csv', encoding='utf-8')

## Download multiple .csv files from Hydrometric .url directory using wget

* *Hyperlink:* [https://dd.weather.gc.ca/hydrometric/csv/](https://dd.weather.gc.ca/hydrometric/csv/)

* *Downloading time: ~4 mins*

* *All provincial files for download:*

    "AB_daily_hydrometric.csv, BC_daily_hydrometric.csv, SK_daily_hydrometric.csv, /
    MB_daily_hydrometric.csv, ON_daily_hydrometric.csv, QC_daily_hydrometric.csv, /
    NB_daily_hydrometric.csv, NS_daily_hydrometric.csv, PE_daily_hydrometric.csv, /
    NL_daily_hydrometric.csv, NT_daily_hydrometric.csv, NU_daily_hydrometric.csv, /
    YT_daily_hydrometric.csv"

In [None]:
# Work from the CanadaDaily folder so the downloaded files land there
os.chdir(r"C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily")

# Download hydrometric daily sub directories fast:
# The leading "!" is an IPython/Jupyter shell escape, so this line runs the
# wget command-line tool rather than the `wget` Python module.
# The --accept list limits the recursive download to the thirteen
# <PROV>_daily_hydrometric.csv files and --reject skips the index pages.
# NOTE(review): --no-check-certificate turns off TLS certificate
# verification — confirm it is still required for dd.weather.gc.ca.
!wget --wait=2 -r -np -nH -nd -e robots=off --cut-dirs=3 --reject "index.html*" --accept "AB_daily_hydrometric.csv, BC_daily_hydrometric.csv, SK_daily_hydrometric.csv, MB_daily_hydrometric.csv, ON_daily_hydrometric.csv, QC_daily_hydrometric.csv, NB_daily_hydrometric.csv, NS_daily_hydrometric.csv, PE_daily_hydrometric.csv, NL_daily_hydrometric.csv, NT_daily_hydrometric.csv, NU_daily_hydrometric.csv, YT_daily_hydrometric.csv" https://dd.weather.gc.ca/hydrometric/csv/ --no-check-certificate

# Template for file downloads
# (the triple-quoted string below is a bare literal; it has no effect at run time)
"""
"AB_daily_hydrometric.csv, BC_daily_hydrometric.csv, SK_daily_hydrometric.csv, /
MB_daily_hydrometric.csv, ON_daily_hydrometric.csv, QC_daily_hydrometric.csv, /
NB_daily_hydrometric.csv, NS_daily_hydrometric.csv, PE_daily_hydrometric.csv, /
NL_daily_hydrometric.csv, NT_daily_hydrometric.csv, NU_daily_hydrometric.csv, /
YT_daily_hydrometric.csv"

"""
print("Completed downloads!")

# Part II: Concatenate (Combine) all provincial DAILY .csv files into one DAILY .csv file

* *Runtime ~4 mins*

## Print all .csv files in CanadaDaily directory

In [None]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Web scraping and file retrieval
import os
import glob

# Work from the CanadaDaily download folder so relative file names resolve
os.chdir(r"C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily")

# Collect every file in the folder whose name ends with the .csv extension
ext = '.csv'
csv_pattern = '*' + ext
fnames = list(glob.glob(csv_pattern))
print(fnames)
print('\n')

## Concatenate all provincial .csv files together and clean up

In [None]:
# Combine all provincial files into one national DataFrame
print('Concatenating all provincial daily hydrometric files into national file..\n')

# Source columns to load and the dtype used for each.
# NOTE: the " ID" key keeps its leading space (presumably this mirrors the
# header of the downloaded files — confirm before changing it).
dtypes = {" ID": str,
          "Date": str,
          "Water Level / Niveau d'eau (m)": np.float32,
          "Discharge / Débit (cms)": np.float32}

cols = list(dtypes.keys())

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not fnames:
    raise ValueError('No provincial .csv files found to concatenate.')

df_daily = pd.concat((pd.read_csv(f, usecols=cols, dtype=dtypes,
                                  sep=',', low_memory=False) for f in fnames))

# Cleanup new file: rename by source name rather than by position, so the
# mapping cannot silently shift if the column order ever changes
new_names = ["ID", "DATE", "WATER_LEVEL_M", "FLOWRATE_CMS"]
df_daily.rename(columns=dict(zip(cols, new_names)), inplace=True)

# Fill N/A values in the text columns only; the measurement columns stay
# numeric so they are not turned into object (string) columns
text_cols = ["ID", "DATE"]
df_daily[text_cols] = df_daily[text_cols].fillna('')

# Merge the DAILY DataFrame with the HYDRO STATION DATABASE DataFrame
df_daily = pd.merge(df_daily, df_stations, on='ID')

# Coerce the numeric columns one by one; anything unparseable becomes NaN
cols = ["WATER_LEVEL_M", "FLOWRATE_CMS", "LATITUDE", "LONGITUDE", "DRAINAGE_AREA_GROSS_KM2"]
df_daily[cols] = df_daily[cols].apply(pd.to_numeric, errors='coerce', downcast='float')

df_daily['STATION'] = df_daily['PROV_TERR_ID'] + "_" + df_daily['STATION_NAME']

# STATION is later used as a file name, so replace blanks and both path
# separators with "_". regex=False makes every pattern a literal on any
# pandas version (a lone backslash is not a valid regular expression).
for unsafe in (' ', '/', '\\'):
    df_daily['STATION'] = df_daily['STATION'].str.replace(unsafe, '_', regex=False)

# Drop unneeded columns
df_daily.drop(columns=['PROV_TERR_ID', 'REGIONAL_OFFICE_ID', 'HYD_STATUS', 'STATION_NAME'], inplace=True)

# Summarise the merged DataFrame (columns, dtypes, memory usage)
df_daily.info()

# Part III: Create and Clean Master DataFrame

## Create or Clean Master DataFrame (CAN_daily_hydrometric_master.csv)

- Read in the MASTER df from the existing .parquet file (pandas with the pyarrow engine)
- Concat both DataFrames on the DATE column. This adds the newest dates to the MASTER df.
- Convert all numeric values to float32 and convert DATE column to datetime64[ns, utc] format.
- Compare the lengths of the DAILY df and the MASTER df
- Export the new MASTER df to .csv and .parquet, overwriting the old MASTER files

**Notes:**
     
* *The .csv file will continue to grow and is easiest to process using Dask for in-memory processing
  as processing memory above 16GB can be selected*

* *To save time and money, multiple VMs (Virtual Machine) such as Amazon EC2 or Microsoft Azure VM
  are the fastest method to plug into big computing power. Only consider this for very large real-time 
  datasets as it becomes pricey depending on the size of the datasets (GB, TB, PB, etc.)*
 
* *Runtime ~ 6 mins*

In [None]:
import os.path

# Folder and file names of the persisted master dataset
master_dir = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily/og'
master_csv_file = r'CAN_daily_hydrometric_master.csv'
master_parq_file = r'CAN_daily_hydrometric_master.parquet'

# Numeric columns in which a literal 'NaN' string is replaced by np.nan
float_cols = ['WATER_LEVEL_M',
              'FLOWRATE_CMS',
              'LATITUDE',
              'LONGITUDE',
              'DRAINAGE_AREA_GROSS_KM2']

# If Master file exists in filepath..
if os.path.isfile(os.path.join(master_dir, master_parq_file)):

    print("File exists! Merge Daily DataFrame with Master DataFrame..")

    # Load the existing master dataset and apply the same cleaning as the daily data
    df_master = pd.read_parquet(os.path.join(master_dir, master_parq_file), engine='pyarrow')

    # Replace all 'NaN' strings in the numeric columns with np.nan
    df_master[float_cols] = df_master[float_cols].replace({'NaN': np.nan})

    # Drop all rows containing a null
    df_master.dropna(axis=0, inplace=True)

    print('Merge master and daily DataFrames. Add newest DATE rows to the master DataFrame..')

    # Normalise DATE to timezone-aware timestamps on both sides *before*
    # de-duplicating: a string date never compares equal to a stored
    # timestamp, so duplicates would otherwise survive every run.
    # assign() returns a new frame, leaving df_daily itself untouched.
    df_master['DATE'] = pd.to_datetime(df_master['DATE'], errors='coerce', utc=True)
    df_new_rows = df_daily.assign(
        DATE=pd.to_datetime(df_daily['DATE'], errors='coerce', utc=True))

    # No resample step here: DataFrame.resample() without an aggregation
    # returns a Resampler object, which cannot be exported or measured.
    df_final = (pd.concat([df_master, df_new_rows], ignore_index=True)
                  .drop_duplicates()
                  .reset_index(drop=True))

    print('Export Master DataFrame to .csv format..')
    os.chdir(master_dir)

    df_final.to_csv(master_csv_file, encoding='utf-8')
    df_final.to_parquet(master_parq_file, engine='pyarrow')

    len_df_daily = len(df_daily)
    len_df_final = len(df_final)

    print(f'Daily DataFrame length: {len_df_daily} \n')
    print(f'Master DataFrame length: {len_df_final} \n')

    print(df_final.head())

else:

    print('CAN_daily_hydrometric_master file does not exist! Create Master .csv file...')

    # Replace all 'NaN' strings in the numeric columns with np.nan
    df_daily[float_cols] = df_daily[float_cols].replace({'NaN': np.nan})

    # Drop all rows containing a null
    df_daily.dropna(axis=0, inplace=True)

    # The master folder may not exist yet on the very first run
    os.makedirs(master_dir, exist_ok=True)
    os.chdir(master_dir)

    # Create the new master files from the DAILY DataFrame
    df_daily.to_csv(master_csv_file, encoding='utf-8')
    df_daily.to_parquet(master_parq_file, engine='pyarrow')

    # Report success only after both files have been written
    print("Master DataFrame and master .csv file created!")

    print(df_daily.head())

## Delete individual provincial .csv files in CanadaDaily directory   

In [None]:
import os
import glob
import time

# Remove every provincial .csv file from the CanadaDaily folder
daily_dir_pattern = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily/*.*'

for entry in glob.glob(daily_dir_pattern):
    print(f'Files in CanadaDaily folder: {entry}')

    # Anything that is not a .csv file is only listed, never deleted
    if not entry.endswith('.csv'):
        continue

    os.remove(entry)

## List of final files

**Files:**

* **Canadian Daily Hydrometric Stations: CAN_daily_hydrometric.csv**
   
   * *Location: C:/Users/pdudar/anaconda3/projects/CanadaWatQual/CanadaDaily*         
  
   
* **Canadian Daily Hydrometric Stations (MASTER): CAN_daily_hydrometric_master.csv**
   
   * *Location: C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily/og*

# Part IV: Plotly Chart generation

## Create Hydrometric Daily Plotly Time Series Charts

* *Create .html charts for all Hydrometric Stations across Canada*
* *Discharge/Flow Rate (cms) is displayed on LEFT side of chart*
* *Water Level (m) is displayed on RIGHT side of chart*

In [None]:
import plotly.express as px
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots

master_file_path = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/CanadaDaily/og/CAN_daily_hydrometric_master.parquet'
plotly_dir = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly'

# Rough per-chart cost in seconds, used only for the up-front time estimate
SECONDS_PER_CHART = 3.0

# Station attributes shown in every hover tooltip
STATION_HOVER_COLUMNS = ['LATITUDE', 'LONGITUDE', 'DRAINAGE_AREA_GROSS_KM2']

# Dark chart theme
PLOT_BGCOLOR = "RGB(45,45,48)"      # 2d2d30
PAPER_BGCOLOR = "RGB(37,37,38)"     # 252526
FONT_COLOR = "RGB(131,148,150)"     # 839496

# Quick-zoom buttons shown above the range slider
RANGE_BUTTONS = [
    dict(count=1, label="1DAY", step="day", stepmode="backward"),
    dict(count=7, label="1WK", step="day", stepmode="backward"),
    dict(count=14, label="2WK", step="day", stepmode="backward"),
    dict(count=1, label="1MNTH", step="month", stepmode="backward"),
    dict(count=3, label="3MNTH", step="month", stepmode="backward"),
    dict(count=6, label="6MNTH", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1YR", step="year", stepmode="backward"),
    dict(step="all"),
]


def _build_station_figure(station, station_df, title_suffix, flow_axis_title,
                          include_date_in_hover):
    """Return a two-axis time-series figure for one station.

    Flow rate is drawn against the left y-axis and water level against the
    right one; each series is added to the figure exactly once.
    """
    def hover_columns(value_column):
        date_part = ['DATE'] if include_date_in_hover else []
        return [value_column] + date_part + STATION_HOVER_COLUMNS

    flow = px.line(station_df, x='DATE', y='FLOWRATE_CMS',
                   hover_name='STATION',
                   hover_data=hover_columns('FLOWRATE_CMS'))
    level = px.line(station_df, x='DATE', y='WATER_LEVEL_M',
                    hover_name='STATION',
                    hover_data=hover_columns('WATER_LEVEL_M'))
    level.update_traces(yaxis="y2")

    subfig = make_subplots(specs=[[{"secondary_y": True}]])
    subfig.add_traces(flow.data + level.data)
    subfig.layout.xaxis.title = "Date"
    subfig.layout.yaxis2.type = "linear"

    # Recolouring is necessary, otherwise both lines would share one colour
    subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

    subfig.update_layout(
        title_text=f'{station}: {title_suffix}',
        plot_bgcolor=PLOT_BGCOLOR,
        paper_bgcolor=PAPER_BGCOLOR,
        font_color=FONT_COLOR,
        title_font_color=FONT_COLOR,
        legend_title_font_color=FONT_COLOR,
    )

    subfig.update_yaxes(title_text=flow_axis_title, secondary_y=False)
    subfig.update_yaxes(title_text="<b>Water Level (m)</b>", secondary_y=True)

    subfig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(buttons=RANGE_BUTTONS),
    )
    return subfig


def _write_station_charts(df, out_dir, title_suffix, flow_axis_title,
                          include_date_in_hover=False):
    """Write one `<STATION>.html` chart into out_dir for every station in df."""
    station_count = df['STATION'].nunique()
    chart_time = station_count * SECONDS_PER_CHART / 60  # seconds to minutes

    print(f'Number of charts to create: {station_count}')
    print(f'Chart processing time: {chart_time} minutes')
    print('Creating Plotly charts.. \n')

    started = datetime.datetime.now()

    # Group by the column name (not a one-element list) so each key is the
    # plain station string, and reuse the grouped rows instead of filtering
    # the whole frame again for every station
    for done, (station, station_df) in enumerate(df.groupby('STATION'), start=1):
        subfig = _build_station_figure(station, station_df, title_suffix,
                                       flow_axis_title, include_date_in_hover)

        # Name each .html file after its station
        subfig.write_html(os.path.join(out_dir, station + '.html'),
                          include_plotlyjs='cdn')

        elapsed = int((datetime.datetime.now() - started).total_seconds())
        print(f'{elapsed} seconds: Completed {done} plotly charts!')

    print('Plotly charts completed! Charts saved in following '
          'directory: /CanadaWatQual/Hydro/plotly')


# Initialise notebook rendering once. The charts are written with
# write_html, so the cufflinks offline switch is not needed.
init_notebook_mode(connected=True)

# Both data sources write into the same chart folder; switching into it keeps
# the working directory the same as before for the cells that follow
os.makedirs(plotly_dir, exist_ok=True)
os.chdir(plotly_dir)

if os.path.isfile(master_file_path):

    # Preferred source: the accumulated master dataset
    df_master = pd.read_parquet(master_file_path, engine='pyarrow')

    _write_station_charts(
        df_master, plotly_dir,
        title_suffix='Canada Hydrometric Flow Data - Time Series Info',
        flow_axis_title="<b>Flow Rate (cms)</b>")

else:

    # Fallback: no master file yet, chart the freshly merged daily data.
    # This path also switches pandas float display to one decimal.
    pd.options.display.float_format = '{:,.1f}'.format

    _write_station_charts(
        df_daily, plotly_dir,
        title_suffix='Canada Hydrometric Flow Data - Time Series with Range Slider: 2021-present',
        flow_axis_title="<b>Discharge/Flow Rate (cms)</b>",
        include_date_in_hover=True)

## Make provincial subfolders for .html TimeSeries files

In [None]:
import os

# Root chart folder (trailing slash kept) and the pattern matching its charts
source_dir = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly/'
html_dir = 'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly/*.html'

# One subfolder per province/territory code
subfolder_names = [
    'BC', 'AB', 'SK',
    'MB', 'ON', 'QC',
    'NB', 'PE', 'NS',
    'NL', 'NU', 'NT',
    'YT',
]

for subfolder in subfolder_names:
    sub_dir = os.path.join(source_dir, subfolder)

    # Create the provincial directory; an existing one is left untouched
    os.makedirs(sub_dir, exist_ok=True)

    # Show the subfolder path
    print(sub_dir)

## Copy all .html files from source directory to Provincial Subfolders



In [None]:
os.chdir(r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly')

import glob
import os
import shutil

source_dir = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly'

# Province/territory codes. A chart is routed by the first two characters of
# its file name (presumably always the province code — verify against the
# STATION naming used when the charts are written).
prov_codes = ('AB', 'BC', 'SK', 'MB', 'ON', 'QC',
              'NB', 'NS', 'PE', 'NL', 'NT', 'NU', 'YT')

# Walk the files inside source_dir (iterating the path string itself would
# only yield single characters) and copy each chart into the subfolder of
# its own province
for f in os.listdir(source_dir):
    if not f.endswith('.html'):
        continue

    prov = f[:2]
    if prov not in prov_codes:
        continue

    # The destination is a subfolder of source_dir, not a root-level path.
    # Create it if needed: copying to a missing folder would instead create
    # a plain file with the folder's name.
    dest_dir = os.path.join(source_dir, prov)
    os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(os.path.join(source_dir, f), dest_dir)

## Delete original .html files from source directory

In [None]:
# Delete original .html files from source directory
dir_path = r'C:\Users\pdudar\anaconda3\projects\CanadaWatQual\Hydro\plotly'
folder = os.listdir(dir_path)

for item in folder:
    if not item.endswith(".html"):
        continue

    # Only delete a chart once its copy exists in the subfolder named after
    # the first two characters of the file name; otherwise a failed copy
    # step would lose the chart for good
    copied = os.path.join(dir_path, item[:2], item)
    if os.path.isfile(copied):
        os.remove(os.path.join(dir_path, item))
    else:
        print(f'Keeping {item}: no copy found in subfolder {item[:2]}')

## Convert all Provincial subfolders to .zip format

- Compress each provincial subfolder into its own .zip archive

In [None]:
import os, os.path
import shutil

reports_path  = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly'

def zip_prov_folders(reports_path, output_dir=None):
    """Compress every immediate subfolder of reports_path into <name>.zip.

    Only the first level of subfolders is archived; folders nested deeper
    are stored inside their parent's archive rather than zipped separately.

    Args:
        reports_path: Folder whose subfolders are compressed.
        output_dir: Folder that receives the .zip files. Defaults to
            reports_path, so the result does not depend on the current
            working directory.
    """
    if output_dir is None:
        output_dir = reports_path
    output_dir = os.path.abspath(output_dir)

    # The first item yielded by os.walk lists the immediate subfolders; a
    # missing reports_path yields nothing, so no archive is created
    _, dirs, _ = next(os.walk(reports_path), (reports_path, [], []))

    for d in dirs:
        file_path = os.path.join(reports_path, d)
        print('Compressing: ' + d )
        shutil.make_archive(os.path.join(output_dir, d), 'zip', file_path)
    print("Completed zipping! \n")

if __name__ == '__main__':
    zip_prov_folders(reports_path)

## Delete empty folders 

In [None]:
"""
import shutil

folder_location  = r'C:/Users/pdudar/anaconda3/projects/CanadaWatQual/Hydro/plotly'
def delete_empty_folders(folder_location):
    all_directories = list(os.walk(folder_location))
    for path, a, b in all_directories:
        if len(os.listdir(path)) == 0:  # Checking if the directory is empty or not
            shutil.rmtree(path)         # Delete the folder if it is empty

if __name__ == '__main__':
    delete_empty_folders(folder_location) #This path is just an example te it

"""

# Runtime

In [None]:

# Seconds elapsed since start_time was captured in the first cell
stop = timeit.default_timer()
total_time = stop - start_time

# Break the elapsed seconds down into hours, minutes and seconds
mins, secs = divmod(total_time, 60)
hours, mins = divmod(mins, 60)

# Each part is truncated to a whole number before it is written out
runtime_message = f"Total running time: {int(hours)}:{int(mins)}:{int(secs)}."
sys.stdout.write(runtime_message)