# This notebook retrieves discharge data for all USGS stations located in New York State (NYS)
##### Author: Omid Emamjomehzadeh (https://www.omidemam.com/)
##### Supervisor: Dr. Omar Wani (https://engineering.nyu.edu/faculty/omar-wani)
##### Hydrologic Systems Group @NYU (https://www.omarwani.com/)

In [13]:
# import libraries
import pandas as pd
import numpy as np
import dataretrieval.nwis as nwis
from tqdm import tqdm
import os
import datetime

# An example of using NWIS

In [7]:
# Query NWIS for New York State sites, filtered to parameter code 00060 (discharge).
# what_sites returns a (table, metadata) pair; the table is what the rest of
# this notebook works with.
nys_stations, md2 = nwis.what_sites(stateCd="NY", parameterCd="00060")
nys_stations.head()

Unnamed: 0,agency_cd,site_no,station_nm,site_tp_cd,dec_lat_va,dec_long_va,coord_acy_cd,dec_coord_datum_cd,alt_va,alt_acy_va,alt_datum_cd,huc_cd,geometry
0,USGS,1199400,WEBATUCK CREEK NEAR SOUTH AMENIA NY,ST,41.780833,-73.555278,S,NAD83,,,,1100005.0,POINT (-73.55528 41.78083)
1,USGS,1199490,SWAMP RIVER NEAR DOVER PLAINS NY,ST,41.698889,-73.583611,S,NAD83,370.0,10.0,NGVD29,1100005.0,POINT (-73.58361 41.69889)
2,USGS,1200000,"TENMILE RIVER NEAR GAYLORDSVILLE, CT",ST,41.658764,-73.528683,H,NAD83,304.0,5.0,NGVD29,1100005.0,POINT (-73.52868 41.65876)
3,USGS,1209795,TRINITY LAKE RES NR POND RIDGE NY,ST,41.21454,-73.55457,S,NAD83,,,,1100006.0,POINT (-73.55457 41.21454)
4,USGS,1300000,BLIND BROOK AT RYE NY,ST,40.983333,-73.686944,S,NAD83,13.05,0.01,NGVD29,1100006.0,POINT (-73.68694 40.98333)


In [8]:
# Request settings for a single-site example of the web service call
siteID = '10109000'  # LOGAN RIVER ABOVE STATE DAM, NEAR LOGAN, UT
parameterCode = '00060'  # Discharge
startDate = '2020-09-01'
endDate = '2021-09-30'

# Fetch the instantaneous values. get_iv hands back a (DataFrame, metadata)
# tuple, so element 0 is the table whose rows are counted below.
discharge = nwis.get_iv(
    sites=siteID,
    parameterCd=parameterCode,
    start=startDate,
    end=endDate,
)
print(f'Retrieved {len(discharge[0])} data values.')
discharge

Retrieved 37920 data values.


(                            site_no  00060 00060_cd
 datetime                                           
 2020-09-01 06:00:00+00:00  10109000  101.0        A
 2020-09-01 06:15:00+00:00  10109000   99.0        A
 2020-09-01 06:30:00+00:00  10109000  101.0        A
 2020-09-01 06:45:00+00:00  10109000   99.0        A
 2020-09-01 07:00:00+00:00  10109000   99.0        A
 ...                             ...    ...      ...
 2021-10-01 04:45:00+00:00  10109000   53.2        A
 2021-10-01 05:00:00+00:00  10109000   53.2        A
 2021-10-01 05:15:00+00:00  10109000   53.2        A
 2021-10-01 05:30:00+00:00  10109000   54.6        A
 2021-10-01 05:45:00+00:00  10109000   54.6        A
 
 [37920 rows x 3 columns],
 NWIS_Metadata(url=https://nwis.waterservices.usgs.gov/nwis/iv/?format=json&parameterCd=00060&startDT=2020-09-01&endDT=2021-09-30&sites=10109000))

In [10]:
# Instantaneous ("iv") discharge (00060) for site 04244000.
# The very early start date is presumably there to request the full available record.
discharge_data = nwis.get_record(
    sites='04244000',
    service='iv',
    parameterCd='00060',
    start='1900-01-01',
)
discharge_data.head()

Unnamed: 0_level_0,site_no,00060,00060_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-08-07 16:15:00+00:00,4244000,80.9,A
2014-08-07 16:30:00+00:00,4244000,82.7,A
2014-08-07 16:45:00+00:00,4244000,82.7,A
2014-08-07 17:00:00+00:00,4244000,84.5,A
2014-08-07 17:15:00+00:00,4244000,79.1,A


In [11]:
# Annual peak records ("peaks" service) for a single site, from 1970 onward
df = nwis.get_record(
    sites='01200000',
    service='peaks',
    start='1970-01-01',
)
df.head()

Unnamed: 0_level_0,agency_cd,site_no,peak_tm,peak_va,peak_cd,gage_ht,gage_ht_cd,year_last_pk,ag_dt,ag_tm,ag_gage_ht,ag_gage_ht_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-04-03 00:00:00+00:00,USGS,1200000,,3940,,7.07,,,,,,
1971-09-14 00:00:00+00:00,USGS,1200000,,1630,,4.76,,,,,,
1972-03-18 00:00:00+00:00,USGS,1200000,,3160,,6.38,,,,,,
1973-02-03 00:00:00+00:00,USGS,1200000,,5610,,8.37,,,,,,
1973-12-22 00:00:00+00:00,USGS,1200000,,5730,,8.46,,,,,,


# Download the discharge data for all USGS stations located in NYS

In [72]:
# Folder that receives one CSV per station as well as the station summary table.
SAVE_DIR = r'D:\culvert repo\data\USGS\discharge_NYS'
# Create the folder up front. If it is missing, every to_csv call made by
# get_drainage_area_and_length raises, that function's broad `except` swallows
# the error, and the whole run silently produces rows of None.
os.makedirs(SAVE_DIR, exist_ok=True)

def get_drainage_area_and_length(site_no, parameter_cd='00060'):
    """Download one station's instantaneous record, save it, and summarise it.

    The station's site metadata is queried for ``drain_area_va``, the
    instantaneous-value ("iv") record is requested from 1900-01-01 onward,
    and a non-empty record is written to ``SAVE_DIR/<site_no>.csv``.

    Parameters
    ----------
    site_no
        Site number passed to NWIS; also used as the CSV file name.
    parameter_cd : str, optional
        Parameter code requested from the "iv" service. Defaults to
        ``'00060'`` (discharge) so that only the discharge series is
        downloaded rather than every instantaneous parameter of the site.

    Returns
    -------
    tuple
        ``(drain_area, start_time, end_time, time_interval, data_length)``:

        * ``drain_area`` is the first ``drain_area_va`` value, or ``None``
          when the column is absent or the metadata table is empty.
        * ``start_time`` / ``end_time`` are the earliest / latest timestamps.
        * ``time_interval`` is the most frequent spacing between consecutive
          timestamps, or ``None`` when the record has a single row.
        * ``data_length`` is the number of rows in the record.

        An empty record yields ``(drain_area, None, None, None, 0)``.
        If anything raises, the error is printed and five ``None`` values
        are returned so that a long batch run can continue.
    """
    try:
        # Site metadata: the drainage-area column is optional and the table
        # itself may come back empty, so guard both before indexing row 0.
        site_info = nwis.get_record(sites=site_no, service='site')
        if 'drain_area_va' in site_info.columns and len(site_info) > 0:
            drain_area = site_info['drain_area_va'].values[0]
        else:
            drain_area = None

        # Restrict the request to the wanted parameter; without parameterCd
        # the service returns every instantaneous series the site offers.
        discharge_data = nwis.get_record(
            sites=site_no,
            service='iv',
            parameterCd=parameter_cd,
            start='1900-01-01',
        )
        data_length = len(discharge_data)
        if data_length == 0:
            return drain_area, None, None, None, data_length

        # Save the raw record before deriving any summary features.
        discharge_file = os.path.join(SAVE_DIR, f"{site_no}.csv")
        discharge_data.to_csv(discharge_file)

        timestamps = pd.to_datetime(discharge_data.index)
        start_time = timestamps.min()
        end_time = timestamps.max()

        # A single-row record has no consecutive differences, so mode() is
        # empty; report "no interval" instead of failing the whole station.
        time_diffs = timestamps.to_series().diff().dropna()
        interval_modes = time_diffs.mode()
        time_interval = interval_modes.iloc[0] if not interval_modes.empty else None

        return drain_area, start_time, end_time, time_interval, data_length

    except Exception as e:
        # Deliberate best-effort: one failing station must not abort the batch.
        print(f"Error retrieving data for site {site_no}: {e}")
        return None, None, None, None, None

def add_drainage_area_and_length(df, site_col='site_no'):
    """Append per-station retrieval results to a station table.

    For every value in ``df[site_col]`` the function calls
    ``get_drainage_area_and_length`` (with a tqdm progress bar) and stores the
    five returned values in the columns ``drain_Area``, ``start_time``,
    ``end_time``, ``time_interval`` and ``data_Length``.

    Parameters
    ----------
    df : pandas.DataFrame
        Station table; it is modified in place.
    site_col : str, optional
        Name of the column holding the site numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the five extra columns.
    """
    summary_columns = ['drain_Area', 'start_time', 'end_time', 'time_interval', 'data_Length']

    def _fetch_as_series(site):
        # Wrapping the 5-tuple in a Series makes apply expand it into columns.
        return pd.Series(get_drainage_area_and_length(site))

    # Registers Series.progress_apply, which behaves like apply plus a progress bar.
    tqdm.pandas(desc="Retrieving Data")
    fetched = df[site_col].progress_apply(_fetch_as_series)
    df[summary_columns] = fetched
    return df

# 
# Run the retrieval on a copy so the station table displayed earlier in the
# notebook is not modified in place, then write the summary into SAVE_DIR
# (the same folder as the per-station CSV files) instead of repeating the path.
nys_stations_dis = add_drainage_area_and_length(nys_stations.copy())
nys_stations_dis.to_csv(os.path.join(SAVE_DIR, 'USGS_stations.csv'))

Retrieving Drainage Areas and Data Lengths: 100%|████████████████████████████████| 1469/1469 [6:27:48<00:00, 15.84s/it]


In [12]:
# Record the local date and time at which this cell was executed
now = datetime.datetime.now()
print(f"Date and time: {now}")

Date and time: 2025-05-09 11:41:43.420968


In [17]:
# Load the watermark IPython extension (it only reports a notice if already loaded)
%load_ext watermark
# Print the Python version, machine details, and the versions of the listed packages
%watermark -v -m -p numpy,pandas,dataretrieval,tqdm

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.12.4
IPython version      : 8.20.0

numpy        : 2.0.2
pandas       : 2.2.2
dataretrieval: 0.0.0
tqdm         : 4.66.5

Compiler    : MSC v.1940 64 bit (AMD64)
OS          : Windows
Release     : 11
Machine     : AMD64
Processor   : Intel64 Family 6 Model 183 Stepping 1, GenuineIntel
CPU cores   : 24
Architecture: 64bit

