In [28]:
import pandas  as pd
import scipy

In [29]:
stn_ids = pd.read_fwf('http://noaa-ghcn-pds.s3.amazonaws.com/ghcnd-stations.txt', header=None, infer_nrows=1000)
stn_ids.columns = ['ID','LAT','LON','ELEV','UKN','NAME','GSN','WBAN']
stn_ids

Unnamed: 0,ID,LAT,LON,ELEV,UKN,NAME,GSN,WBAN
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,41196.0
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,41194.0
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,41217.0
...,...,...,...,...,...,...,...,...
127989,ZI000067969,-21.0500,29.3670,861.0,,WEST NICHOLSON,,67969.0
127990,ZI000067975,-20.0670,30.8670,1095.0,,MASVINGO,,67975.0
127991,ZI000067977,-21.0170,31.5830,430.0,,BUFFALO RANGE,,67977.0
127992,ZI000067983,-20.2000,32.6160,1132.0,,CHIPINGE,GSN,67983.0


In [30]:
#pick 5 cities and pull station IDs
def city_station(city):
    # function to sort through stn ids for city name so that it can be searched in next bit of code which only pulls by station for elements
    stn = stn_ids[stn_ids['NAME'].str.contains(city, na=False)]
    return stn[['ID', 'NAME']]

city_station('MIAMI')




Unnamed: 0,ID,NAME
7851,ASN00040417,MIAMI BARDON AVE
28034,CA005021732,MIAMI
28035,CA005021735,MIAMI HAWKEN
28036,CA005021736,MIAMI ORCHARD
28037,CA005021737,MIAMI THIESSEN
31838,CA1MB000023,MIAMI 10.4 SW
55825,US1AZGL0016,MIAMI 1.1 W
56542,US1AZPN0072,MIAMI 8.3 WSW
63599,US1FLMD0002,MIAMI 4.9 NNE
63601,US1FLMD0006,MIAMI LAKES 2.1 N


In [31]:
def get_available_years(station_id):
    df = pd.read_csv(
        "s3://noaa-ghcn-pds/csv/by_station/" + station_id + ".csv",
        storage_options={"anon": True},
        parse_dates=['DATE']
    )
    print(df['DATE'].dt.year.unique())  # Returns the unique years presen

get_available_years('USW00012839')

  df = pd.read_csv(


[1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975
 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
 2018 2019 2020 2021 2022 2023 2024]


In [32]:
#pull city station info into list based on airport data

city_airports= ['CHICAGO OHARE INTL AP','MIAMI INTL AP','LOS ANGELES INTL AP','JFK INTL AP','HOUSTON INTERCONTINENTAL AP']
station_ids = []

for airport in city_airports:
    station_info = city_station(airport)
    station_ids.append(station_info)


station_ids_df = pd.concat(station_ids, ignore_index=True)

station_ids_df['NAME'] = station_ids_df['NAME'].replace({'CHICAGO OHARE INTL AP':'Chicago','MIAMI INTL AP':'Miami','LOS ANGELES INTL AP': "Los Angeles",'JFK INTL AP': 'New York City','HOUSTON INTERCONTINENTAL AP':'Houston'})
station_ids_df

Unnamed: 0,ID,NAME
0,USW00094846,Chicago
1,USW00012839,Miami
2,USW00023174,Los Angeles
3,USW00094789,New York City
4,USW00012960,Houston


In [33]:
# set date range
import datetime

time_start = '1981-01-01'
time_end = datetime.datetime.now().strftime('%Y-%m-%d')


In [34]:
def station_data_values(station_id, time_start, time_end):
    #reads in data
    df = (
    pd.read_csv(
        "s3://noaa-ghcn-pds/csv/by_station/" + station_id + ".csv",
        storage_options={"anon": True},  # Anonymous access to S3
        dtype={'Q_FLAG': 'object', 'M_FLAG': 'object'},  # Define data types
        parse_dates=['DATE']  # Parse the 'DATE' column as datetime
    )
    .set_index('DATE')  # Set 'DATE' as the DataFrame index
    .sort_index()  
)
    df_range = df.loc[time_start:time_end]

     #find max tmax and min tmin
    df_tmax = df_range.loc[df_range['ELEMENT'] == 'TMAX', 'DATA_VALUE'] /10
    df_tmin = df_range.loc[df_range['ELEMENT'] == 'TMIN', 'DATA_VALUE'] /10
   
    #annual mean min temp
    ser=df_tmin[~((df_tmin.index.month==2)&(df_tmin.index.day==29))]
    tmin_mean =ser.groupby(ser.index.day_of_year).mean()
    ser2=df_tmax[~((df_tmax.index.month==2)&(df_tmax.index.day==29))]
    tmax_mean=ser2.groupby(ser2.index.day_of_year).mean()

    #record min
    record_tmin = ser.groupby(ser.index.day_of_year).min()
    record_tmax = ser2.groupby(ser2.index.day_of_year).max()  
    
    #create pandas df of all values

    df_grouped = pd.DataFrame(
        {'record_min_temp': record_tmin, 'average_min_temp': tmin_mean, 'average_max_temp': tmax_mean,'record_max_temp':record_tmax},
    )

    df_actual = pd.DataFrame(
        {'actual_low':df_tmin, 'actual_high': df_tmax}
    )

    df_actual['day_of_year'] = df_actual.index.day_of_year

    df_merged = pd.merge(df_actual, df_grouped, left_on='day_of_year', right_index=True, how='left')
    df_merged.drop('day_of_year', axis=1, inplace=True)
    
    return df_merged


In [35]:
#output and example
station_data_values('USW00094846', time_start, time_end)

Unnamed: 0_level_0,actual_low,actual_high,record_min_temp,average_min_temp,average_max_temp,record_max_temp
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-01-01,-3.3,0.6,-22.7,-7.618605,0.158140,9.4
1981-01-02,-8.3,-3.3,-22.7,-6.981395,0.660465,16.1
1981-01-03,-20.6,-5.6,-24.3,-7.104651,0.597674,14.4
1981-01-04,-22.8,-15.0,-22.8,-7.904651,0.530233,17.8
1981-01-05,-18.9,-4.4,-26.7,-8.427907,-0.211628,12.8
...,...,...,...,...,...,...
2023-12-20,-1.0,7.2,-24.9,-6.595349,1.006977,13.3
2023-12-21,0.6,8.9,-25.6,-5.965116,1.527907,13.3
2023-12-22,6.1,7.2,-27.8,-6.323256,2.044186,13.3
2023-12-23,6.1,9.4,-29.4,-6.662791,0.574419,16.7


In [55]:
stations_data = []

for station_id in station_ids_df['ID']:
    station_data = station_data_values(station_id,time_start,time_end)
    station_data['ID'] = station_id
    stations_data.append(station_data)

combined_stations_df = pd.concat(stations_data, ignore_index=False)


Unnamed: 0_level_0,actual_low,actual_high,record_min_temp,average_min_temp,average_max_temp,record_max_temp,ID
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1981-01-01,-3.3,0.6,-22.7,-7.618605,0.158140,9.4,USW00094846
1981-01-02,-8.3,-3.3,-22.7,-6.981395,0.660465,16.1,USW00094846
1981-01-03,-20.6,-5.6,-24.3,-7.104651,0.597674,14.4,USW00094846
1981-01-04,-22.8,-15.0,-22.8,-7.904651,0.530233,17.8,USW00094846
1981-01-05,-18.9,-4.4,-26.7,-8.427907,-0.211628,12.8,USW00094846
...,...,...,...,...,...,...,...
2024-07-05,26.1,37.2,20.0,23.761364,34.109091,38.3,USW00012960
2024-07-06,24.4,36.7,20.6,23.670455,34.011364,37.2,USW00012960
2024-07-07,24.4,32.2,20.6,23.720455,33.734091,38.9,USW00012960
2024-07-08,23.3,28.9,21.1,23.893182,33.834091,38.3,USW00012960


In [37]:
#combined_stations_df.to_csv("combined_stations.csv")

In [67]:

import datetime
from os.path import dirname, join
import os
import pandas as pd
from scipy.signal import savgol_filter

from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import ColumnDataSource, DataRange1d, Select
from bokeh.palettes import Blues4
from bokeh.plotting import figure
from bokeh.io import show, output_notebook, push_notebook

output_notebook()


STATISTICS = ['record_min_temp', 'average_min_temp', 'actual_low', 'actual_high', 'average_max_temp','record_max_temp']

def get_dataset(src,name, distribution):
    df = src[src.ID == name].copy()
    del df['ID']
    df['DATE'] = pd.to_datetime(df.DATE)
    #timedelta here instead of pd.DateOffset to avoid pandas bug < 0.18 (Pandas issue #11925)
    df['left'] = df.DATE - datetime.timedelta(days=0.5)
    df['right'] = df.DATE + datetime.timedelta(days=0.5)
    df = df.set_index(['DATE'])
    df.sort_index(inplace=True)
    if distribution == 'Smoothed':
        window, order = 51, 3
        for key in STATISTICS:
            df[key] = savgol_filter(df[key], window, order)

    return ColumnDataSource(data=df)

def make_plot(source, title):
    plot = figure(x_axis_type="datetime", width=800, tools="", toolbar_location=None)
    
    plot.title.text = title

    plot.quad(top='record_max_temp', bottom='record_min_temp', left='left', right='right',
              color=Blues4[2], source=source, legend_label="Record")
    plot.quad(top='average_max_temp', bottom='average_min_temp', left='left', right='right',
              color=Blues4[1], source=source, legend_label="Average")
    plot.quad(top='actual_high', bottom='actual_low', left='left', right='right',
              color=Blues4[0], alpha=0.5, line_color="black", source=source, legend_label="Actual")

    # fixed attributes
    plot.xaxis.axis_label = None
    plot.yaxis.axis_label = "Temperature (C)"
    plot.axis.axis_label_text_font_style = "bold"
    plot.x_range = DataRange1d(range_padding=0.0)
    plot.grid.grid_line_alpha = 0.3
    
    # Set x-axis range for the selected year
    start_date = pd.Timestamp(f'{year}-01-01')
    end_date = pd.Timestamp(f'{year}-12-31')
    plot.x_range = DataRange1d(start=start_date, end=end_date, range_padding=0.0)

    return plot

def update_plot(attrname, old, new):
    city = city_select.value
    year = year_select.value
    plot.title.text = "Weather data for " + cities[city]['title']

    src = get_dataset(df, cities[city]['ID'], distribution_select.value)
    source.data.update(src.data)


    start_date = pd.Timestamp(f'{year}-01-01')
    end_date = pd.Timestamp(f'{year}-12-31')
    plot.x_range.start = start_date
    plot.x_range.end = end_date

    push_notebook()

city = 'Chicago'
distribution = 'Discrete'
year = '1981'

cities = {
  'Chicago':
        {'ID': 'USW00094846',
        'title': 'CHICAGO'
        },
    'Miami':
        {'ID': 'USW00012839',
        'title': 'MIAMI'
        },
     'Los Angeles':
        {'ID': 'USW00023174',
        'title': 'LOS ANGELES'
        },
    'New York':
        {'ID': 'USW00094789', 
         'title': 'NEW YORK'
         },
    'Houston':
        {'ID': 'USW00012960', 
        'title': 'HOUSTON'
        },
    }


df = pd.read_csv('./combined_stations.csv')
source = get_dataset(df, cities[city]['ID'], distribution)
plot = make_plot(source, f"Weather data for {cities[city]['title']} in {year}")

# Dropdowns
city_select = Select(value=city, title='City', options=sorted(cities.keys()))
year_select = Select(value=year, title='Year', options=[str(i) for i in range(1981, 2024)])
distribution_select = Select(value=distribution, title='Distribution', options=['Discrete', 'Smoothed'])


distribution = distribution_select.value
city = city_select.value
year = year_select.value

#onchange
city_select.on_change('value', update_plot)
year_select.on_change('value', update_plot)
distribution_select.on_change('value', update_plot)

# Layout of the controls and plot
controls = column(city_select, year_select, distribution_select)
layout = row(plot, controls)

show(layout)


You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.

Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:

    https://docs.bokeh.org/en/latest/docs/user_guide/interaction/js_callbacks.html

Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:

    https://docs.bokeh.org/en/latest/docs/user_guide/server.html



In [39]:
cities[city]['ID']

'USW00094846'