In [1]:
# These three wget commands will download three years' worth of data
!if [ ! -f "SE2018.csv" ]; then curl https://meteonet.umr-cnrm.fr/dataset/data/SE/ground_stations/SE_ground_stations_2018.tar.gz -o SE_ground_stations_2018.tar.gz; else echo "SE2018.csv found"; fi

SE2018.csv found


In [2]:
# Let's untar and unzip them
!if [ ! -f "SE2018.csv" ]; then tar -xvf SE_ground_stations_2018.tar.gz && rm -f SE_ground_stations_2018.tar.gz; else echo "SE2018.csv found"; fi

SE2018.csv found


In [3]:
# Are they listed?
!ls -l -sh SE2*.csv

3.2G -rw-r--r-- 1 mitesh mitesh 3.2G Jan 23  2020 SE2016.csv
3.2G -rw-r--r-- 1 mitesh mitesh 3.2G Jan 23  2020 SE2017.csv
3.3G -rw-r--r-- 1 mitesh mitesh 3.3G Jan 23  2020 SE2018.csv


In [4]:
## load cudf.pandas kernel
%load_ext cudf.pandas

In [5]:
# import cudf
import cupy as cp
import pandas as pd

In [6]:
gdf = pd.read_csv('./SE2018.csv')
gdf = gdf.drop(columns=['dd','precip','td'])
# Change the date column to datetime datatype. Look at the DataFrame's info
gdf['date'] = pd.to_datetime(gdf['date'])
# gdf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 43308315 entries, 0 to 43308314
Data columns (total 9 columns):
 #   Column      Dtype
---  ------      -----
 0   number_sta  int64
 1   lat         float64
 2   lon         float64
 3   height_sta  float64
 4   date        datetime64[ns]
 5   ff          float64
 6   hu          float64
 7   t           float64
        object
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 2.8+ GB


In [7]:
gdf['year'] = gdf['date'].dt.year
gdf['month'] = gdf['date'].dt.month
gdf['day'] = gdf['date'].dt.day
gdf['hour'] = gdf['date'].dt.hour
gdf['mins'] = gdf['date'].dt.minute

#Remember how to check the bottom of a DataFrame without displaying millions of lines?
# gdf.tail()

In [8]:
# Let's use cupy.logical_and(...) function to select the data from specific time range.
# We may combine more logical_and() functions to achieve more than 2 and conditions.
# You'll need to make sure the start and end times are part of the dataset, if you
# opted to use a partial dataset for the sake of GPU memory.

start_time = pd.Timestamp('2017-02-01T00')
end_time = pd.Timestamp('2018-11-01T00')
station_id = 84086001
gdf_period = gdf.loc[cp.logical_and(cp.logical_and(gdf['date']>start_time,gdf['date']<end_time),gdf['number_sta']==station_id)]
gdf_period.shape

(72395, 14)

In [9]:
## Set "date" as the index. See what that does?
gdf_period.set_index("date", inplace=True)
# gdf_period.tail()

In [19]:
## Now, resample by daylong intervals, and check the max data during the resampled period. 
## We use .reset_index() to reset the index instead of date.
gdf_day_max = gdf_period.resample('D').max().bfill().reset_index()

## Resample with monthlong intervals, and check the mean data during the resampled period.
## Focus on year 2018 as an example. 
gdf_month_mean = gdf_period[gdf_period["year"]==2018].groupby('month').mean().reset_index()

Unnamed: 0,month,number_sta,lat,lon,height_sta,ff,hu,t,year,day,hour,mins
0,1,84086001.0,43.81,5.15,672.0,5.723396,79.878845,279.848839,2018.0,16.637551,11.504933,27.0
1,2,84086001.0,43.81,5.15,672.0,5.715506,75.301488,274.199509,2018.0,14.5,11.5,27.0
2,3,84086001.0,43.810153,5.149389,672.0,6.310645,74.870565,279.366855,2018.0,16.0,11.5,27.0
3,4,84086001.0,43.811,5.146,672.0,5.734847,65.348194,285.854847,2018.0,15.5,11.5,27.0
4,5,84086001.0,43.811,5.146,672.0,3.986358,77.578898,287.464785,2018.0,16.0,11.5,27.0


## Section 5: Applying cuxfilter and Finding Daily Temperature Variances

In [11]:
# First, let's import the modules from cuXfilter we'll need.
import cuxfilter
from cuxfilter import themes, layouts
from cuxfilter.assets.custom_tiles import get_provider, Vendors

In [12]:
# It's time to perform the cross filtering operation.
cux_df = cuxfilter.DataFrame.from_dataframe(gdf_day_max)

# Let's make a plot.
chart1 = cuxfilter.charts.line(x='date',y='t',title='Max Temperature of Day')
d = cux_df.dashboard([chart1],layout_array=[[1]], theme=cuxfilter.themes.rapids, data_size_widget=True)
d.app()

ValueError: ClassSelector parameter 'InteractiveDatashader.source_df' value must be an instance of (DataFrame, DataFrame), not           date  number_sta     lat    lon  height_sta    ff     hu       t  \
0   2018-01-01    84086001  43.810  5.150       672.0  10.3   99.0  281.95   
1   2018-01-02    84086001  43.810  5.150       672.0  10.4   88.0  281.35   
2   2018-01-03    84086001  43.810  5.150       672.0   9.5   93.0  286.45   
3   2018-01-04    84086001  43.810  5.150       672.0   8.8   90.0  286.85   
4   2018-01-05    84086001  43.810  5.150       672.0   9.0   92.0  289.35   
..         ...         ...     ...    ...         ...   ...    ...     ...   
299 2018-10-27    84086001  43.811  5.146       672.0   7.2  100.0  285.45   
300 2018-10-28    84086001  43.811  5.146       672.0   7.3  100.0  283.25   
301 2018-10-29    84086001  43.811  5.146       672.0   9.1  100.0  280.15   
302 2018-10-30    84086001  43.811  5.146       672.0  11.2  100.0  281.55   
303 2018-10-31    84086001  43.811  5.146       672.0  13.6  100.0  283.55   

    psl\r  year  month  day  hour  mins  
0      \r  2018      1    1    23    54  
1      \r  2018      1    2    23    54  
2      \r  2018      1    3    23    54  
3      \r  2018      1    4    23    54  
4      \r  2018      1    5    23    54  
..    ...   ...    ...  ...   ...   ...  
299    \r  2018     10   27    23    54  
300    \r  2018     10   28    23    54  
301    \r  2018     10   29    23    54  
302    \r  2018     10   30    23    54  
303    \r  2018     10   31    23    54  

[304 rows x 14 columns].

In [13]:
# And the mean temperature
cux_df = cuxfilter.DataFrame.from_dataframe(gdf_month_mean)

# Let's make a plot.
chart2 = cuxfilter.charts.line(x='month',y='t',title='Mean Temperature of Month on Year 2018-01 ~ 2018-10')
d = cux_df.dashboard([chart2],layout_array=[[1]], theme=cuxfilter.themes.rapids, data_size_widget=True)
d.app()

ValueError: ClassSelector parameter 'InteractiveDatashader.source_df' value must be an instance of (DataFrame, DataFrame), not    month  number_sta        lat       lon  height_sta        ff         hu  \
0      1  84086001.0  43.810000  5.150000       672.0  5.723396  79.878845   
1      2  84086001.0  43.810000  5.150000       672.0  5.715506  75.301488   
2      3  84086001.0  43.810153  5.149389       672.0  6.310645  74.870565   
3      4  84086001.0  43.811000  5.146000       672.0  5.734847  65.348194   
4      5  84086001.0  43.811000  5.146000       672.0  3.986358  77.578898   
5      6  84086001.0  43.811000  5.146000       672.0  4.092944  69.817222   
6      7  84086001.0  43.811000  5.146000       672.0  4.041219  55.033965   
7      8  84086001.0  43.811000  5.146000       672.0  4.214624  61.451075   
8      9  84086001.0  43.811000  5.146000       672.0  3.779583  64.454722   
9     10  84086001.0  43.811000  5.146000       672.0  5.132343  75.868414   

            t    year        day       hour       mins  
0  279.848839  2018.0  16.637551  11.504933  27.000000  
1  274.199509  2018.0  14.500000  11.500000  27.000000  
2  279.366855  2018.0  16.000000  11.500000  27.000000  
3  285.854847  2018.0  15.500000  11.500000  27.000000  
4  287.464785  2018.0  16.000000  11.500000  27.000000  
5  291.707028  2018.0  15.500000  11.500000  27.000000  
6  296.290433  2018.0  16.009161  11.508959  26.991513  
7  295.031223  2018.0  16.000000  11.500000  27.000000  
8  292.081111  2018.0  15.500000  11.500000  27.000000  
9  286.593925  2018.0  16.000000  11.500000  27.000000  .