In [None]:
import pandas as pd
import numpy as np
import math
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# reading the files
# data from http://projects.knmi.nl/klimatologie/daggegevens/selectie.cgi

station = pd.read_csv('../data/knmi_stations_2018.csv', sep=';')
rain = pd.read_excel('../data/KNMI_20200427.xlsx')

# A notebook cell only renders its last bare expression, so preview both
# frames explicitly (display is provided as a builtin by IPython).
display(station.head(), rain.head())

In [None]:
# cleaning blank spaces in columns and station names:
station.columns = station.columns.str.replace(" ", "")
station['NAME'] = station['NAME'].str.replace(" ", "")
rain.columns = rain.columns.str.replace(" ", "")
# cells holding nothing but whitespace are treated as missing values
rain = rain.replace(r'^\s+$', np.nan, regex=True)

# data type check (display() is the IPython builtin; a cell only renders its
# last bare expression, so every intermediate check is shown explicitly):
display(station.dtypes)  # all correct
display(rain.dtypes)     # there are a lot of objects rather than ints
# The measurement columns cannot be cast to int yet because of the NaN values;
# the cast is done later, once the relevant rows and columns are selected.

# change data type to DATE
# Guarded so the cell can be re-run: casting an already-converted datetime
# column to int would no longer yield YYYYMMDD integers.
if not pd.api.types.is_datetime64_any_dtype(rain['YYYYMMDD']):
    rain['YYYYMMDD'] = pd.to_datetime(rain['YYYYMMDD'].astype(int), format='%Y%m%d')
display(rain['YYYYMMDD'].head())

# checking for blank values:
null_cols = station.isna().sum()
display(null_cols[null_cols > 0])  # no blanks

# identifying RTM station CODE:
rtmstation = station.loc[station['NAME'].isin(['ROTTERDAM']), 'STN']
display(rtmstation)

# identifying rain information from RTM station
ROTTERDAM_STN = 344  # station code used for Rotterdam; compare with rtmstation above
# .copy() makes rtmrain an independent frame, so columns added to it later are
# written to rtmrain itself instead of to a slice of `rain`.
rtmrain = rain[rain['STN'] == ROTTERDAM_STN].copy()

# checking for blank values in RTM:
null_cols = rtmrain.isna().sum()
display(null_cols[null_cols > 0])  # has plenty of blanks

rtmrain.head()

In [None]:
# historical analysis on rainfall

# selecting relevant columns for precipitation analysis
# DR  - duration of precipitation (in 0.1 hours)
# RH  - sum of precipitation for one day (in 0.1 mm) (-1 for <0.05 mm)
# RHX - highest hourly sum of precipitation (in 0.1 mm) (-1 for <0.05 mm)

# .copy() so that the assignments below modify this frame itself rather than
# a slice of the frame it was selected from (avoids SettingWithCopy issues).
rtmrain = rtmrain[['YYYYMMDD', 'DR', 'RH', 'RHX']].copy()

# adding columns for month, year and decade
rtmrain['month'] = rtmrain['YYYYMMDD'].dt.month
rtmrain['year'] = rtmrain['YYYYMMDD'].dt.year
rtmrain['decade'] = rtmrain['year'] // 10 * 10  # e.g. 1987 -> 1980

# checking for blank values in rtmrain:
null_cols = rtmrain.isna().sum()
display(null_cols[null_cols > 0])  # has plenty of blanks

# it seems they only started measuring rainfall in 1974, so I will drop everything before that
display(rtmrain.reset_index()[6299:6310])  # rows around the point where values start

# Keep only the rows from 1974 onwards (a filter instead of an in-place drop,
# so re-running the cell gives the same result).
FIRST_RAIN_YEAR = 1974
rtmrain = rtmrain[rtmrain['year'] >= FIRST_RAIN_YEAR].copy()

null_cols = rtmrain.isna().sum()
display(null_cols[null_cols > 0])  # only one blank :-)

# blank from 2010-01-29, random date, probably technical failure
# changing to 76, as rainy profile looks similar to 2014-01-23
ind_nan = rtmrain.index[rtmrain['DR'].isna()]
# .loc writes into rtmrain directly; the chained form rtmrain['DR'][...] = ...
# may assign to a temporary object and leave the frame unchanged.
rtmrain.loc[ind_nan, 'DR'] = 76

null_cols = rtmrain.isna().sum()
display(null_cols[null_cols > 0])  # no more blanks

# changing all the data types of objects to int
column_rain = ['DR', 'RH', 'RHX']
for x in column_rain:
    rtmrain[x] = rtmrain[x].astype(str).astype(int)
rtmrain.dtypes

In [None]:
# looking at outliers for possible mistakes in data
# For each measure, show the rows above that measure's 99th percentile,
# sorted ascending by the measure. display() (IPython builtin) is needed
# because a cell only renders its last bare expression.
for measure in ['DR', 'RH', 'RHX']:
    q = rtmrain[measure].quantile(0.99)
    display(rtmrain[rtmrain[measure] > q].sort_values(by=measure))

# I think it looks OK, only date 1975-06-23 looks a bit out there, but I wont drop lines

In [None]:
plt.style.use('fivethirtyeight')


def rainpermonth(years, data=None):
    """Plot the total precipitation per calendar month, one line per year.

    Parameters
    ----------
    years : iterable of int
        Years to compare. Each value is passed through ``int()``, so a
        whole-number float such as ``1996.0`` is accepted.
    data : pandas.DataFrame, optional
        Frame with a datetime ``YYYYMMDD`` column and a numeric ``RH`` column.
        Defaults to the notebook-level ``rtmrain`` frame.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the lines were drawn on.
    """
    if data is None:
        data = rtmrain

    fig, ax = plt.subplots(figsize=(20, 8))
    for year in years:
        year = int(year)
        in_year = data[data['YYYYMMDD'].dt.year == year]
        # RH uses -1 for amounts below 0.05 mm; clip so these do not
        # subtract from the monthly total.
        monthly_total = (
            in_year['RH']
            .clip(lower=0)
            .groupby(in_year['YYYYMMDD'].dt.month)
            .sum()
        )
        ax.plot(monthly_total.index, monthly_total.to_numpy(), label=year)

    ax.set_title('Total precipitation per month')
    ax.set_xlabel('Month')
    ax.set_ylabel('Precipitation (0.1 mm)')
    ax.set_xticks(range(1, 13))
    ax.legend(title='Year')
    return ax


# first year, midpoint and last year; integer division keeps the midpoint an int
years = [1974, (1974 + 2018) // 2, 2018]
rainpermonth(years)
plt.show()


In [None]:
# total precipitation per month

# Sum the daily RH values into one total per calendar month ('MS' labels each
# bucket with the first day of the month). RH uses -1 for amounts below
# 0.05 mm, so values are clipped at 0 before summing.
monthly_rain = (
    rtmrain.set_index('YYYYMMDD')['RH']
    .clip(lower=0)
    .resample('MS')
    .sum()
    .reset_index()
)

ax = sns.lineplot(x='YYYYMMDD', y='RH', data=monthly_rain)
ax.set(
    title='Total precipitation per month',
    xlabel='Month',
    ylabel='Precipitation (0.1 mm)',
);