## Santa Barbara Weather Forecast Model Evaluation
### UCSB Climate Variation and Changes Research Group
#### Advisor: Professor Charles Jones
#### Author: Pippa Lin

In [1]:
import numpy as np
import pandas as pd
import altair as alt
import math
import os
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

### Introduction:
This research project aims to interpret present and future climates in Santa Barbara using the Numerical Weather Forecast (NWF) model. The data is collected from https://clivac.eri.ucsb.edu/clivac/wrfreal/index.html. In this project, I performed a statistical analysis comparing NWF model data with weather station observations by season, computing the root mean square error, mean bias, and correlation by hour to assess model performance.

### 1.Read the data:

In [2]:
# NOTE(review): absolute, machine-specific path; change it to wherever the data files live.
folder_path = '/Users/pippalin/Desktop/Climate Research/xskill-mtic1/'
# os.listdir gives no ordering guarantee, so sort the names for a reproducible row order.
file_names = sorted(os.listdir(folder_path))

# Column labels applied to every file: ten "mod*" side columns followed by ten
# "obs*" side columns (presumably timestamp fields + variables for each side).
# The second-half minute column is 'min1'; it was previously a second 'mm1',
# which duplicated the label of the second-half month column.
column_names = ['yyyy','mm', 'dd', 'hh', 'min','ss','modtemp','modrh','modwsp','modwd',
                'yyyy1','mm1', 'dd1', 'hh1', 'min1','ss1','obstemp','obsrh','obswsp','obswd']

# Initialize an empty list to store the data frames
dfs = []

# Loop through the list of file names and read each file into a Pandas DataFrame
for file_name in file_names:
    # Only the .txt files are read
    if file_name.endswith('.txt'):
        # Read the file into a Pandas DataFrame
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, skiprows = 7,nrows = 73) # since files with > 73 rows has nan values after row 73
        # Rename the columns. Plain assignment is used because
        # `set_axis(..., inplace=True)` is no longer accepted by pandas >= 2.0.
        df.columns = column_names

        # First, move the obs-part up by one, covering the first row, and drop the last row:
        df.loc[0:len(df)-2, 'yyyy1':'obswd'] = df.loc[1:len(df)-1, 'yyyy1':'obswd'].values
        df = df.drop(df.index[-1])

        # Add a column hour: the row position within the file after the shift
        df["hour"] = range(len(df))

        dfs.append(df)

# Concatenate the data frames into a single data frame, ordered by year and month
merged_df = pd.concat(dfs).sort_values(by=['yyyy','mm']).reset_index(drop=True)

In [3]:
# Display the merged frame to inspect the result of the loading step
merged_df

Unnamed: 0,yyyy,mm,dd,hh,min,ss,modtemp,modrh,modwsp,modwd,...,mm1,dd1,hh1,mm1.1,ss1,obstemp,obsrh,obswsp,obswd,hour
0,2020,4,25,0,0,0,27.817,44.783,1.64850,4.2949,...,4.0,25.0,0.0,47.0,0.0,30.000,19.0,5.370,11.0,0
1,2020,4,25,1,0,0,30.126,36.512,1.15990,326.0200,...,4.0,25.0,1.0,47.0,0.0,28.890,19.0,12.070,15.0,1
2,2020,4,25,2,0,0,29.504,23.131,6.52000,25.0390,...,4.0,25.0,2.0,47.0,0.0,27.220,21.0,19.220,16.0,2
3,2020,4,25,3,0,0,27.334,26.628,8.96630,15.1050,...,4.0,25.0,3.0,47.0,0.0,26.670,22.0,17.430,18.0,3
4,2020,4,25,4,0,0,26.637,27.462,8.18180,17.1510,...,4.0,25.0,4.0,47.0,0.0,26.670,20.0,18.780,16.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59224,2023,1,17,19,0,0,10.769,52.299,1.02220,242.2200,...,1.0,17.0,19.0,47.0,0.0,10.556,46.0,4.471,340.0,67
59225,2023,1,17,20,0,0,11.540,48.411,0.66567,190.7500,...,1.0,17.0,20.0,47.0,0.0,11.111,42.0,3.575,351.0,68
59226,2023,1,17,21,0,0,12.445,43.218,1.44200,50.4810,...,1.0,17.0,21.0,47.0,0.0,11.667,42.0,2.233,329.0,69
59227,2023,1,17,22,0,0,12.055,43.034,4.59610,12.5020,...,1.0,17.0,22.0,47.0,0.0,11.667,40.0,3.128,41.0,70


### 2.Group by Season
* March - May: Spring
* June - August: Summer
* September - November: Fall
* December - February: Winter

In [4]:
# Split the merged data into seasons using the month column `mm`.
# `.copy()` gives each season its own frame, so the columns assigned to it in
# later cells are written to that frame and not to a slice of `merged_df`.
Spr = merged_df[merged_df.mm.isin([3, 4, 5])].copy()
Sum = merged_df[merged_df.mm.isin([6, 7, 8])].copy()
Fal = merged_df[merged_df.mm.isin([9, 10, 11])].copy()
Win = merged_df[merged_df.mm.isin([12, 1, 2])].copy()

<br>

### 3.Calculate each Season's statistics

### Spring:
#### RMSE:
1. Calculate $(mod-obs)^2$
2. Group by hour and sum up $(mod-obs)^2$
3. Divide the sum by n and take square root

In [5]:
# 1. Squared difference between observation and model, one new column per variable
Spr["temp_dif"] = Spr["obstemp"].sub(Spr["modtemp"]).pow(2)
Spr["rh_dif"] = Spr["obsrh"].sub(Spr["modrh"]).pow(2)
Spr["wsp_dif"] = Spr["obswsp"].sub(Spr["modwsp"]).pow(2)

In [6]:
# 2. Sum the squared differences within each hour
Spr_sum_temp, Spr_sum_rh, Spr_sum_wsp = (
    Spr.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [7]:
# 3. Divide the sums by n and take square root, calculate rmse and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Spr
count = Spr.groupby("hour").count()

# First, write a function to calculate rmse and return rmse
def rmse(data,count):
    """Turn per-hour sums of squared errors into a per-hour RMSE table.

    Parameters
    ----------
    data : pandas.Series or sequence
        Sum of squared differences for each hour.
    count : pandas.Series or sequence
        Number of samples behind each sum, in the same order as ``data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``rmse`` (``sqrt(data / count)``) and ``hour``, one row per
        entry of ``data``. ``hour`` is taken from the index of ``data`` when it
        is a Series, otherwise it is ``0 .. len(data) - 1``.
    """
    sums = np.asarray(data, dtype=float)
    counts = np.asarray(count, dtype=float)
    values = np.sqrt(sums / counts)

    # Label the rows with the hours actually present instead of assuming
    # exactly 72 of them.
    if isinstance(data, pd.Series):
        hours = data.index.to_numpy()
    else:
        hours = range(len(values))

    result = pd.DataFrame({'rmse': values})
    result['hour'] = hours
    return result

# Now we calculate rmse for each variable.
# n is the number of valid squared differences per hour (rows where both the
# model and the observed value are present), which is what the NaN-skipping
# sums added up; counting only the observed column would overcount rows whose
# model value is missing.
Spr_rmse_temp = rmse(Spr_sum_temp,count["temp_dif"])
Spr_rmse_rh = rmse(Spr_sum_rh,count["rh_dif"])
Spr_rmse_wsp = rmse(Spr_sum_wsp,count["wsp_dif"])

#### Write a function for plotting

In [8]:
def plotrmse(dataframe,yname,plotname,subtitle):
    """Draw a line chart of one statistic against hour, with vertical markers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``hour`` column and the column named by ``yname``.
    yname : str
        Column plotted on the y axis (converted with ``str``).
    plotname : str
        Chart title.
    subtitle : str
        Chart subtitle (converted with ``str``).

    Returns
    -------
    The line chart layered with vertical rules at x = 24, 48 and 72.
    """
    # Vertical rules at x = 24, 48 and 72
    marker_frame = pd.DataFrame({'Component': [24,48,72]})
    rule_layer = alt.Chart(marker_frame).mark_rule(color='#D2386C').encode(x='Component')

    # Title block shared by every plot
    title_spec = {
        "text": [plotname],
        "subtitle": [str(subtitle)],
        "color": "green"
    }

    # The statistic itself as a line over hour
    line_layer = alt.Chart(dataframe).mark_line().encode(
        x = 'hour:Q',
        y = str(yname),
        color=alt.value("#FFAA00")
    ).properties(
        width = 800,
        height = 300,
        title=title_spec
    )
    return line_layer + rule_layer

In [9]:
plotrmse(Spr_rmse_temp, 'rmse', "RMSE plot of Temperature in Spring", f"sample size:{Spr['temp_dif'].count()}")

In [10]:
plotrmse(Spr_rmse_rh, 'rmse', "RMSE plot of Relative Humidity in Spring", f"sample size:{Spr['rh_dif'].count()}")

In [11]:
plotrmse(Spr_rmse_wsp, 'rmse', "RMSE plot of Wind Speed in Spring", f"sample size:{Spr['wsp_dif'].count()}")

<br>

#### MB:
1. Calculate $(mod-obs)$
2. Group by hour and sum up $(mod-obs)$
3. Divide the sum by n

In [12]:
# 1. Signed difference (model minus observation), one column per variable
Spr["temp_dif"] = Spr["modtemp"].sub(Spr["obstemp"])
Spr["rh_dif"] = Spr["modrh"].sub(Spr["obsrh"])
Spr["wsp_dif"] = Spr["modwsp"].sub(Spr["obswsp"])

In [13]:
# 2. Sum the differences within each hour
Spr_sum_temp, Spr_sum_rh, Spr_sum_wsp = (
    Spr.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [14]:
# 3. Divide the sums by n to get the mean bias and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Spr
count = Spr.groupby("hour").count()

# First, write a function to calculate the mean bias and return it
def mb(data,count):
    """Turn per-hour sums of differences into a per-hour mean-bias table.

    Parameters
    ----------
    data : pandas.Series or sequence
        Sum of differences for each hour.
    count : pandas.Series or sequence
        Number of samples behind each sum, in the same order as ``data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``mb`` (``data / count``) and ``hour``, one row per entry of
        ``data``. ``hour`` is taken from the index of ``data`` when it is a
        Series, otherwise it is ``0 .. len(data) - 1``.
    """
    sums = np.asarray(data, dtype=float)
    counts = np.asarray(count, dtype=float)
    values = sums / counts

    # Label the rows with the hours actually present instead of assuming
    # exactly 72 of them.
    if isinstance(data, pd.Series):
        hours = data.index.to_numpy()
    else:
        hours = range(len(values))

    result = pd.DataFrame({'mb': values})
    result['hour'] = hours
    return result

# Now we calculate the mean bias for each variable.
# n is the number of valid differences per hour (rows where both the model and
# the observed value are present), which is what the NaN-skipping sums added
# up; counting only the observed column would overcount rows whose model value
# is missing.
Spr_mb_temp = mb(Spr_sum_temp,count["temp_dif"])
Spr_mb_rh = mb(Spr_sum_rh,count["rh_dif"])
Spr_mb_wsp = mb(Spr_sum_wsp,count["wsp_dif"])

#### Plotting

In [15]:
plotrmse(Spr_mb_temp, 'mb', "Mean Bias plot of Temperature in Spring", f"sample size:{Spr['temp_dif'].count()}")

In [16]:
plotrmse(Spr_mb_rh, 'mb', "Mean Bias plot of Relative Humidity in Spring", f"sample size:{Spr['rh_dif'].count()}")

In [17]:
plotrmse(Spr_mb_wsp, 'mb', "Mean Bias plot of Wind Speed in Spring", f"sample size:{Spr['wsp_dif'].count()}")

<br>

#### Correlation:
1. Create a matrix with:
$$\begin{bmatrix} [mod0h] & [mod1h] & \dots & [mod71h] \\ [obs0h] & [obs1h] & \dots & [obs71h] \end{bmatrix}$$
where [mod0h] contains a list of all values from 0h
* Note: If a list contains a missing (NaN) value, then the correlation will be NaN. Therefore, we need to remove each missing value from its list and also remove the corresponding obs/mod value from the paired list. We can write functions to do so:
2. Calculate correlation function between mod_nh and obs_nh

In [18]:
# 1. Define a function which creates, for each hour, a list with all values of that hour
def create_list(Season,var):
    """Collect every value of column ``var`` per hour.

    Parameters
    ----------
    Season : pandas.DataFrame
        Frame with an ``hour`` column and the column named by ``var``.
    var : str
        Name of the column to collect.

    Returns
    -------
    pandas.Series
        Indexed by hour; each element is a plain Python list of that hour's
        values. Missing values are kept in the lists as they are.
    """
    def _as_list(group):
        return group.tolist()

    values_by_hour = Season.groupby("hour")[var]
    return values_by_hour.apply(_as_list)

# Build the per-hour value lists for every model / observation column
modtemp_list, obstemp_list, modrh_list, obsrh_list, modwsp_list, obswsp_list = (
    create_list(Spr, column)
    for column in ("modtemp", "obstemp", "modrh", "obsrh", "modwsp", "obswsp")
)

In [19]:
# 2. The lists may contain NaN. Before removing them, record for every hour
# the positions where the model value or the observed value is missing.
def index(modlist,obslist):
    """Return, for each hour, the positions where either value is missing.

    Parameters
    ----------
    modlist, obslist : sequence of sequences
        Per-hour value lists; ``modlist[i]`` and ``obslist[i]`` are read at the
        same positions.

    Returns
    -------
    list of list of int
        ``result[i]`` holds every position ``j`` for which ``modlist[i][j]`` or
        ``obslist[i][j]`` is missing according to ``pd.isna``.
    """
    return [
        [
            j
            for j in range(len(modlist[i]))
            if pd.isna(modlist[i][j]) or pd.isna(obslist[i][j])
        ]
        for i in range(len(modlist))
    ]

In [20]:
# 3. Remove the values at the recorded positions from both lists
def removeNan(index,modlist,obslist):
    """Drop the flagged positions from the paired per-hour lists.

    Parameters
    ----------
    index : sequence of sequences of int
        ``index[i]`` lists the positions to drop for hour ``i``.
    modlist, obslist : sequence of sequences
        Per-hour value lists, read at the same positions.

    Returns
    -------
    tuple of (list of list, list of list)
        Copies of ``modlist`` and ``obslist`` without the flagged positions;
        the remaining values keep their order and stay paired.
    """
    update_mod = [[] for _ in range(len(modlist))]
    update_obs = [[] for _ in range(len(obslist))]

    for i in range(len(modlist)):
        # A set makes each membership test constant time; testing against the
        # list made the filter quadratic in the number of samples per hour.
        skip = set(index[i])
        for j in range(len(modlist[i])):
            if j not in skip:
                update_mod[i].append(modlist[i][j])
                update_obs[i].append(obslist[i][j])
    return update_mod,update_obs

In [21]:
# Drop the missing pairs from each model / observation list pair
(
    (up_modtemp_list, up_obstemp_list),
    (up_modrh_list, up_obsrh_list),
    (up_modwsp_list, up_obswsp_list),
) = (
    removeNan(index(mod_values, obs_values), mod_values, obs_values)
    for mod_values, obs_values in (
        (modtemp_list, obstemp_list),
        (modrh_list, obsrh_list),
        (modwsp_list, obswsp_list),
    )
)

In [22]:
# 4. Create a function to calculate correlation and create a list
def correlation(modlist,obslist):
    """Correlation between model and observed values for each hour.

    Parameters
    ----------
    modlist, obslist : sequence of sequences
        Per-hour value lists with missing pairs already removed.

    Returns
    -------
    pandas.DataFrame
        Columns ``corr`` (the off-diagonal entry of ``np.corrcoef`` for that
        hour) and ``hour`` (``0 .. len(modlist) - 1``).
    """
    coefficients = [
        np.corrcoef(modlist[i], obslist[i])[0][1]
        for i in range(len(modlist))
    ]

    # Turn into dataframe; number the rows by how many hours were passed in
    # instead of assuming exactly 72.
    result = pd.DataFrame(coefficients, columns = ['corr'])
    result['hour'] = range(len(result))
    return result

In [23]:
# Hourly correlation for each variable
corr_temp, corr_rh, corr_wsp = (
    correlation(mod_values, obs_values)
    for mod_values, obs_values in (
        (up_modtemp_list, up_obstemp_list),
        (up_modrh_list, up_obsrh_list),
        (up_modwsp_list, up_obswsp_list),
    )
)

In [24]:
# 5. Create a function to calculate the sample size
def size(Alist):
    """Return the total number of elements across all sub-lists of ``Alist``."""
    return sum(map(len, Alist))

#### Plotting

In [25]:
plotrmse(corr_temp, 'corr', "Correlation plot of Temperature in Spring", f"sample size:{size(up_obstemp_list)}")

In [26]:
plotrmse(corr_rh, 'corr', "Correlation plot of Relative Humidity in Spring", f"sample size:{size(up_obsrh_list)}")

In [27]:
plotrmse(corr_wsp, 'corr', "Correlation plot of Wind Speed in Spring", f"sample size:{size(up_obswsp_list)}")

<br>

### Summer:
#### RMSE:

In [28]:
# 1. Squared difference between observation and model, one new column per variable
Sum["temp_dif"] = Sum["obstemp"].sub(Sum["modtemp"]).pow(2)
Sum["rh_dif"] = Sum["obsrh"].sub(Sum["modrh"]).pow(2)
Sum["wsp_dif"] = Sum["obswsp"].sub(Sum["modwsp"]).pow(2)

In [29]:
# 2. Sum the squared differences within each hour
Sum_sum_temp, Sum_sum_rh, Sum_sum_wsp = (
    Sum.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [30]:
# 3. Divide the sums by n and take square root, calculate rmse and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Sum
count = Sum.groupby("hour").count()

# Now we calculate rmse for each variable.
# n is the number of valid squared differences per hour (rows where both the
# model and the observed value are present), which is what the NaN-skipping
# sums added up; counting only the observed column would overcount rows whose
# model value is missing.
Sum_rmse_temp = rmse(Sum_sum_temp,count["temp_dif"])
Sum_rmse_rh = rmse(Sum_sum_rh,count["rh_dif"])
Sum_rmse_wsp = rmse(Sum_sum_wsp,count["wsp_dif"])

In [31]:
plotrmse(Sum_rmse_temp, 'rmse', "RMSE plot of Temperature in Summer", f"sample size:{Sum['temp_dif'].count()}")

In [32]:
plotrmse(Sum_rmse_rh, 'rmse', "RMSE plot of Relative Humidity in Summer", f"sample size:{Sum['rh_dif'].count()}")

In [33]:
plotrmse(Sum_rmse_wsp, 'rmse', "RMSE plot of Wind Speed in Summer", f"sample size:{Sum['wsp_dif'].count()}")

<br>

#### MB:

In [34]:
# 1. Signed difference (model minus observation), one column per variable
Sum["temp_dif"] = Sum["modtemp"].sub(Sum["obstemp"])
Sum["rh_dif"] = Sum["modrh"].sub(Sum["obsrh"])
Sum["wsp_dif"] = Sum["modwsp"].sub(Sum["obswsp"])

In [35]:
# 2. Sum the differences within each hour
Sum_sum_temp, Sum_sum_rh, Sum_sum_wsp = (
    Sum.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [36]:
# 3. Divide the sums by n to get the mean bias and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Sum
count = Sum.groupby("hour").count()

# Now we calculate the mean bias for each variable.
# n is the number of valid differences per hour (rows where both the model and
# the observed value are present), which is what the NaN-skipping sums added
# up; counting only the observed column would overcount rows whose model value
# is missing.
Sum_mb_temp = mb(Sum_sum_temp,count["temp_dif"])
Sum_mb_rh = mb(Sum_sum_rh,count["rh_dif"])
Sum_mb_wsp = mb(Sum_sum_wsp,count["wsp_dif"])

In [37]:
plotrmse(Sum_mb_temp, 'mb', "Mean Bias plot of Temperature in Summer", f"sample size:{Sum['temp_dif'].count()}")

In [38]:
plotrmse(Sum_mb_rh, 'mb', "Mean Bias plot of Relative Humidity in Summer", f"sample size:{Sum['rh_dif'].count()}")

In [39]:
plotrmse(Sum_mb_wsp, 'mb', "Mean Bias plot of Wind Speed in Summer", f"sample size:{Sum['wsp_dif'].count()}")

<br>

#### Correlation:

In [40]:
# 1. Build the per-hour value lists for every model / observation column
modtemp_list, obstemp_list, modrh_list, obsrh_list, modwsp_list, obswsp_list = (
    create_list(Sum, column)
    for column in ("modtemp", "obstemp", "modrh", "obsrh", "modwsp", "obswsp")
)

In [41]:
# Drop the missing pairs from each model / observation list pair
(
    (up_modtemp_list, up_obstemp_list),
    (up_modrh_list, up_obsrh_list),
    (up_modwsp_list, up_obswsp_list),
) = (
    removeNan(index(mod_values, obs_values), mod_values, obs_values)
    for mod_values, obs_values in (
        (modtemp_list, obstemp_list),
        (modrh_list, obsrh_list),
        (modwsp_list, obswsp_list),
    )
)

In [42]:
# Hourly correlation for each variable
corr_temp, corr_rh, corr_wsp = (
    correlation(mod_values, obs_values)
    for mod_values, obs_values in (
        (up_modtemp_list, up_obstemp_list),
        (up_modrh_list, up_obsrh_list),
        (up_modwsp_list, up_obswsp_list),
    )
)

In [43]:
plotrmse(corr_temp, 'corr', "Correlation plot of Temperature in Summer", f"sample size:{size(up_obstemp_list)}")

In [44]:
plotrmse(corr_rh, 'corr', "Correlation plot of Relative Humidity in Summer", f"sample size:{size(up_obsrh_list)}")

In [45]:
plotrmse(corr_wsp, 'corr', "Correlation plot of Wind Speed in Summer", f"sample size:{size(up_obswsp_list)}")

<br>

### Fall:
#### RMSE:

In [46]:
# 1. Squared difference between observation and model, one new column per variable
Fal["temp_dif"] = Fal["obstemp"].sub(Fal["modtemp"]).pow(2)
Fal["rh_dif"] = Fal["obsrh"].sub(Fal["modrh"]).pow(2)
Fal["wsp_dif"] = Fal["obswsp"].sub(Fal["modwsp"]).pow(2)

In [47]:
# 2. Sum the squared differences within each hour
Fal_sum_temp, Fal_sum_rh, Fal_sum_wsp = (
    Fal.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [48]:
# 3. Divide the sums by n and take square root, calculate rmse and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Fal
count = Fal.groupby("hour").count()

# Now we calculate rmse for each variable.
# n is the number of valid squared differences per hour (rows where both the
# model and the observed value are present), which is what the NaN-skipping
# sums added up; counting only the observed column would overcount rows whose
# model value is missing.
Fal_rmse_temp = rmse(Fal_sum_temp,count["temp_dif"])
Fal_rmse_rh = rmse(Fal_sum_rh,count["rh_dif"])
Fal_rmse_wsp = rmse(Fal_sum_wsp,count["wsp_dif"])

In [49]:
plotrmse(Fal_rmse_temp, 'rmse', "RMSE plot of Temperature in Fall", f"sample size:{Fal['temp_dif'].count()}")

In [50]:
plotrmse(Fal_rmse_rh, 'rmse', "RMSE plot of Relative Humidity in Fall", f"sample size:{Fal['rh_dif'].count()}")

In [51]:
plotrmse(Fal_rmse_wsp, 'rmse', "RMSE plot of Wind Speed in Fall", f"sample size:{Fal['wsp_dif'].count()}")

<br>

#### MB:

In [52]:
# 1. Signed difference (model minus observation), one column per variable
Fal["temp_dif"] = Fal["modtemp"].sub(Fal["obstemp"])
Fal["rh_dif"] = Fal["modrh"].sub(Fal["obsrh"])
Fal["wsp_dif"] = Fal["modwsp"].sub(Fal["obswsp"])

In [53]:
# 2. Sum the differences within each hour
Fal_sum_temp, Fal_sum_rh, Fal_sum_wsp = (
    Fal.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [54]:
# 3. Divide the sums by n to get the mean bias and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Fal
count = Fal.groupby("hour").count()

# Now we calculate the mean bias for each variable.
# n is the number of valid differences per hour (rows where both the model and
# the observed value are present), which is what the NaN-skipping sums added
# up; counting only the observed column would overcount rows whose model value
# is missing.
Fal_mb_temp = mb(Fal_sum_temp,count["temp_dif"])
Fal_mb_rh = mb(Fal_sum_rh,count["rh_dif"])
Fal_mb_wsp = mb(Fal_sum_wsp,count["wsp_dif"])

In [55]:
plotrmse(Fal_mb_temp, 'mb', "Mean Bias plot of Temperature in Fall", f"sample size:{Fal['temp_dif'].count()}")

In [56]:
plotrmse(Fal_mb_rh, 'mb', "Mean Bias plot of Relative Humidity in Fall", f"sample size:{Fal['rh_dif'].count()}")

In [57]:
plotrmse(Fal_mb_wsp, 'mb', "Mean Bias plot of Wind Speed in Fall", f"sample size:{Fal['wsp_dif'].count()}")

<br>

#### Correlation:

In [58]:
# 1. Build the per-hour value lists for every model / observation column
modtemp_list, obstemp_list, modrh_list, obsrh_list, modwsp_list, obswsp_list = (
    create_list(Fal, column)
    for column in ("modtemp", "obstemp", "modrh", "obsrh", "modwsp", "obswsp")
)

In [59]:
# Drop the missing pairs from each model / observation list pair
(
    (up_modtemp_list, up_obstemp_list),
    (up_modrh_list, up_obsrh_list),
    (up_modwsp_list, up_obswsp_list),
) = (
    removeNan(index(mod_values, obs_values), mod_values, obs_values)
    for mod_values, obs_values in (
        (modtemp_list, obstemp_list),
        (modrh_list, obsrh_list),
        (modwsp_list, obswsp_list),
    )
)

In [60]:
# Hourly correlation for each variable
corr_temp, corr_rh, corr_wsp = (
    correlation(mod_values, obs_values)
    for mod_values, obs_values in (
        (up_modtemp_list, up_obstemp_list),
        (up_modrh_list, up_obsrh_list),
        (up_modwsp_list, up_obswsp_list),
    )
)

In [61]:
plotrmse(corr_temp, 'corr', "Correlation plot of Temperature in Fall", f"sample size:{size(up_obstemp_list)}")

In [62]:
plotrmse(corr_rh, 'corr', "Correlation plot of Relative Humidity in Fall", f"sample size:{size(up_obsrh_list)}")

In [63]:
plotrmse(corr_wsp, 'corr', "Correlation plot of Wind Speed in Fall", f"sample size:{size(up_obswsp_list)}")

<br>

### Winter:
#### RMSE:

In [64]:
# 1. Squared difference between observation and model, one new column per variable
Win["temp_dif"] = Win["obstemp"].sub(Win["modtemp"]).pow(2)
Win["rh_dif"] = Win["obsrh"].sub(Win["modrh"]).pow(2)
Win["wsp_dif"] = Win["obswsp"].sub(Win["modwsp"]).pow(2)

In [65]:
# 2. Sum the squared differences within each hour
Win_sum_temp, Win_sum_rh, Win_sum_wsp = (
    Win.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [66]:
# 3. Divide the sums by n and take square root, calculate rmse and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Win
count = Win.groupby("hour").count()

# Now we calculate rmse for each variable.
# n is the number of valid squared differences per hour (rows where both the
# model and the observed value are present), which is what the NaN-skipping
# sums added up; counting only the observed column would overcount rows whose
# model value is missing.
Win_rmse_temp = rmse(Win_sum_temp,count["temp_dif"])
Win_rmse_rh = rmse(Win_sum_rh,count["rh_dif"])
Win_rmse_wsp = rmse(Win_sum_wsp,count["wsp_dif"])

In [67]:
plotrmse(Win_rmse_temp, 'rmse', "RMSE plot of Temperature in Winter", f"sample size:{Win['temp_dif'].count()}")

In [68]:
plotrmse(Win_rmse_rh, 'rmse', "RMSE plot of Relative Humidity in Winter", f"sample size:{Win['rh_dif'].count()}")

In [69]:
plotrmse(Win_rmse_wsp, 'rmse', "RMSE plot of Wind Speed in Winter", f"sample size:{Win['wsp_dif'].count()}")

<br>

#### MB:

In [70]:
# 1. Signed difference (model minus observation), one column per variable
Win["temp_dif"] = Win["modtemp"].sub(Win["obstemp"])
Win["rh_dif"] = Win["modrh"].sub(Win["obsrh"])
Win["wsp_dif"] = Win["modwsp"].sub(Win["obswsp"])

In [71]:
# 2. Sum the differences within each hour
Win_sum_temp, Win_sum_rh, Win_sum_wsp = (
    Win.groupby("hour")[column].sum()
    for column in ("temp_dif", "rh_dif", "wsp_dif")
)

In [72]:
# 3. Divide the sums by n to get the mean bias and make it a dataframe
# `count` holds, for each hour, the number of non-null entries in every column of Win
count = Win.groupby("hour").count()

# Now we calculate the mean bias for each variable.
# n is the number of valid differences per hour (rows where both the model and
# the observed value are present), which is what the NaN-skipping sums added
# up; counting only the observed column would overcount rows whose model value
# is missing.
Win_mb_temp = mb(Win_sum_temp,count["temp_dif"])
Win_mb_rh = mb(Win_sum_rh,count["rh_dif"])
Win_mb_wsp = mb(Win_sum_wsp,count["wsp_dif"])

In [73]:
plotrmse(Win_mb_temp, 'mb', "Mean Bias plot of Temperature in Winter", f"sample size:{Win['temp_dif'].count()}")

In [74]:
plotrmse(Win_mb_rh, 'mb', "Mean Bias plot of Relative Humidity in Winter", f"sample size:{Win['rh_dif'].count()}")

In [75]:
plotrmse(Win_mb_wsp, 'mb', "Mean Bias plot of Wind Speed in Winter", f"sample size:{Win['wsp_dif'].count()}")

<br>

#### Correlation

In [76]:
# 1. Build the per-hour value lists for every model / observation column
modtemp_list, obstemp_list, modrh_list, obsrh_list, modwsp_list, obswsp_list = (
    create_list(Win, column)
    for column in ("modtemp", "obstemp", "modrh", "obsrh", "modwsp", "obswsp")
)

In [77]:
# Drop the missing pairs from each model / observation list pair
(
    (up_modtemp_list, up_obstemp_list),
    (up_modrh_list, up_obsrh_list),
    (up_modwsp_list, up_obswsp_list),
) = (
    removeNan(index(mod_values, obs_values), mod_values, obs_values)
    for mod_values, obs_values in (
        (modtemp_list, obstemp_list),
        (modrh_list, obsrh_list),
        (modwsp_list, obswsp_list),
    )
)

In [78]:
# Hourly correlation for each variable
corr_temp, corr_rh, corr_wsp = (
    correlation(mod_values, obs_values)
    for mod_values, obs_values in (
        (up_modtemp_list, up_obstemp_list),
        (up_modrh_list, up_obsrh_list),
        (up_modwsp_list, up_obswsp_list),
    )
)

In [79]:
plotrmse(corr_temp, 'corr', "Correlation plot of Temperature in Winter", f"sample size:{size(up_obstemp_list)}")

In [80]:
plotrmse(corr_rh, 'corr', "Correlation plot of Relative Humidity in Winter", f"sample size:{size(up_obsrh_list)}")

In [81]:
plotrmse(corr_wsp, 'corr', "Correlation plot of Wind Speed in Winter", f"sample size:{size(up_obswsp_list)}")