## Santa Barbara Weather Forecast Model Evaluation
### UCSB Climate Variation and Changes Research Group
#### Advisor: Professor Charles Jones
#### Author: Pippa Lin

In [1]:
import numpy as np
import pandas as pd
import altair as alt
import math
import os
import warnings
# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
# Both calls request the same thing (ignore everything).
# NOTE(review): this also hides any warnings raised by the pandas operations in later cells — confirm that is intended.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

### Introduction:
This research project aims to interpret present and future climates in Santa Barbara using the Numerical Weather Forecast (NWF) model. The data were collected from https://clivac.eri.ucsb.edu/clivac/wrfreal/index.html. In this project, I performed a statistical analysis comparing NWF model output with weather-station observations by season, computing the root mean square error, mean bias, and correlation for each forecast hour to assess model performance.
1.Read the data:¶

### 1.Projection Function
This function takes a wind direction and a wind speed and decomposes the wind vector into two perpendicular components: zonal and meridional.

In [2]:
def projection(direction, speed):
    """Decompose one wind reading into zonal and meridional components.

    ``direction`` is a bearing in degrees. Every quadrant uses the same sign
    convention as the original first-quadrant branch:

        zonal      = -speed * sin(direction)
        meridional = -speed * cos(direction)

    (presumably the meteorological "direction the wind blows from"
    convention — confirm against the data description).

    Parameters
    ----------
    direction : float
        Wind direction in degrees, 0 <= direction <= 360. 360 is treated as 0.
    speed : float
        Wind speed.

    Returns
    -------
    tuple
        ``(zonal, meridional)``.
        ``(9999, 9999)`` when either input equals the 9999 sentinel;
        ``(nan, nan)`` when ``direction`` is NaN or outside [0, 360].
    """
    # The 9999 sentinel is passed through unchanged, as before.
    # NOTE(review): nothing in this function turns the sentinel into NaN, so
    # callers that average the result must filter 9999 themselves.
    if speed == 9999. or direction == 9999.:
        return 9999, 9999

    # NaN fails both comparisons, so it lands here together with
    # out-of-range bearings.
    if not (0 <= direction <= 360):
        return np.nan, np.nan

    # np.sin / np.cos work in radians; the bearing is in degrees, so convert.
    # The modulo folds 360 onto 0 (the original compared instead of assigning
    # and returned (0, 0) for a bearing of exactly 360).
    theta = np.radians(direction % 360)

    # The four per-quadrant formulas of the original are trigonometric
    # identities of this single expression, so one formula covers them all.
    zonal = -speed * np.sin(theta)
    meridional = -speed * np.cos(theta)

    return zonal, meridional



def projectionList(Listdirection, Listspeed):
    """Apply ``projection`` to paired directions and speeds.

    The two inputs are walked in parallel (stopping at the shorter one).
    Returns two lists: the zonal components and the meridional components,
    in input order.
    """
    components = [projection(d, s) for d, s in zip(Listdirection, Listspeed)]
    Listzonal = [pair[0] for pair in components]
    Listmeridional = [pair[1] for pair in components]
    return Listzonal, Listmeridional

### 2.Read the data:
Read and tidy the data

In [3]:
# NOTE(review): absolute, machine-specific location kept from the original —
# point it at your own copy of the data before running.
folder_path = '/Users/pippalin/Desktop/Climate Research/xskill-mtic1/'
# Sorted so the order in which files are read does not depend on the file system.
file_names = sorted(os.listdir(folder_path))

# Column labels applied to every file: date/time plus the mod* columns first,
# then date/time plus the obs* columns. The second minute column is 'min1';
# the original list used 'mm1' twice, which produced a duplicated label.
column_names = ['yyyy','mm', 'dd', 'hh', 'min','ss','modtemp','modrh','modwsp','modwd',
                'yyyy1','mm1', 'dd1', 'hh1', 'min1','ss1','obstemp','obsrh','obswsp','obswd']

# Initialize an empty list to store the data frames
dfs = []

# Loop through the list of file names and read each file into a Pandas DataFrame
for file_name in file_names:
    # Only .txt files are parsed (as comma-separated text)
    if file_name.endswith('.txt'):
        # Read the file into a Pandas DataFrame
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, skiprows = 7,nrows = 73) # since files with > 73 rows have nan values after row 73
        # Rename the columns. Plain assignment is used because
        # set_axis(..., inplace=True) is rejected by pandas >= 2.0.
        df.columns = column_names

        # First, move the obs part up by one row (overwriting its first row), then drop the last row:
        df.loc[0:len(df)-2, 'yyyy1':'obswd'] = df.loc[1:len(df)-1, 'yyyy1':'obswd'].values
        df = df.drop(df.index[-1])

        # Add an hour column: the row position within the file
        df["hour"] = range(len(df))

        dfs.append(df)

# Concatenate the data frames into a single data frame
merged_df = pd.concat(dfs).sort_values(by=['yyyy','mm']).reset_index(drop=True)

### 3.Create the Zonal and Meridional columns:

In [4]:
# Model wind: direction + speed -> zonal / meridional columns.
merged_df["modzonal"], merged_df["modmeridional"] = projectionList(
    Listdirection=merged_df["modwd"],
    Listspeed=merged_df["modwsp"],
)
# Observed wind: same decomposition.
merged_df["obszonal"], merged_df["obsmeridional"] = projectionList(
    Listdirection=merged_df["obswd"],
    Listspeed=merged_df["obswsp"],
)

### 4.Group by Season
* March - May: Spring
* June - August: Summer
* September - November: Fall
* December - February: Winter

In [5]:
# Split the merged frame into seasons by the value of the 'mm' column.
# .copy() gives each season its own DataFrame, so the columns assigned to it
# in later cells are not written into a slice of merged_df.
Spr = merged_df[merged_df["mm"].isin([3, 4, 5])].copy()
Sum = merged_df[merged_df["mm"].isin([6, 7, 8])].copy()
Fal = merged_df[merged_df["mm"].isin([9, 10, 11])].copy()
Win = merged_df[merged_df["mm"].isin([12, 1, 2])].copy()

<br>

### 5.Calculate each Season's statistics

### Spring:
#### RMSE:
1. Calculate $(mod-obs)^2$
2. Group by hour and sum up $(mod-obs)^2$
3. Divide the sum by n and take square root

In [6]:
# Step 1: squared observation-minus-model difference for each wind component.
Spr["zonal_dif"] = Spr["obszonal"].sub(Spr["modzonal"]).pow(2)
Spr["meridional_dif"] = Spr["obsmeridional"].sub(Spr["modmeridional"]).pow(2)

In [7]:
# Step 2: total of the squared differences within each forecast hour.
Spr_sum_zonal, Spr_sum_meridional = (
    Spr.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [9]:
# Step 3: per-hour non-null counts of every column; these are the n used for the RMSE below.
count = Spr.groupby(by="hour").count()

# Function that turns per-hour sums of squared differences into an RMSE table
def rmse(data,count):
    """Root-mean-square error per hour.

    Parameters
    ----------
    data : pandas.Series or sequence
        Sum of squared differences, one entry per hour.
    count : pandas.Series or sequence
        Sample count behind each sum, in the same order as ``data``.

    Returns
    -------
    pandas.DataFrame
        Column ``rmse`` holds ``sqrt(data / count)``; column ``hour`` holds
        the index of ``data`` when it is a Series, otherwise 0..len-1.
    """
    sums = np.asarray(data, dtype=float)
    sizes = np.asarray(count, dtype=float)
    result = pd.DataFrame({'rmse': np.sqrt(sums / sizes)})
    # Take the hours from the input instead of assuming exactly 72 of them.
    if isinstance(data, pd.Series):
        result['hour'] = list(data.index)
    else:
        result['hour'] = range(len(result))
    return result

# RMSE for each wind component. The meridional RMSE must be built from the
# meridional sums (the original passed Spr_sum_zonal here by mistake).
Spr_rmse_zonal = rmse(Spr_sum_zonal,count["obszonal"])
Spr_rmse_meridional = rmse(Spr_sum_meridional,count["obsmeridional"])

#### Write a plotting function

In [10]:
def plotrmse(dataframe,yname,plotname,subtitle):
    """Line chart of one statistic against ``hour`` with rule marks at 24, 48 and 72.

    dataframe : table with an ``hour`` column and the column named by ``yname``.
    yname     : name of the column drawn on the y axis.
    plotname  : chart title.
    subtitle  : chart subtitle (converted with ``str``).

    Returns the layered Altair chart (line + rules).
    """
    # Rule marks at x = 24, 48, 72
    day_marks = pd.DataFrame({'Component': [24,48,72]})
    rule = alt.Chart(day_marks).mark_rule(color='#D2386C').encode(x='Component')

    title_spec = {
        "text": [plotname],
        "subtitle": [str(subtitle)],
        "color": "green"
    }

    # The statistic itself, as a line over the hour axis
    line = (
        alt.Chart(dataframe)
        .mark_line()
        .encode(x='hour:Q', y=str(yname), color=alt.value("#FFAA00"))
        .properties(width=800, height=300, title=title_spec)
    )
    return line + rule

In [11]:
# Spring zonal RMSE by hour; the subtitle reports the non-null count of Spr["zonal_dif"].
plotrmse(
    dataframe=Spr_rmse_zonal,
    yname='rmse',
    plotname="RMSE plot of Zonal in Spring",
    subtitle="sample size:" + str(Spr["zonal_dif"].count()),
)

In [12]:
# Spring meridional RMSE by hour; the subtitle reports the non-null count of Spr["meridional_dif"].
plotrmse(
    dataframe=Spr_rmse_meridional,
    yname='rmse',
    plotname="RMSE plot of Meridional in Spring",
    subtitle="sample size:" + str(Spr["meridional_dif"].count()),
)

<br>

#### MB:
1. Calculate $(mod-obs)$
2. Group by hour and sum up $(mod-obs)$
3. Divide the sum by n

In [13]:
# Step 1: signed model-minus-observation difference (replaces the squared values stored for the RMSE).
Spr["zonal_dif"] = Spr["modzonal"].sub(Spr["obszonal"])
Spr["meridional_dif"] = Spr["modmeridional"].sub(Spr["obsmeridional"])

In [14]:
# Step 2: total of the signed differences within each forecast hour.
Spr_sum_zonal, Spr_sum_meridional = (
    Spr.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [15]:
# Step 3: per-hour non-null counts of every column; these are the n used for the mean bias below.
count = Spr.groupby(by="hour").count()

# Function that turns per-hour sums of differences into a mean-bias table
def mb(data,count):
    """Mean bias per hour.

    Parameters
    ----------
    data : pandas.Series or sequence
        Sum of signed differences, one entry per hour.
    count : pandas.Series or sequence
        Sample count behind each sum, in the same order as ``data``.

    Returns
    -------
    pandas.DataFrame
        Column ``mb`` holds ``data / count``; column ``hour`` holds the index
        of ``data`` when it is a Series, otherwise 0..len-1.
    """
    sums = np.asarray(data, dtype=float)
    sizes = np.asarray(count, dtype=float)
    result = pd.DataFrame({'mb': sums / sizes})
    # Take the hours from the input instead of assuming exactly 72 of them.
    if isinstance(data, pd.Series):
        result['hour'] = list(data.index)
    else:
        result['hour'] = range(len(result))
    return result

# Mean bias for each wind component
Spr_mb_zonal = mb(data=Spr_sum_zonal, count=count["obszonal"])
Spr_mb_meridional = mb(data=Spr_sum_meridional, count=count["obsmeridional"])

#### Plotting

In [16]:
# Spring zonal mean bias by hour; the subtitle reports the non-null count of Spr["zonal_dif"].
plotrmse(
    dataframe=Spr_mb_zonal,
    yname='mb',
    plotname="Mean Bias plot of Zonal in Spring",
    subtitle="sample size:" + str(Spr["zonal_dif"].count()),
)

In [17]:
# Spring meridional mean bias by hour; the subtitle reports the non-null count of Spr["meridional_dif"].
plotrmse(
    dataframe=Spr_mb_meridional,
    yname='mb',
    plotname="Mean Bias plot of Meridional in Spring",
    subtitle="sample size:" + str(Spr["meridional_dif"].count()),
)

<br>

#### Correlation:
1. Create a matrix with:
$$\begin{bmatrix} [mod0h] & [mod1h] & \dots & [mod71h] \\ [obs0h] & [obs1h] & \dots & [obs71h] \end{bmatrix}$$
where [mod0h] contains a list of all values from 0h
* Note: If a list contains a missing value (NaN), the correlation will be NaN. Therefore, we need to remove each missing value together with the corresponding obs/mod value at the same position. We can write a function to do so:
2. Calculate correlation function between mod_nh and obs_nh

In [18]:
# 1. Function that collects, for every hour, all values of one column into a list
def create_list(Season,var):
    """Return a Series indexed by ``hour`` whose entries are the lists of
    ``var`` values found in ``Season`` for that hour.

    Missing values are kept in the lists; they are filtered in a later step.
    """
    per_hour = Season.groupby("hour")[var]
    return per_hour.apply(lambda values: values.tolist())

# Per-hour value lists for Spring: model and observed, zonal and meridional
modzonal_list, obszonal_list, modmeridional_list, obsmeridional_list = (
    create_list(Spr, column)
    for column in ("modzonal", "obszonal", "modmeridional", "obsmeridional")
)

In [19]:
# 2. The lists can contain NaN. Before removing them, record for every hour
# the positions at which either the model or the observed value is missing.
def index(modlist,obslist):
    """Return one list per entry of ``modlist`` holding the positions ``j``
    where ``modlist[i][j]`` or ``obslist[i][j]`` is missing (``pd.isna``)."""
    return [
        [
            j
            for j in range(len(modlist[i]))
            if pd.isna(modlist[i][j]) or pd.isna(obslist[i][j])
        ]
        for i in range(len(modlist))
    ]

In [20]:
# 3. Drop the recorded positions from both the model and the observed lists
def removeNan(index,modlist,obslist):
    """Remove the positions listed in ``index`` from paired value lists.

    Parameters
    ----------
    index : sequence of sequences of int
        ``index[i]`` holds the positions to drop from entry ``i``.
    modlist, obslist : sequences of sequences
        Paired values; entry ``i`` of each is filtered with ``index[i]``.

    Returns
    -------
    (update_mod, update_obs) : tuple of lists of lists
        The inputs with the listed positions removed, order preserved.
    """
    update_mod = [[] for _ in range(len(modlist))]
    update_obs = [[] for _ in range(len(obslist))]

    for i in range(len(modlist)):
        # A set makes each membership test constant-time; scanning the
        # position list for every element was quadratic.
        skip = set(index[i])
        for j in range(len(modlist[i])):
            if j not in skip:
                update_mod[i].append(modlist[i][j])
                update_obs[i].append(obslist[i][j])
    return update_mod,update_obs

In [21]:
# Spring: drop every position where either the model or the observed value is missing
up_modzonal_list, up_obszonal_list = removeNan(
    index=index(modzonal_list, obszonal_list),
    modlist=modzonal_list,
    obslist=obszonal_list,
)
up_modmeridional_list, up_obsmeridional_list = removeNan(
    index=index(modmeridional_list, obsmeridional_list),
    modlist=modmeridional_list,
    obslist=obsmeridional_list,
)

In [22]:
# 4. Function that computes the per-hour correlation and returns it as a table
def correlation(modlist,obslist):
    """Correlation coefficient between paired model and observed values.

    Parameters
    ----------
    modlist, obslist : sequences of sequences
        Entry ``i`` of each holds the paired samples for hour ``i``.

    Returns
    -------
    pandas.DataFrame
        Column ``corr`` holds the off-diagonal element of
        ``np.corrcoef(modlist[i], obslist[i])``, or NaN when an hour has
        fewer than two samples; column ``hour`` runs 0..len-1.
    """
    coefficients = []
    for i in range(len(modlist)):
        mod_values, obs_values = modlist[i], obslist[i]
        if len(mod_values) < 2:
            # A correlation needs at least two pairs.
            coefficients.append(np.nan)
        else:
            coefficients.append(np.corrcoef(mod_values, obs_values)[0][1])

    # Turn into dataframe; the hour axis follows the input length instead of
    # assuming exactly 72 hours.
    result = pd.DataFrame(coefficients, columns = ['corr'])
    result['hour'] = range(len(result))
    return result

In [23]:
# Spring per-hour correlation between model and observation, per wind component
corr_zonal = correlation(modlist=up_modzonal_list, obslist=up_obszonal_list)
corr_meridional = correlation(modlist=up_modmeridional_list, obslist=up_obsmeridional_list)

In [24]:
# 5. Function that returns the sample size: the total number of items across all inner lists
def size(Alist):
    """Add up the lengths of every element of ``Alist``."""
    total = 0
    for element in Alist:
        total += len(element)
    return total

#### Plotting

In [25]:
# Spring zonal correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_zonal,
    yname='corr',
    plotname="Correlation plot of Zonal in Spring",
    subtitle="sample size:" + str(size(up_obszonal_list)),
)

In [26]:
# Spring meridional correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_meridional,
    yname='corr',
    plotname="Correlation plot of Meridional in Spring",
    subtitle="sample size:" + str(size(up_obsmeridional_list)),
)

<br>

### Summer:
#### RMSE:

In [52]:
# Step 1: squared observation-minus-model difference for each wind component.
Sum["zonal_dif"] = Sum["obszonal"].sub(Sum["modzonal"]).pow(2)
Sum["meridional_dif"] = Sum["obsmeridional"].sub(Sum["modmeridional"]).pow(2)

In [53]:
# Step 2: total of the squared differences within each forecast hour.
Sum_sum_zonal, Sum_sum_meridional = (
    Sum.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [54]:
# Step 3: per-hour non-null counts, then RMSE = sqrt(sum / n) returned as a DataFrame.
count = Sum.groupby(by="hour").count()

# RMSE for each wind component
Sum_rmse_zonal = rmse(data=Sum_sum_zonal, count=count["obszonal"])
Sum_rmse_meridional = rmse(data=Sum_sum_meridional, count=count["obsmeridional"])

In [55]:
# Summer zonal RMSE by hour; the subtitle reports the non-null count of Sum["zonal_dif"].
plotrmse(
    dataframe=Sum_rmse_zonal,
    yname='rmse',
    plotname="RMSE plot of Zonal in Summer",
    subtitle="sample size:" + str(Sum["zonal_dif"].count()),
)

In [56]:
# Summer meridional RMSE by hour; the subtitle reports the non-null count of Sum["meridional_dif"].
plotrmse(
    dataframe=Sum_rmse_meridional,
    yname='rmse',
    plotname="RMSE plot of Meridional in Summer",
    subtitle="sample size:" + str(Sum["meridional_dif"].count()),
)

#### MB:

In [38]:
# Step 1: signed model-minus-observation difference (replaces the squared values stored for the RMSE).
Sum["zonal_dif"] = Sum["modzonal"].sub(Sum["obszonal"])
Sum["meridional_dif"] = Sum["modmeridional"].sub(Sum["obsmeridional"])

In [39]:
# Step 2: total of the signed differences within each forecast hour.
Sum_sum_zonal, Sum_sum_meridional = (
    Sum.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [62]:
# Step 3: per-hour non-null counts, then mean bias = sum / n returned as a DataFrame.
count = Sum.groupby(by="hour").count()

# Mean bias for each wind component
Sum_mb_zonal = mb(data=Sum_sum_zonal, count=count["obszonal"])
Sum_mb_meridional = mb(data=Sum_sum_meridional, count=count["obsmeridional"])

In [63]:
# Summer zonal mean bias by hour; the subtitle reports the non-null count of Sum["zonal_dif"].
plotrmse(
    dataframe=Sum_mb_zonal,
    yname='mb',
    plotname="Mean Bias plot of Zonal in Summer",
    subtitle="sample size:" + str(Sum["zonal_dif"].count()),
)

In [64]:
# Summer meridional mean bias by hour; the subtitle reports the non-null count of Sum["meridional_dif"].
plotrmse(
    dataframe=Sum_mb_meridional,
    yname='mb',
    plotname="Mean Bias plot of Meridional in Summer",
    subtitle="sample size:" + str(Sum["meridional_dif"].count()),
)

#### Correlation:

In [43]:
# Per-hour value lists for Summer: model and observed, zonal and meridional
modzonal_list, obszonal_list, modmeridional_list, obsmeridional_list = (
    create_list(Sum, column)
    for column in ("modzonal", "obszonal", "modmeridional", "obsmeridional")
)

In [44]:
# Summer: drop every position where either the model or the observed value is missing
up_modzonal_list, up_obszonal_list = removeNan(
    index=index(modzonal_list, obszonal_list),
    modlist=modzonal_list,
    obslist=obszonal_list,
)
up_modmeridional_list, up_obsmeridional_list = removeNan(
    index=index(modmeridional_list, obsmeridional_list),
    modlist=modmeridional_list,
    obslist=obsmeridional_list,
)

In [45]:
# Summer per-hour correlation between model and observation, per wind component
corr_zonal = correlation(modlist=up_modzonal_list, obslist=up_obszonal_list)
corr_meridional = correlation(modlist=up_modmeridional_list, obslist=up_obsmeridional_list)

In [46]:
# Summer zonal correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_zonal,
    yname='corr',
    plotname="Correlation plot of Zonal in Summer",
    subtitle="sample size:" + str(size(up_obszonal_list)),
)

In [47]:
# Summer meridional correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_meridional,
    yname='corr',
    plotname="Correlation plot of Meridional in Summer",
    subtitle="sample size:" + str(size(up_obsmeridional_list)),
)

<br>

### Fall:
#### RMSE:

In [48]:
# Step 1: squared observation-minus-model difference for each wind component.
Fal["zonal_dif"] = Fal["obszonal"].sub(Fal["modzonal"]).pow(2)
Fal["meridional_dif"] = Fal["obsmeridional"].sub(Fal["modmeridional"]).pow(2)

In [49]:
# Step 2: total of the squared differences within each forecast hour.
Fal_sum_zonal, Fal_sum_meridional = (
    Fal.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [57]:
# Step 3: per-hour non-null counts, then RMSE = sqrt(sum / n) returned as a DataFrame.
count = Fal.groupby(by="hour").count()

# RMSE for each wind component
Fal_rmse_zonal = rmse(data=Fal_sum_zonal, count=count["obszonal"])
Fal_rmse_meridional = rmse(data=Fal_sum_meridional, count=count["obsmeridional"])

In [58]:
# Fall zonal RMSE by hour; the subtitle reports the non-null count of Fal["zonal_dif"].
plotrmse(
    dataframe=Fal_rmse_zonal,
    yname='rmse',
    plotname="RMSE plot of Zonal in Fall",
    subtitle="sample size:" + str(Fal["zonal_dif"].count()),
)

In [59]:
# Fall meridional RMSE by hour; the subtitle reports the non-null count of Fal["meridional_dif"].
plotrmse(
    dataframe=Fal_rmse_meridional,
    yname='rmse',
    plotname="RMSE plot of Meridional in Fall",
    subtitle="sample size:" + str(Fal["meridional_dif"].count()),
)

#### MB:

In [60]:
# Step 1: signed model-minus-observation difference (replaces the squared values stored for the RMSE).
Fal["zonal_dif"] = Fal["modzonal"].sub(Fal["obszonal"])
Fal["meridional_dif"] = Fal["modmeridional"].sub(Fal["obsmeridional"])

In [61]:
# Step 2: total of the signed differences within each forecast hour.
Fal_sum_zonal, Fal_sum_meridional = (
    Fal.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [65]:
# Step 3: per-hour non-null counts, then mean bias = sum / n returned as a DataFrame.
count = Fal.groupby(by="hour").count()

# Mean bias for each wind component
Fal_mb_zonal = mb(data=Fal_sum_zonal, count=count["obszonal"])
Fal_mb_meridional = mb(data=Fal_sum_meridional, count=count["obsmeridional"])

In [66]:
# Fall zonal mean bias by hour; the subtitle reports the non-null count of Fal["zonal_dif"].
plotrmse(
    dataframe=Fal_mb_zonal,
    yname='mb',
    plotname="Mean Bias plot of Zonal in Fall",
    subtitle="sample size:" + str(Fal["zonal_dif"].count()),
)

In [68]:
# Fall meridional mean bias by hour; the subtitle reports the non-null count of Fal["meridional_dif"].
plotrmse(
    dataframe=Fal_mb_meridional,
    yname='mb',
    plotname="Mean Bias plot of Meridional in Fall",
    subtitle="sample size:" + str(Fal["meridional_dif"].count()),
)

#### Correlation:

In [69]:
# Per-hour value lists for Fall: model and observed, zonal and meridional
modzonal_list, obszonal_list, modmeridional_list, obsmeridional_list = (
    create_list(Fal, column)
    for column in ("modzonal", "obszonal", "modmeridional", "obsmeridional")
)

In [70]:
# Fall: drop every position where either the model or the observed value is missing
up_modzonal_list, up_obszonal_list = removeNan(
    index=index(modzonal_list, obszonal_list),
    modlist=modzonal_list,
    obslist=obszonal_list,
)
up_modmeridional_list, up_obsmeridional_list = removeNan(
    index=index(modmeridional_list, obsmeridional_list),
    modlist=modmeridional_list,
    obslist=obsmeridional_list,
)

In [71]:
# Fall per-hour correlation between model and observation, per wind component
corr_zonal = correlation(modlist=up_modzonal_list, obslist=up_obszonal_list)
corr_meridional = correlation(modlist=up_modmeridional_list, obslist=up_obsmeridional_list)

In [72]:
# Fall zonal correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_zonal,
    yname='corr',
    plotname="Correlation plot of Zonal in Fall",
    subtitle="sample size:" + str(size(up_obszonal_list)),
)

In [73]:
# Fall meridional correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_meridional,
    yname='corr',
    plotname="Correlation plot of Meridional in Fall",
    subtitle="sample size:" + str(size(up_obsmeridional_list)),
)

<br>

### Winter:
#### RMSE:

In [74]:
# Step 1: squared observation-minus-model difference for each wind component.
Win["zonal_dif"] = Win["obszonal"].sub(Win["modzonal"]).pow(2)
Win["meridional_dif"] = Win["obsmeridional"].sub(Win["modmeridional"]).pow(2)

In [75]:
# Step 2: total of the squared differences within each forecast hour.
Win_sum_zonal, Win_sum_meridional = (
    Win.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [76]:
# Step 3: per-hour non-null counts, then RMSE = sqrt(sum / n) returned as a DataFrame.
count = Win.groupby(by="hour").count()

# RMSE for each wind component
Win_rmse_zonal = rmse(data=Win_sum_zonal, count=count["obszonal"])
Win_rmse_meridional = rmse(data=Win_sum_meridional, count=count["obsmeridional"])

In [77]:
# Winter zonal RMSE by hour; the subtitle reports the non-null count of Win["zonal_dif"].
plotrmse(
    dataframe=Win_rmse_zonal,
    yname='rmse',
    plotname="RMSE plot of Zonal in Winter",
    subtitle="sample size:" + str(Win["zonal_dif"].count()),
)

In [78]:
# Winter meridional RMSE by hour; the subtitle reports the non-null count of Win["meridional_dif"].
plotrmse(
    dataframe=Win_rmse_meridional,
    yname='rmse',
    plotname="RMSE plot of Meridional in Winter",
    subtitle="sample size:" + str(Win["meridional_dif"].count()),
)

#### MB:

In [79]:
# Step 1: signed model-minus-observation difference (replaces the squared values stored for the RMSE).
Win["zonal_dif"] = Win["modzonal"].sub(Win["obszonal"])
Win["meridional_dif"] = Win["modmeridional"].sub(Win["obsmeridional"])

In [80]:
# Step 2: total of the signed differences within each forecast hour.
Win_sum_zonal, Win_sum_meridional = (
    Win.groupby("hour")[column].sum() for column in ("zonal_dif", "meridional_dif")
)

In [81]:
# 3. Divide the sums by n to get the mean bias, returned as a DataFrame.
# The counts must come from the Winter frame; the original took them from
# Fal, so the Winter sums were divided by the Fall sample sizes.
count = Win.groupby("hour").count()

# Mean bias for each wind component
Win_mb_zonal = mb(Win_sum_zonal,count["obszonal"])
Win_mb_meridional = mb(Win_sum_meridional,count["obsmeridional"])

In [83]:
# Winter zonal mean bias by hour; the subtitle reports the non-null count of Win["zonal_dif"].
plotrmse(
    dataframe=Win_mb_zonal,
    yname='mb',
    plotname="Mean Bias plot of Zonal in Winter",
    subtitle="sample size:" + str(Win["zonal_dif"].count()),
)

In [84]:
# Winter meridional mean bias by hour; the subtitle reports the non-null count of Win["meridional_dif"].
plotrmse(
    dataframe=Win_mb_meridional,
    yname='mb',
    plotname="Mean Bias plot of Meridional in Winter",
    subtitle="sample size:" + str(Win["meridional_dif"].count()),
)

#### Correlation:

In [85]:
# Per-hour value lists for Winter: model and observed, zonal and meridional
modzonal_list, obszonal_list, modmeridional_list, obsmeridional_list = (
    create_list(Win, column)
    for column in ("modzonal", "obszonal", "modmeridional", "obsmeridional")
)

In [86]:
# Winter: drop every position where either the model or the observed value is missing
up_modzonal_list, up_obszonal_list = removeNan(
    index=index(modzonal_list, obszonal_list),
    modlist=modzonal_list,
    obslist=obszonal_list,
)
up_modmeridional_list, up_obsmeridional_list = removeNan(
    index=index(modmeridional_list, obsmeridional_list),
    modlist=modmeridional_list,
    obslist=obsmeridional_list,
)

In [87]:
# Winter per-hour correlation between model and observation, per wind component
corr_zonal = correlation(modlist=up_modzonal_list, obslist=up_obszonal_list)
corr_meridional = correlation(modlist=up_modmeridional_list, obslist=up_obsmeridional_list)

In [89]:
# Winter zonal correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_zonal,
    yname='corr',
    plotname="Correlation plot of Zonal in Winter",
    subtitle="sample size:" + str(size(up_obszonal_list)),
)

In [90]:
# Winter meridional correlation by hour; the subtitle reports the number of retained observed values.
plotrmse(
    dataframe=corr_meridional,
    yname='corr',
    plotname="Correlation plot of Meridional in Winter",
    subtitle="sample size:" + str(size(up_obsmeridional_list)),
)