### Imports

In [33]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

### Feature Engineering

In [75]:
# Read the raw table and keep only the feature columns: remove the
# 'Unnamed: 0' column (presumably an index saved by an earlier export -- confirm)
# and the 'Hurricane activity' label column.
data = pd.read_csv('./all_data.csv').drop(
    columns=['Unnamed: 0', 'Hurricane activity']
)

In [76]:
# Count the missing (NaN) entries in each column
null_info = data.isna().sum()
null_info

NAO(J)        257
NAO            33
EA_WR          33
NP             30
NBRA          282
AMO(s)         69
TNA             9
PWR            21
SWMRR         144
BEST            0
PNA            33
TNI            11
SR             60
AMM             0
AMO             9
SF             45
Niño(3)       33
SOI            45
TSA             9
Niño(4)       33
QBO             9
PDO            12
GMOT            0
Niño(3.4)     33
EP_NP         105
AT            168
NOI           187
Niño(1+2)     33
WHWP            9
CIP           276
TPI(IPO)      144
Month           0
AO             33
WP             33
CAR            59
ONI            34
AAO           384
ENSO          382
MEI           381
GIAM          226
NTA            59
dtype: int64

In [77]:
# Collect the names of the columns with more than 100 missing entries
# and remove those columns from the dataframe
null_indices = null_info[null_info > 100].index.tolist()
data = data.drop(columns=null_indices)

In [78]:
# Discard every row that still has at least one NaN in the remaining columns
data = data.dropna(axis='index', how='any')
data

Unnamed: 0,NAO,EA_WR,NP,AMO(s),TNA,PWR,BEST,PNA,TNI,SR,...,GMOT,Niño(3.4),Niño(1+2),WHWP,Month,AO,WP,CAR,ONI,NTA
36,-0.42,-0.74,1008.32,0.090,0.06,-0.582,-1.13,-1.18,1.315,2.0,...,-0.44,-1.30,-0.46,-0.17,1,-0.085,-0.07,-0.46,-0.82,-0.08
37,0.35,-1.66,1011.16,0.089,-0.17,-0.715,-0.69,-2.11,1.461,2.0,...,-0.49,-1.04,-0.91,-0.30,2,-0.400,-0.58,-0.49,-0.54,-0.12
38,-1.47,0.11,1015.69,0.087,-0.07,-0.670,-0.24,-1.09,1.590,37.0,...,-0.19,-0.38,-0.75,-0.93,3,-1.934,-0.29,-0.50,-0.17,-0.08
39,-0.38,-2.31,1011.91,0.087,0.17,-0.665,0.33,0.47,1.457,-23.0,...,-0.09,-0.23,-0.25,-0.70,4,-0.776,-0.19,-0.42,0.18,0.03
40,-0.50,0.93,1012.74,0.086,0.14,-0.535,0.71,1.19,1.615,119.0,...,0.00,-0.01,0.17,-0.93,5,-0.863,0.70,-0.28,0.36,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,-0.31,-0.93,1009.24,0.171,0.46,0.376,-0.14,1.44,0.690,1.0,...,1.31,-0.76,0.12,3.65,11,-0.611,1.01,0.47,-0.67,0.31
827,0.35,1.52,1014.11,0.174,0.52,0.332,-0.39,-0.65,1.097,-3.0,...,1.25,-0.50,0.43,1.05,12,1.786,0.97,0.45,-0.56,0.41
828,0.05,0.63,1008.76,0.176,0.49,0.310,-0.26,-0.29,1.447,-4.0,...,1.26,-0.43,0.93,-0.03,1,0.942,0.55,0.46,-0.34,0.29
829,0.69,1.14,1008.15,0.178,0.23,0.203,0.11,-0.05,1.465,-7.0,...,1.49,-0.08,1.36,0.43,2,0.340,-0.15,0.45,-0.16,0.23


In [79]:
# Standardize the data column by column: subtract the column mean, then
# divide by the column standard deviation (pandas' default `std`).
data_stdz = data.sub(data.mean(), axis='columns').div(data.std(), axis='columns')
data_stdz

Unnamed: 0,NAO,EA_WR,NP,AMO(s),TNA,PWR,BEST,PNA,TNI,SR,...,GMOT,Niño(3.4),Niño(1+2),WHWP,Month,AO,WP,CAR,ONI,NTA
36,-0.307286,-0.701752,-0.944256,0.760245,-0.093620,-1.397317,-1.203014,-1.049330,1.223580,-0.223129,...,-1.913876,-1.289344,-0.199380,-0.315970,-1.585099,0.026364,-0.076867,-2.108720,-1.021985,0.018719
37,0.398496,-1.536418,-0.310529,0.753539,-0.723903,-1.833390,-0.763315,-1.952131,1.330148,-0.223129,...,-2.036920,-0.992859,-0.607741,-0.382280,-1.296006,-0.284963,-0.543770,-2.247310,-0.687853,-0.094447
38,-1.269716,0.069407,0.700311,0.740129,-0.449867,-1.685847,-0.313622,-0.961962,1.424308,0.009943,...,-1.298659,-0.240243,-0.462546,-0.703627,-1.006914,-1.801076,-0.278276,-2.293506,-0.246321,0.018719
39,-0.270622,-2.126128,-0.143171,0.740129,0.207819,-1.669453,0.255989,0.552414,1.327229,-0.389609,...,-1.052572,-0.069194,-0.008812,-0.586310,-0.717822,-0.656578,-0.186727,-1.923933,0.171344,0.329925
40,-0.380614,0.813349,0.042038,0.733423,0.125608,-1.243215,0.635729,1.251357,1.442556,0.555997,...,-0.831093,0.181678,0.372324,-0.703627,-0.428729,-0.742564,0.628064,-1.277179,0.386143,0.471382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,-0.206460,-0.874128,-0.738964,1.303379,1.002524,1.743726,-0.213690,1.494045,0.767382,-0.229788,...,2.392647,-0.673568,0.326951,1.632514,1.305825,-0.493502,0.911868,2.187574,-0.842985,1.122085
827,0.398496,1.348624,0.347745,1.323495,1.166945,1.599461,-0.463520,-0.534830,1.064458,-0.256425,...,2.244995,-0.377083,0.608266,0.306320,1.594917,1.875549,0.875248,2.095180,-0.711719,1.405000
828,0.123516,0.541175,-0.846073,1.336906,1.084734,1.527328,-0.333609,-0.185359,1.319929,-0.263084,...,2.269603,-0.297260,1.062000,-0.244560,-1.585099,1.041390,0.490740,2.141377,-0.449187,1.065502
829,0.710140,1.003870,-0.982191,1.350317,0.372241,1.176502,0.036139,0.047622,1.333068,-0.283062,...,2.835604,0.101855,1.452211,-0.009926,-1.296006,0.446409,-0.150107,2.095180,-0.234388,0.895754


### Identify Principal Components

In [80]:
# PCA with one component per feature column, so the complete
# explained-variance spectrum is available for inspection
pca = PCA(n_components=len(data.columns))
pca.fit(data_stdz)

PCA(n_components=29)

In [85]:
# Running total of the explained-variance ratio over the first k components
exp_var_cumul = pca.explained_variance_ratio_.cumsum()

# Area chart: x is the number of components (1-based), y the cumulative ratio
px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    title="Variance Preservation vs. No. of Components",
    labels={"x": "# Components", "y": "Explained Variance"},
)

We want to lose at most 5% of the total variance (i.e. retain at least 95% of the explained variance), so we will keep 17 principal components.

In [90]:
# Number of principal components to retain
N = 17

# Refit the PCA with only N components
pca = PCA(n_components=N)
pca.fit(data_stdz)

# Project the standardized data onto the N components, keeping the row index
columns = [f'pca_{i}' for i in range(N)]
df_pca = pd.DataFrame(
    data=pca.transform(data_stdz),
    index=data_stdz.index,
    columns=columns,
)

# save pca data as a csv file
df_pca.to_csv('./pca_data.csv')
df_pca

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,pca_10,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16
36,-3.692652,1.064720,2.481742,-0.156687,1.036374,0.367197,1.857029,-0.011777,-0.715238,1.292764,-0.093660,-1.877221,0.004954,-0.307592,0.776496,-0.575624,-0.212542
37,-3.779100,0.111916,1.934559,-0.227213,0.814001,-0.458003,1.922303,0.381578,-1.254013,1.435323,0.962213,-1.843495,0.244916,-0.194615,0.307966,0.233593,-0.405048
38,-2.463728,-0.283550,3.573048,-0.319261,0.903758,-1.179203,1.494241,-0.188505,-0.034745,0.243445,0.101107,-1.150936,-0.168759,0.252522,0.739950,0.248905,0.374873
39,-0.722161,-0.002633,2.716974,0.171904,0.780939,0.497939,0.964245,-0.461404,-1.007813,1.869143,1.417531,-1.113765,-0.191527,0.218385,-0.549524,0.611376,0.181510
40,0.349971,-0.550860,2.791703,-0.768294,0.885078,1.137841,0.367640,-0.556279,0.627394,0.566557,-0.690921,-0.223606,-0.268457,0.074157,-0.503765,1.048616,0.153946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,2.994224,3.815517,-0.950181,0.947620,1.478416,1.188994,-1.429024,-0.725886,0.709006,0.907728,0.825999,0.639658,-0.510919,1.736934,1.010234,0.692969,0.270107
827,2.209904,3.293926,-2.914603,-1.104317,1.642550,0.557622,-0.152599,-1.001228,1.654093,-0.905121,0.095753,-0.059870,-0.154884,0.534018,0.794698,1.492791,-0.527174
828,2.308623,2.565925,-2.290044,0.987892,1.747731,0.416762,1.496330,-0.037047,1.594517,0.820310,-1.003617,-0.443407,-0.148722,0.056004,0.586146,1.020319,-0.153829
829,2.689415,1.603789,-1.870723,0.747569,1.975692,1.015372,1.392730,0.773984,1.568676,0.486274,-0.886419,-0.899411,-0.190817,0.621907,1.296120,0.713241,-0.009335


This plot visualizes the first 5 principal components; its title reports the total explained variance of the 17 retained components.

In [93]:
# Visualize Principal components
# Draws a scatter matrix of the first N principal components, colored by the
# 'Hurricane activity' label, and reports in the title the total variance
# explained by the retained-components model `pca`.

# NOTE: this rebinds N, which an earlier cell uses for the number of retained
# components; re-run that cell before relying on N again.
N = 5
pca_view = PCA(n_components=N)
components = pca_view.fit_transform(data_stdz)

# Deliberately taken from `pca` (the retained-components model), not from
# `pca_view`, which is fitted only for plotting.
total_var = pca.explained_variance_ratio_.sum() * 100

# Re-read the label column and align it with the rows that survived cleaning
label = pd.read_csv('./all_data.csv')['Hurricane activity'].loc[df_pca.index]

# Axis labels "PC 1" .. "PC N" for the matrix dimensions, plus the legend title
labels = {str(i): f"PC {i+1}" for i in range(N)}
labels['color'] = 'Hurricane Activity'


fig = px.scatter_matrix(
    components,
    color=label,
    dimensions=range(N),
    labels=labels,
    # Read the component count from the fitted model instead of hard-coding it,
    # so the title cannot drift from the value `total_var` is computed for.
    title=(
        f'Total Explained Variance with {pca.n_components_} '
        f'Principal Components: {total_var:.2f}%'
    ),
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [106]:
# Persist the fitted PCA component matrix (`pca.components_`) to CSV.
# It will be used to transform time series data into input data.
df_components = pd.DataFrame(data=pca.components_)
df_components.to_csv(path_or_buf='./pca_components.csv')
df_components

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,-0.034396,-0.029014,-0.071335,0.144634,0.226251,0.211769,0.272676,0.112435,-0.10653,-0.00169,...,0.218787,0.300534,0.234564,0.264409,0.002052,-0.055507,0.013421,0.237067,0.274931,0.241088
1,-0.093045,-0.10716,0.043144,0.271011,0.293678,0.185072,-0.268693,0.000394,-0.024369,0.073243,...,0.190936,-0.23405,-0.172974,0.136094,0.004784,-0.027719,-0.066492,0.22174,-0.259532,0.285938
2,-0.369483,0.023819,-0.140112,-0.022469,0.154525,-0.344379,0.095808,0.089482,0.045172,0.199324,...,-0.358714,0.035005,-0.050452,-0.001566,1e-05,-0.448558,-0.057392,-0.053727,0.111468,0.050114
3,-0.278777,-0.159914,-0.441841,-0.130757,-0.040882,0.161324,-0.073205,0.203292,0.061257,-0.273368,...,0.125517,-0.052388,0.005924,-0.10753,-0.27851,-0.392074,-0.238842,-0.096004,-0.107927,0.031952
4,-0.053794,0.054284,0.052375,0.052499,-0.014085,-0.073908,-0.075136,0.04662,0.708605,0.081281,...,-0.020221,-0.029205,0.466543,0.237242,0.033973,-0.000924,0.02288,0.009414,-0.017259,-0.044151
5,0.438867,0.12939,-0.374916,-0.088762,0.054994,0.029339,-0.067561,0.439139,0.092092,-0.084174,...,0.048699,-0.058368,0.045297,-0.044382,0.102514,0.185412,-0.088915,-0.027446,-0.062417,0.067212
6,-0.020031,-0.024642,-0.016423,-0.00173,0.046326,-0.030857,0.016532,-0.509336,0.061228,-0.295858,...,-0.005994,0.057794,0.054874,-0.086679,-0.535561,0.117974,-0.123649,0.024123,0.037108,0.115401
7,0.057695,-0.032944,-0.177959,0.343729,-0.247227,-0.047997,0.005299,-0.003244,-0.053133,0.431204,...,0.044899,0.03043,-0.034282,0.059796,-0.107952,0.051767,-0.617726,0.219659,0.028995,-0.230282
8,-0.010527,0.545015,-0.088797,0.042518,0.032405,-0.05935,0.030862,0.172315,-0.181502,0.161,...,-0.001368,0.003374,-0.145126,-0.069935,-0.125626,0.013216,0.103433,-0.004022,0.009278,0.032789
9,0.180254,-0.571477,-0.085915,0.127089,-0.076668,-0.111429,0.036305,0.189824,0.026028,0.262591,...,-0.000991,-0.016583,-0.063776,-0.053122,-0.437836,0.079902,0.452998,0.039183,0.01434,-0.046458
