This notebook scrapes the Multivariate ENSO Index (MEI) table from the NOAA ESRL web page and reshapes it into a monthly time series.

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# HTML and display belong to the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Widen the notebook's page container to 90% of the browser width.
display(HTML("<style>.container {width:90% !important}</style>"))

In [2]:
# Download the MEI table page and parse it with the lenient html5lib parser.
url = 'https://www.esrl.noaa.gov/psd/enso/mei/table.html'
# A timeout keeps this cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
textsoup = bs(response.text, 'html5lib')

In [3]:
# A table line starts with the header keyword "YEAR" or with the prefix "19" or
# "20"; the single capture group spans the whole matched line.
table_re = re.compile(
    r'(^YEAR.+|^19.+|^20.+)'
)

In [4]:
# The page is not well-formed HTML: the table has no proper table tags, so it is
# recovered "manually", line by line, with `table_re`.
# The first matching line supplies the column names; every later match is a data row.
header = None
records = []
for sti in textsoup.stripped_strings:
    for s in sti.splitlines():
        matched = table_re.match(s)
        if matched is None:
            continue
        # A single tab, or a run of whitespace, becomes one field separator.
        fields = re.sub(r'\t|\s+', ',', matched.group(1)).split(',')
        if header is None:
            header = fields
        else:
            # zip() pairs fields with column names; a row with fewer fields than
            # columns leaves its trailing columns missing (NaN in the frame).
            records.append(dict(zip(header, fields)))

if header is None:
    raise ValueError('No MEI table header line was found in the downloaded page.')

# Build the frame once: growing it row by row is quadratic, and DataFrame.append
# was removed in pandas 2.0.
df = pd.DataFrame(records, columns=header)

In [23]:
# NOTE(review): presumably the newest row's last field is blank or otherwise
# unparseable on the source page, which would break the later float conversion
# — confirm. Replace it with 'NaN' only when it is not a valid number, so a
# genuine value is not overwritten once the final column holds real data.
if pd.isna(pd.to_numeric(df.iloc[-1, -1], errors='coerce')):
    df.iloc[-1, -1] = 'NaN'

In [24]:
# Inspect the most recent rows after patching the final cell.
df.tail()

Unnamed: 0,YEAR,DECJAN,JANFEB,FEBMAR,MARAPR,APRMAY,MAYJUN,JUNJUL,JULAUG,AUGSEP,SEPOCT,OCTNOV,NOVDEC
64,2014,-0.27,-0.259,0.018,0.295,1.001,1.046,0.915,0.937,0.557,0.421,0.754,0.566
65,2015,0.417,0.464,0.614,0.916,1.583,2.097,1.981,2.334,2.479,2.201,2.271,2.12
66,2016,2.216,2.17,1.963,2.094,1.752,1.053,0.352,0.167,-0.118,-0.385,-0.209,-0.11
67,2017,-0.052,-0.043,-0.08,0.744,1.445,1.039,0.456,0.009,-0.478,-0.568,-0.285,-0.576
68,2018,-0.623,-0.731,-0.502,-0.432,0.465,0.469,0.076,0.132,0.509,0.468,0.698,


In [30]:
# Convert every column after the first to float64, then put the unconverted
# YEAR column back at the front.
df2 = df.iloc[:, 1:].astype('float64')
df2.insert(loc=0, column='YEAR', value=df['YEAR'])

In [33]:
# Rebind `df` to the converted frame; `df` and `df2` now name the same object.
df = df2

Next, unpack the bimonthly format into a dataframe of monthly values, with one column per month and one row per year.

In [34]:
# Add DECJAN_SH: each row receives the DECJAN value of the row below it
# (the following year); the last row gets NaN.
df['DECJAN_SH'] = df['DECJAN'].shift(periods=-1)

In [35]:
# Preview: DECJAN_SH on each row should equal DECJAN on the next row.
df.head()

Unnamed: 0,YEAR,DECJAN,JANFEB,FEBMAR,MARAPR,APRMAY,MAYJUN,JUNJUL,JULAUG,AUGSEP,SEPOCT,OCTNOV,NOVDEC,DECJAN_SH
0,1950,-1.03,-1.133,-1.283,-1.071,-1.434,-1.412,-1.269,-1.042,-0.631,-0.441,-1.151,-1.235,-1.049
1,1951,-1.049,-1.152,-1.178,-0.511,-0.374,0.288,0.679,0.818,0.726,0.72,0.694,0.504,0.433
2,1952,0.433,0.138,0.071,0.224,-0.307,-0.756,-0.305,-0.374,0.31,0.265,-0.351,-0.098,0.044
3,1953,0.044,0.401,0.277,0.687,0.756,0.191,0.382,0.209,0.483,0.087,0.078,0.351,-0.036
4,1954,-0.036,-0.027,0.154,-0.616,-1.465,-1.558,-1.355,-1.456,-1.159,-1.335,-1.124,-1.088,-0.74


In [36]:
# Convert bimonthly values to monthly ones: each month is the mean of the two
# adjacent bimonthly columns that contain it (e.g. JAN from DECJAN and JANFEB).
# YEAR is excluded explicitly so the result does not depend on how a rolling
# window treats a non-numeric column, and the deprecated `rolling(axis=1)` is
# not needed.
bimonthly = df.drop('YEAR', axis=1)
values = bimonthly.values
# Pair every column with its left neighbour. The first column (DECJAN) has no
# left neighbour and is dropped; a NaN in either value of a pair gives NaN.
df2 = pd.DataFrame(
    (values[:, :-1] + values[:, 1:]) / 2,
    index=bimonthly.index,
    columns=bimonthly.columns[1:],
)

In [37]:
# Shorten each column label to its first three letters, giving abbreviated
# single-month names (JANFEB -> JAN, ..., DECJAN_SH -> DEC).
df2.rename(columns=lambda label: label[:3], inplace=True)

In [38]:
# Put the YEAR column back as the first column.
df2.insert(loc=0, column='YEAR', value=df['YEAR'])

In [39]:
# Preview the first rows of the monthly frame.
df2.head()

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1950,-1.0815,-1.208,-1.177,-1.2525,-1.423,-1.3405,-1.1555,-0.8365,-0.536,-0.796,-1.193,-1.142
1,1951,-1.1005,-1.165,-0.8445,-0.4425,-0.043,0.4835,0.7485,0.772,0.723,0.707,0.599,0.4685
2,1952,0.2855,0.1045,0.1475,-0.0415,-0.5315,-0.5305,-0.3395,-0.032,0.2875,-0.043,-0.2245,-0.027
3,1953,0.2225,0.339,0.482,0.7215,0.4735,0.2865,0.2955,0.346,0.285,0.0825,0.2145,0.1575
4,1954,-0.0315,0.0635,-0.231,-1.0405,-1.5115,-1.4565,-1.4055,-1.3075,-1.247,-1.2295,-1.106,-0.914


In [40]:
# Inspect the last rows of the monthly frame.
df2.tail()

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
64,2014,-0.2645,-0.1205,0.1565,0.648,1.0235,0.9805,0.926,0.747,0.489,0.5875,0.66,0.4915
65,2015,0.4405,0.539,0.765,1.2495,1.84,2.039,2.1575,2.4065,2.34,2.236,2.1955,2.168
66,2016,2.193,2.0665,2.0285,1.923,1.4025,0.7025,0.2595,0.0245,-0.2515,-0.297,-0.1595,-0.081
67,2017,-0.0475,-0.0615,0.332,1.0945,1.242,0.7475,0.2325,-0.2345,-0.523,-0.4265,-0.4305,-0.5995
68,2018,-0.677,-0.6165,-0.467,0.0165,0.467,0.2725,0.104,0.3205,0.4885,0.583,,


In [41]:
# Lower-case everything after the first character of each column name
# (YEAR -> Year, JAN -> Jan) to match the satellite dataframes used in
# subsequent notebooks.
df2.rename(columns=lambda name: name[0] + name[1:].lower(), inplace=True)

In [42]:
# Check the reformatted column names.
df2.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1950,-1.0815,-1.208,-1.177,-1.2525,-1.423,-1.3405,-1.1555,-0.8365,-0.536,-0.796,-1.193,-1.142
1,1951,-1.1005,-1.165,-0.8445,-0.4425,-0.043,0.4835,0.7485,0.772,0.723,0.707,0.599,0.4685
2,1952,0.2855,0.1045,0.1475,-0.0415,-0.5315,-0.5305,-0.3395,-0.032,0.2875,-0.043,-0.2245,-0.027
3,1953,0.2225,0.339,0.482,0.7215,0.4735,0.2865,0.2955,0.346,0.285,0.0825,0.2145,0.1575
4,1954,-0.0315,0.0635,-0.231,-1.0405,-1.5115,-1.4565,-1.4055,-1.3075,-1.247,-1.2295,-1.106,-0.914


In [43]:
# Reshape from wide (one column per month) to long form: one row per
# (Year, Month) pair, with the value in an "MEI" column. This is the first step
# towards indexing the data by year and month.
df3 = pd.melt(
    df2,
    id_vars=['Year'],
    var_name='Month',
    value_name='MEI',
)

In [44]:
# Preview the first rows of the long-format frame.
df3.head()

Unnamed: 0,Year,Month,MEI
0,1950,Jan,-1.0815
1,1951,Jan,-1.1005
2,1952,Jan,0.2855
3,1953,Jan,0.2225
4,1954,Jan,-0.0315


In [45]:
# Inspect the last rows of the long-format frame.
df3.tail()

Unnamed: 0,Year,Month,MEI
823,2014,Dec,0.4915
824,2015,Dec,2.168
825,2016,Dec,-0.081
826,2017,Dec,-0.5995
827,2018,Dec,


In [46]:
# Build a "<Year> <Month>" label per row, e.g. "1950 Jan".
# Vectorised string concatenation replaces a row-wise apply with a Python lambda.
df3['date'] = df3['Year'] + ' ' + df3['Month']

In [47]:
# Check the new combined date column.
df3.head()

Unnamed: 0,Year,Month,MEI,date
0,1950,Jan,-1.0815,1950 Jan
1,1951,Jan,-1.1005,1951 Jan
2,1952,Jan,0.2855,1952 Jan
3,1953,Jan,0.2225,1953 Jan
4,1954,Jan,-0.0315,1954 Jan


In [48]:
# Parse the "<year> <abbreviated month>" labels into datetime values.
df3['datetime'] = pd.to_datetime(df3['date'], format='%Y %b')

In [49]:
# Index the frame by the datetime column, then order the rows by that index.
df3 = df3.set_index('datetime')
df3 = df3.sort_index()

In [50]:
# Keep only the MEI values, from 1997 onward, to match the satellite data.
# Selecting the column (instead of dropping the others in place) lets this cell
# be re-run without a KeyError, and .loc makes the label-based row slice explicit.
df3 = df3[['MEI']].loc['1997':]
df3.head()

Unnamed: 0_level_0,MEI
datetime,Unnamed: 1_level_1
1997-01-01,-0.5425
1997-02-01,-0.4265
1997-03-01,0.1395
1997-04-01,0.8295
1997-05-01,1.7035


In [51]:
# Pickle the final dataframe to a path relative to the notebook's working
# directory. NOTE(review): assumes ../PklJar already exists — confirm.
df3.to_pickle('../PklJar/dfMEI.pkl')