In [None]:
# Jupyter interactive notebook (Python v3.x)
#
# Download CO2 data from MG's GitHub repository
# Perform exploratory data analysis, with commentary
#
# For use with Software Carpentry Python module
#

In [1]:
# uses the download.py module available in the above repo
#
import download

In [None]:
base_url = 'https://raw.githubusercontent.com/megarcia/SWC_Python/master/'

In [None]:
file_names = ['MaunaLoa_CO2_monthly_filled_1959-1975.csv',
              'MaunaLoa_CO2_monthly_filled_1976-2000.csv',
              'MaunaLoa_CO2_monthly_filled_2001-2015.csv']

In [None]:
# Download each listed data file in turn, using the helper from download.py
for csv_name in file_names:
    download.get_file(base_url, csv_name)

In [None]:
# Get the file names and put them in a list variable
#
# glob.glob() returns its matches in arbitrary (filesystem-dependent) order,
# so sort them. The file names embed their year ranges, which makes
# alphabetical order the same as chronological order. Later cells rely on
# filelist[0] being the earliest file and on the remaining files being
# appended in time order.
#
import glob
filelist = sorted(glob.glob('MaunaLoa_*.csv'))

In [None]:
# How many files do we have?
#
len(filelist)

In [None]:
# Look at the list
#
filelist

In [None]:
# Note: no alias here, but typically "import numpy as np" is used
#
import numpy

In [None]:
# Load the first CSV file into a NumPy array; of the several ways to read
# CSV data in Python, this is one of the easiest
#
first_file = filelist[0]
data1 = numpy.loadtxt(first_file, delimiter=',')
#
# The same call handles many other delimited text files:
#     space-delimited: delimiter=' '
#     tab-delimited:   delimiter='\t'

In [None]:
# Look at the loaded data
#
data1

In [None]:
# Get the dimensions of the array, and note that time is axis 0!
#
numpy.shape(data1)

In [None]:
# Isolate and extract the values of interest
#
data1_vals = data1[:,2]

In [None]:
# Look at the single-variable time series
#
data1_vals

In [None]:
# Magic function to show plots here in the notebook, instead of a pop-up window
#
%matplotlib inline

In [None]:
# Note: no alias here, but typically "import matplotlib.pyplot as plt" is used
#
import matplotlib.pyplot

In [None]:
# Make a simple (exploratory) line plot of our time series
#
# With a single argument, plot() draws the values against their position in
# the array (0, 1, 2, ...), so the x-axis here is just an index, not a date.
#
matplotlib.pyplot.figure()
matplotlib.pyplot.plot(data1_vals)
matplotlib.pyplot.show()
#
# There are ways to prettify the plot; most are easier when we run a command-line script
#     Add axis specs and labels
#     Add title and legend
#     Make multiple-plot figures
#     Add annotations

In [None]:
# That covers the first file. For the rest, we could load them one at a time,
# but a loop is the approach that still works when there are tons of files.
#
# The array from the 1st file is already in memory, so start from a copy of it
#
all_data = numpy.copy(data1)
#
# Load each remaining file and stack its rows underneath what we have so far
for filename in filelist[1:]:
    new_data = numpy.loadtxt(filename, delimiter=',')
    all_data = numpy.append(arr=all_data, values=new_data, axis=0)

In [None]:
# Look at the loaded data
#
all_data

In [None]:
# Get the dimensions of the array
#
data_shape = numpy.shape(all_data)
data_shape

In [None]:
# Recall that the data is monthly, so how many years do we have?
#
# Use floor division (//) so the result is an integer: in Python 3 the "/"
# operator always produces a float (e.g. 57.0), and NumPy does not accept a
# float as an array dimension in reshape() or as the sample count in linspace().
#
nyears = data_shape[0] // 12
nyears

In [None]:
# Isolate and extract the values of interest
#
CO2_vals = all_data[:,2]

In [None]:
# Look at the single-variable time series
#
CO2_vals

In [None]:
# Check the dimensions of our variable time series
#
numpy.shape(CO2_vals)

In [None]:
# The number of items is the number of months in the series
#
nmonths = numpy.shape(CO2_vals)[0]

In [None]:
# From looking at the full data array, we know that the time series
# starts in 1959 and ends at the end of 2015. For plotting, it would
# be nice to put those dates on the x-axis. We build the x-axis locations
# (one decimal-year value per month) and store them in a variable for
# passing to the plot.
#
# Note that the length of this array must match the length of our time
# series variable array.
#
# endpoint=False leaves out 2016.0 itself, so when the series covers whole
# years from 1959 through 2015 the points are exactly 1/12 of a year apart:
# the first month sits at 1959.0 and the last at 2015 + 11/12.
# (With the default endpoint=True the final month would be placed at 2016.0
# and the spacing would be slightly more than one month.)
#
x = numpy.linspace(1959, 2016, nmonths, endpoint=False)

In [None]:
# Make a slightly less-simple line plot of our time series, now with dates
# on the x-axis, plus axis labels and a title so the figure can be read on
# its own
#
matplotlib.pyplot.figure()
matplotlib.pyplot.plot(x, CO2_vals)
matplotlib.pyplot.xlabel('Year')
matplotlib.pyplot.ylabel('CO2 concentration (ppm)')
matplotlib.pyplot.title('Mauna Loa monthly CO2')
matplotlib.pyplot.show()

In [None]:
# This is called the Keeling Curve! 
#
# See http://scrippsco2.ucsd.edu/
# and http://en.wikipedia.org/wiki/Keeling_Curve
#
# There are two things about this plot that we'll explore further:
# 1. the seasonal variation
# 2. the annual trend
#
# Instead of finding a new dataset in the right shape, we can use what
# we already have and "reshape" it.
#
# We want an array that has each year in a row and each month in a column.
# You'll see why in a few more lines.
#
# Passing -1 for the first dimension lets NumPy work out the number of rows
# (years) from the length of the series; reshape() raises an error if that
# length is not a whole multiple of 12. This also avoids handing reshape() a
# non-integer row count, which it rejects.
#
data_arr = CO2_vals.reshape(-1, 12)

In [None]:
# Check the resulting array shape to see that it's right
#
numpy.shape(data_arr)

In [None]:
# For the seasonal variation we need one mean per calendar month, taken
# across all years. Years run down the rows, so we average along axis 0.
#
seasonal = data_arr.mean(axis=0)

In [None]:
# Look at the result to see that it's what we wanted
#
seasonal

In [None]:
# Build the x-axis positions for the twelve months: 1.0, 2.0, ..., 12.0
#
x = numpy.linspace(start=1, stop=12, num=12)

In [None]:
# Make a plot of our monthly averages, with axis labels and a title so the
# figure can be read on its own
#
matplotlib.pyplot.figure()
matplotlib.pyplot.plot(x, seasonal)
matplotlib.pyplot.xlabel('Month of year')
matplotlib.pyplot.ylabel('Mean CO2 concentration (ppm)')
matplotlib.pyplot.title('Mean CO2 by calendar month')
matplotlib.pyplot.show()

In [None]:
# These plot values include the mean over all of the years. We can subtract 
# that out to get an idea of the variation within any single year.
#
mean = numpy.mean(seasonal)

In [None]:
# Basic math operations on arrays proceed element-wise
#
seasonal_adj = seasonal - mean

In [None]:
# Plot the adjusted values (monthly means with the overall mean removed),
# with axis labels and a title so the figure can be read on its own
#
matplotlib.pyplot.figure()
matplotlib.pyplot.plot(x, seasonal_adj)
matplotlib.pyplot.xlabel('Month of year')
matplotlib.pyplot.ylabel('CO2 departure from overall mean (ppm)')
matplotlib.pyplot.title('Seasonal cycle of CO2')
matplotlib.pyplot.show()

In [None]:
# Notice the CO2 concentration decreases between late Spring and early Autumn 
# (in the Northern Hemisphere). There is more land area in the NH, and all 
# that plant growth is drawing CO2 from the atmosphere. At other times, plant 
# respiration, cement production, and fossil fuels cause a net increase in CO2 
# concentration.
#
# The Earth breathes!
#
# Watch: http://1.bp.blogspot.com/-LemiCA8B_H4/UfLN63QLXdI/AAAAAAAACyM/Xc3HtckubEg/s640/Animated.gif
#
# For the annual variation we need one mean per year, taken across its
# twelve months. The same data array works; this time we average along
# axis 1.
#
annual = data_arr.mean(axis=1)

In [None]:
# Look at the result to see that it's what we wanted
#
annual

In [None]:
# Create an array for the years on the x-axis
#
# The number of points must be an integer and must match the number of annual
# means being plotted, so take it directly from the length of `annual`.
#
x = numpy.linspace(1959, 2015, len(annual))

In [None]:
# Plot the annual mean values, with axis labels and a title so the figure
# can be read on its own
#
matplotlib.pyplot.figure()
matplotlib.pyplot.plot(x, annual)
matplotlib.pyplot.xlabel('Year')
matplotlib.pyplot.ylabel('Annual mean CO2 concentration (ppm)')
matplotlib.pyplot.title('Mauna Loa annual mean CO2')
matplotlib.pyplot.show()

In [None]:
# Notice there are a couple of times when the trend gets shallow:
# early 1970s: Inflation, oil crisis (possibly)
# early 1990s: Mount Pinatubo eruption in 1991 (cooler summers, less energy use)
#
# Notice also the most recent milestone: graph crosses 400 ppm in 2015!
#
# You've just done some exploratory data analysis!
#
# You deserve a final treat!
#
import antigravity