version 1.0

# Data Analysis using Pandas and Matplotlib

A good source of data sets: https://data.gov.ie/dataset <br>
Met Éireann historical data is available at:<br>

https://www.met.ie/climate/available-data/long-term-data-sets

In this lesson we are going to import historical data from Met Éireann, analyse it using pandas and plot the results using Matplotlib.



In [None]:
# Data analysis library
import pandas as pd
# Plot Library
from matplotlib import pyplot as plt
import numpy as np

## Import the CSV and Print Dataframe

In [None]:
# Full path to the Dublin Airport CSV. A bare file name is enough when
# the file sits in the same directory as this notebook.
dublin_csv = "/Users/morgan/Documents/Courses/Python/MyModules/DublinAirportClean.csv"
df = pd.read_csv(dublin_csv)

In [None]:
# Print the DataFrame that was loaded from the CSV
print(df)

## Changing the Index

In [None]:
# Make Year the index permanently (inplace=True changes df itself).
# drop=False keeps Year as an ordinary column as well, because the
# cells below still select it with df['Year'].
df.set_index('Year', inplace=True, drop=False)

In [None]:
# Display the DataFrame again to see the new index
df

In [None]:
# Output the year values by selecting the Year column
# (Year must still be a column of df for this to work)
df['Year']

In [None]:
# The index labels of the DataFrame
df.index
# The Year column (in a notebook only the last expression of a cell is displayed)
df['Year']
#rain_y = df.loc['Aug']

In [None]:
# Simple line plot: January and August values against Year
plt.rcParams['figure.figsize'] = [10, 6]
plt.style.use('Solarize_Light2')
plt.plot(df['Year'], df['Jan'])
plt.plot(df['Year'], df['Aug'])

## Create a filter
Our objective is to plot a bar chart showing the rain values for January, August and November at five- or ten-year intervals.

In [None]:
# Create a list with fewer years: every fifth year starting at 1850
# and stopping before 2010 (range() excludes its end value)
nums = range(1850, 2010, 5)
my_list = list(nums)
print(my_list)

In [None]:
# Build a boolean filter: True for each row whose Year is in my_list
filt = df['Year'].isin(my_list)
# Show the first 30 values of the filter
print(filt.head(30))

In [None]:
# Select just the Year column for the rows picked out by the filter
x_values = df.loc[filt, ['Year']]
print(x_values)

## Create a new dataframe with the information that we want


In [None]:
# Keep only the filtered rows and the four columns we want to plot
wanted_columns = ['Year', 'Jan', 'Aug', 'Nov']
newDF = df.loc[filt, wanted_columns]
print(newDF)

In [None]:
# Line plot of the three months for the filtered years
plt.plot(newDF['Year'], newDF['Jan'])
plt.plot(newDF['Year'], newDF['Aug'])
plt.plot(newDF['Year'], newDF['Nov'])

In [None]:
# Same data as a bar plot; all three months are drawn at the same
# x positions, so the bars are drawn over each other
plt.bar(newDF['Year'], newDF['Jan'])
plt.bar(newDF['Year'], newDF['Aug'])
plt.bar(newDF['Year'], newDF['Nov'])





## Hmmm...
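Let us try to sort out this mess: the three sets of bars are drawn on top of each other, so we give the bars a fixed width and shift January and November to either side of August.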

In [None]:
# Grouped bar chart: three bars (January, August, November) per year
plt.style.use('Solarize_Light2')
plt.rcParams['figure.figsize'] = [14, 6]
width = 0.85

# Shift January one bar-width to the left and November one bar-width to
# the right of the year, so the three bars sit side by side
year_pos = newDF['Year']
plt.bar(year_pos - width, newDF['Jan'], width=width, label='January Rain')
plt.bar(year_pos, newDF['Aug'], width=width, color='k', label='August Rain')
plt.bar(year_pos + width, newDF['Nov'], width=width, label='November Rain')

# NOTE(review): the title says 1860-2010, but newDF was filtered with
# my_list (1850 to 2005 in steps of 5) - confirm which range is intended
plt.title('Rain- Dublin Airport(1860-2010)')
plt.xlabel('Year')
plt.ylabel('Rain mm')
plt.grid(True)
plt.legend()

# Statistical Analysis

## Simple Stats

In [None]:
# Mean (printed) and median (displayed as the cell result) of January
jan_values = df['Jan']
print(jan_values.mean())

jan_values.median()

In [None]:
# Statistic on all of the dataframe: the mean of every column
df.mean()

In [None]:
# General statistics for each column, produced by describe()
df.describe()

## Grouping Data
Grouping data is a three-phase process: split the data into groups, apply a function to each group, and recombine the results.

In [None]:
# Load a second data set, this time for Killarney, and display it
killarney_csv = "/Users/morgan/Documents/Courses/Python/MyModules/Killarney_1881-1933.csv"
kill_df = pd.read_csv(killarney_csv)
kill_df

In [None]:
# Number of rows and columns in the Killarney DataFrame
kill_df.shape

In [None]:
# Split the Killarney rows into groups, one group per Year value.
# Group by the plain column name rather than a one-item list, so that a
# single group can be looked up with a plain year such as
# get_group(1933); with a one-item list newer pandas versions expect a
# one-item tuple as the key instead.
year_group = kill_df.groupby('Year')

In [None]:
# Show the rows that belong to the 1933 group
year_group.get_group(1933)

In [None]:
# Same rows as get_group(1933), this time selected with a boolean filter.
# NOTE: this rebinds filt, which earlier held the Dublin Airport year filter.
filt = kill_df['Year']== 1933
kill_df.loc[filt]

In [None]:
# Lowest 'Min (C)' value among the rows selected by filt
kill_df.loc[filt,['Min (C)']].min()

In [None]:
# Get the lowest 'Min (C)' value within each Year group
# (the file name suggests the years run from 1881 to 1933)
year_group['Min (C)'].min()

# SAQ 1
Go to the Met Éireann website and download a data set that is suitable for grouping. Import the CSV file and group the data.

# Appendix

## Clunky Graph Solution

In [None]:
# Rebuild the Dublin Airport year filter first: filt was last assigned in
# the grouping section (kill_df['Year'] == 1933), so it no longer matches
# the rows of df.
filt = df['Year'].isin(my_list)

# One single-column DataFrame per month for the selected years
jan_rain = df.loc[filt, ['Jan']]
aug_rain = df.loc[filt, ['Aug']]
nov_rain = df.loc[filt, ['Nov']]

In [None]:
# Year column (as a one-column DataFrame) for the rows selected by filt
year = df.loc[filt, ['Year']]

In [None]:
# Turn each month's column into a plain Python list
list_jan = jan_rain['Jan'].to_list()
list_aug = aug_rain['Aug'].to_list()
list_nov = nov_rain['Nov'].to_list()

# Take the years from the filtered rows (year) rather than from my_list,
# so the Year list always has the same length as the month lists, even
# when one of the requested years is not present in the data
data = {
    'Year': year['Year'].to_list(),
    'Jan_rain': list_jan,
    'Aug_rain': list_aug,
    'Nov_rain': list_nov,
}
df_new = pd.DataFrame(data)
df_new

In [None]:
# Print the November column selected for the filtered years
print(nov_rain)

In [None]:
# Set the size of the plot
plt.rcParams['figure.figsize'] = [10, 6]

# Both series use the same x positions, so August is drawn over January
plt.bar(df_new['Year'], df_new['Jan_rain'])
plt.bar(df_new['Year'], df_new['Aug_rain'])




In [None]:
# Set the size of the plot
plt.rcParams['figure.figsize'] = [14, 6]

# Get a list of the years that we want: every tenth year starting at
# 1860 and stopping before 2010 (range() excludes its end value)
nums = range(1860, 2010, 10)
years = list(nums)

# Use the values in years to filter our dataframe
filt = df['Year'].isin(years)

# Year column for the selected rows (a one-column DataFrame)
year = df.loc[filt, ['Year']]
# Values for the relevant months, each as a one-column DataFrame
jan_rain = df.loc[filt, ['Jan']]
aug_rain = df.loc[filt, ['Aug']]
nov_rain = df.loc[filt, ['Nov']]

# Convert each column to a list
list_jan = jan_rain['Jan'].to_list()
list_aug = aug_rain['Aug'].to_list()
list_nov = nov_rain['Nov'].to_list()

# Take Year from the filtered rows rather than from years, so it always
# has the same length as the month lists, even when one of the requested
# years is not present in the data
data = {
    'Year': year['Year'].to_list(),
    'Jan_rain': list_jan,
    'Aug_rain': list_aug,
    'Nov_rain': list_nov,
}
df_new = pd.DataFrame(data)
print(df_new)

# Bar width used when df_new is plotted as a grouped bar chart
width = 1.85

In [None]:
# Grouped bar chart of df_new: three bars (January, August, November)
# per year, using the bar width set in the previous cell
plt.style.use('Solarize_Light2')

# Shift January one bar-width to the left and November one bar-width to
# the right of the year, so the three bars sit side by side
year_pos = df_new['Year']
plt.bar(year_pos - width, df_new['Jan_rain'], width=width, label='January Rain')
plt.bar(year_pos, df_new['Aug_rain'], width=width, color='k', label='August Rain')
plt.bar(year_pos + width, df_new['Nov_rain'], width=width, label='November Rain')

# Title, axis labels, grid and legend
plt.title('Rain- Dublin Airport(1860-2010)')
plt.xlabel('Year')
plt.ylabel('Rain mm')
plt.grid(True)
plt.legend()