In [None]:
# Import all the libraries needed
import pandas as pd
from scipy import stats
import numpy as np
from numpy import mean
from numpy import std
from functools import reduce
from statistics import median

# Run extractData.ipynb first: the cells below read their input tables from MAIN_GRID
%run extractData.ipynb

In [None]:
# Take the table stored at MAIN_GRID[2][1]; the following cells read its
# 'Year' and 'CSIRO - Adjusted sea level (inches)' columns.
df0 = MAIN_GRID[2][1]
# Print column names, dtypes and non-null counts for a quick sanity check
df0.info()

In [None]:
# Shorten the sea-level header and rename 'Year' to 'Year_Merge',
# the key on which all datasets are joined later in the notebook.
df0 = df0.rename(
    index=str,
    columns={
        'Year': 'Year_Merge',
        'CSIRO - Adjusted sea level (inches)': "CSIRO_ASLinches",
    },
)

# Keep the first two columns, then slice the years 1959-2015 by temporarily
# indexing on Year_Merge; reset_index turns the key back into a column.
df0_filter = (
    df0.iloc[:, [0, 1]]
    .set_index(['Year_Merge'])
    .loc[1959:2015]
    .reset_index()
)
df0_filter.info()

In [None]:
# Count the missing (null/NA) entries in each column of the filtered sea-level frame
df0_filter.isna().sum()

In [None]:
# Missing-value handling
# Reference: towardsdatascience.com/handling-missing-values-with-pandas-b876bf6f008f
# The 2 null sea-level values are forward-filled, i.e. replaced by the value
# of the preceding row (at most 2 consecutive rows are filled).
# Filling with the column mean was considered and rejected: the mean
# (6.199642083854547) is lower than what NOAA recorded for these years.
# DataFrame.ffill is used because fillna(method='ffill') is deprecated in pandas 2.x.
df0_filter = df0_filter.ffill(axis=0, limit=2)

# Anomaly calculation (https://help.ceda.ac.uk/article/4728-cru-data-python-example):
# each year's value minus the 57-year (1959-2015) average.
avCSIRO_ASLinches_1959_2015 = np.mean(df0_filter['CSIRO_ASLinches'])
df0_filter['Anamoly_CSIRO_ASLinches'] = df0_filter['CSIRO_ASLinches'] - avCSIRO_ASLinches_1959_2015

In [None]:
#https://datahub.io/core/global-temp #useful reference data source

# Take the table stored at MAIN_GRID[0][0]; the following cells read its
# ' Year' and 'Rainfall - (MM)' columns.
df1 = MAIN_GRID[0][0]
# Print column names, dtypes and non-null counts for a quick sanity check
df1.info()

In [None]:
# Give the rainfall and year columns short names
# (the raw year header carries a leading space).
df1 = df1.rename(
    index=str,
    columns={
        " Year": "Year",
        'Rainfall - (MM)': "Rnf_MM",
    },
)

# Keep the first two columns, then slice the years 1959-2015 by temporarily
# indexing on Year; reset_index turns Year back into a column.
df1_filter = (
    df1.iloc[:, [0, 1]]
    .set_index(['Year'])
    .loc[1959:2015]
    .reset_index()
)
df1_filter.head()

In [None]:
# Parse the Year values (format '%Y') into timestamps and use them as the
# index, so the frame can be grouped per calendar year afterwards.
df1_filter = df1_filter.assign(
    Year=pd.to_datetime(df1_filter['Year'], format='%Y')
).set_index('Year')

In [None]:
# Average the rainfall values within each yearly bucket of the datetime index
# (pd.Grouper with freq='Y'); the trailing #.plot() is an optional quick-look plot.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of 'YE' —
# confirm against the pandas version this notebook is pinned to.
df1_filter_averagegrpby = df1_filter.groupby(pd.Grouper(freq='Y')).mean() #.plot()


In [None]:
# Rainfall anomaly (https://help.ceda.ac.uk/article/4728-cru-data-python-example):
# deviation of each yearly mean from the 57-year (1959-2015) average.
avprecp_1959_2015 = np.mean(df1_filter_averagegrpby['Rnf_MM'])
df1_filter_averagegrpby['Anamoly_RnfMM'] = df1_filter_averagegrpby['Rnf_MM'].sub(avprecp_1959_2015)
df1_filter_averagegrpby.head()

In [None]:
# Show the column labels of the yearly rainfall frame
df1_filter_averagegrpby.columns

In [None]:
# Move the Year index back into an ordinary column (the values are kept,
# not dropped); the frame gets a default integer index instead.
df1_filter_averagegrpby = df1_filter_averagegrpby.reset_index(level=0)

In [None]:
# Derive the calendar year from the Year timestamps; Year_Merge is the
# key on which the datasets are joined later.
df1_filter_averagegrpby['Year_Merge'] = df1_filter_averagegrpby['Year'].dt.year
df1_filter_averagegrpby.head()

In [None]:
# Take the table stored at MAIN_GRID[0][1]; the following cells read its
# ' Year' and 'Temperature - (Celsius)' columns.
df2 = MAIN_GRID[0][1]
# Print column names, dtypes and non-null counts for a quick sanity check
df2.info()

In [None]:
# Give the temperature and year columns short names
# (the raw year header carries a leading space).
df2 = df2.rename(
    index=str,
    columns={
        " Year": "Year",
        'Temperature - (Celsius)': "Tmp_Cls",
    },
)

In [None]:
# Keep the first two columns of the temperature table, then slice the years
# 1959-2015 by temporarily indexing on Year; reset_index restores Year as a column.
df2_filter = (
    df2.iloc[:, [0, 1]]
    .set_index(['Year'])
    .loc[1959:2015]
    .reset_index()
)
df2_filter.head()

In [None]:
# Parse the Year values (format '%Y') into timestamps and use them as the
# index, so the frame can be grouped per calendar year afterwards.
df2_filter = df2_filter.assign(
    Year=pd.to_datetime(df2_filter['Year'], format='%Y')
).set_index('Year')

df2_filter.head()

In [None]:
#https://stackoverflow.com/questions/23859840/python-aggregate-by-month-and-calculate-average

# Average the temperature values within each yearly bucket of the datetime index
# (pd.Grouper with freq='Y'); the trailing #.plot() is an optional quick-look plot.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of 'YE' —
# confirm against the pandas version this notebook is pinned to.
df2_filter_averagegrpby = df2_filter.groupby(pd.Grouper(freq='Y')).mean()#.plot()
# Inspect dtypes and non-null counts of the yearly frame
df2_filter_averagegrpby.info()

In [None]:
# Temperature anomaly (https://help.ceda.ac.uk/article/4728-cru-data-python-example):
# deviation of each yearly mean from the 57-year (1959-2015) average.
av_1959_2015 = np.mean(df2_filter_averagegrpby['Tmp_Cls'])
df2_filter_averagegrpby['Anamoly_TmpCls'] = df2_filter_averagegrpby['Tmp_Cls'].sub(av_1959_2015)
df2_filter_averagegrpby.head()

In [None]:
# Move the Year index back into an ordinary column (the values are kept,
# not dropped); the frame gets a default integer index instead.
df2_filter_averagegrpby.reset_index(level=0, inplace=True)

# Extract the calendar year into Year_Merge, the key used to join the datasets.
# The year is read from this frame's own Year column (not from the rainfall
# frame), so the key stays correct even if the two frames ever differ in
# length or row order.
df2_filter_averagegrpby['Year_Merge'] = pd.DatetimeIndex(df2_filter_averagegrpby['Year']).year
df2_filter_averagegrpby.head()

In [None]:
# Take the table stored at MAIN_GRID[2][0]; the following cells read its
# 'Year (negative values = BC)' and 'Mauna Loa, Hawaii' columns.
df3 = MAIN_GRID[2][0]

# Print column names, dtypes and non-null counts for a quick sanity check
df3.info() 

In [None]:
# Show the exact column labels so stray whitespace in the headers can be spotted
# before the columns are renamed
df3.columns

In [None]:
# Rename columns
df3 = df3.rename(index=str, columns={"Year (negative values = BC)": "Year", "Mauna Loa, Hawaii": "MLHawai_CO2ppm"})

# Keep the columns at positions 0 and 4 (used below as Year and MLHawai_CO2ppm).
# An explicit copy is taken so the assignments below act on an independent
# frame rather than on a slice of df3 (avoids SettingWithCopyWarning).
df3_filter = df3.iloc[0:, [0, 4]].copy()

# https://stackoverflow.com/questions/47444999/check-if-column-contains-type-string-object
# Convert every object-dtype column to numeric; entries that cannot be parsed
# become NaN (errors='coerce'). Each column is replaced as a whole so that it
# actually takes the numeric dtype: assigning through .loc[:, mask] writes into
# the existing object column and can leave its dtype as 'object'.
for _col in df3_filter.select_dtypes(include='object').columns:
    df3_filter[_col] = pd.to_numeric(df3_filter[_col], errors='coerce')

df3_filter.info()

In [None]:
# Restrict the CO2 series to 1959-2015: index on Year for the label slice,
# then turn Year back into a column.
df3_filter = (
    df3_filter
    .set_index(['Year'])
    .loc[1959:2015]
    .reset_index()
)
df3_filter.info()

In [None]:
# Count the missing (null/NA) entries in each column of the filtered CO2 frame
df3_filter.isna().sum() 

In [None]:
# Parse the Year values (format '%Y') so the column holds timestamps
df3_filter = df3_filter.assign(
    Year=pd.to_datetime(df3_filter['Year'], format='%Y')
)

df3_filter.info()

In [None]:
# CO2 anomaly (https://help.ceda.ac.uk/article/4728-cru-data-python-example):
# deviation of each yearly value from the 57-year (1959-2015) average.
avCO2ppm_1959_2015 = np.mean(df3_filter['MLHawai_CO2ppm'])
df3_filter['Anamoly_CO2ppm'] = df3_filter['MLHawai_CO2ppm'].sub(avCO2ppm_1959_2015)

In [None]:

# Derive the calendar year from the Year timestamps; Year_Merge is the
# key on which the datasets are joined in the next step.
df3_filter['Year_Merge'] = df3_filter['Year'].dt.year
df3_filter.head()

In [None]:

# https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes
#
# Outer-join the four frames pairwise on Year_Merge (left to right), then
# replace every value that is missing after the join with the string 'void'.
df_tobe_merged = [
    df1_filter_averagegrpby,
    df2_filter_averagegrpby,
    df0_filter,
    df3_filter,
]
df_merged = reduce(
    lambda acc, nxt: acc.merge(nxt, on=['Year_Merge'], how='outer'),
    df_tobe_merged,
).fillna('void')


In [None]:
# Preview the first rows of the merged frame (uncomment .info() for dtypes and non-null counts)
#df_merged.info()
df_merged.head()

In [None]:
# Rearrange columns in df_merged: merge key first, then each measurement
# followed by its anomaly. The per-dataset Year timestamp columns produced by
# the merge are left out. Columns are selected by name (these are the columns
# at positions 3,1,2,5,6,7,8,10,11) so the result does not depend on the
# order in which the frames were merged.
df_merged_final = df_merged[[
    'Year_Merge',
    'Rnf_MM', 'Anamoly_RnfMM',
    'Tmp_Cls', 'Anamoly_TmpCls',
    'CSIRO_ASLinches', 'Anamoly_CSIRO_ASLinches',
    'MLHawai_CO2ppm', 'Anamoly_CO2ppm',
]]


In [None]:
# Summary statistics of the rainfall values in df1_filter.
# mean/std are the numpy functions (std is therefore the population standard
# deviation, ddof=0); median comes from the statistics module.
print(
    'mean=%.3f median =%.3f stdv=%.3f'
    % (
        mean(df1_filter["Rnf_MM"]),
        median(df1_filter["Rnf_MM"]),
        std(df1_filter["Rnf_MM"]),
    )
)

In [None]:
# Normality check (hypothesis test) on the rainfall values.
# H0: the sample comes from a normal distribution.
# Missing values are dropped first: with a NaN in the sample the test returns
# p = nan, and `nan < alpha` is False, which would wrongly print
# "cannot be rejected".
k2, p = stats.normaltest(df1_filter["Rnf_MM"].dropna())
alpha = 1e-3  # significance level
print("p = {:g}".format(p))

if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")
# Result recorded by the author: the null hypothesis cannot be rejected

In [None]:
# Normality check (hypothesis test) on the Mauna Loa CO2 values.
# H0: the sample comes from a normal distribution.
# Missing values are dropped first: with a NaN in the sample the test returns
# p = nan, and `nan < alpha` is False, which would wrongly print
# "cannot be rejected".
k2, p = stats.normaltest(df3_filter["MLHawai_CO2ppm"].dropna())
alpha = 1e-3  # significance level
print("p = {:g}".format(p))

if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")
# Result recorded by the author: the null hypothesis cannot be rejected.
# Failing to reject does not prove the variable is normally distributed; it
# only means no significant departure from normality was detected at this
# alpha. On that basis the author proceeds to Pearson's correlation test.