In [None]:
#This File will be looking at the maximum and minimum Temperature data for the world as a whole.
#Comparing the years 2019 and 1919. 
#I had to set up the bucket, so the beginning portion is borrowed to create an S3 bucket to access the data set. 
#Please skip towards the end for code. 

In [None]:

#From Elizabeth.ipnyb:

# Setting up environment
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

# Using boto3 to access s3 API
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

# Creating a unique s3 bucket
def create_bucket(bucket):
    import logging

    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        logging.error(e)
        return 'Bucket ' + bucket + ' could not be created.'
    return 'Created or already exists ' + bucket + ' bucket.'

create_bucket('AaronL-noaa')

# List bucket to confirm that it is created
def list_buckets(match=''):
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        if match:
            if match in bucket["Name"]:
                print(f'  {bucket["Name"]}')

list_buckets(match='noaa')

# List bucket contents: Allows us to access and see the contents of the bucket that contains the data we want
def list_bucket_contents(bucket, match='', size_mb=0):
    bucket_resource = s3_resource.Bucket(bucket)
    total_size_gb = 0
    total_files = 0
    match_size_gb = 0
    match_files = 0
    for key in bucket_resource.objects.all():
        key_size_mb = key.size/1024/1024
        total_size_gb += key_size_mb
        total_files += 1
        list_check = False
        if not match:
            list_check = True
        elif match in key.key:
            list_check = True
        if list_check and not size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')
        elif list_check and key_size_mb <= size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')

    if match:
        print(f'Matched file size is {match_size_gb/1024:3.1f}GB with {match_files} files')   
        
# Lists the files in the NOAA Global Historical Climatology Network Daily data
list_bucket_contents(bucket='noaa-ghcn-pds', match='.csv', size_mb= 1000)

# Previewing the csv files that contain the data
def preview_csv_dataset(bucket, key, rows=10):
    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )

    data = pd.read_csv(url, nrows=rows, header = None)
    return data

df_2019 = preview_csv_dataset(bucket='noaa-ghcn-pds', key='csv/2019.csv', rows = 1000)

df_1919 = preview_csv_dataset(bucket='noaa-ghcn-pds', key='csv/1919.csv', rows = 1000)

# Check and see if file exists in the bucket we created earlier. Then copy the data from NOAA into our own bucket
def key_exists(bucket, key):
    try:
        s3_resource.Object(bucket, key).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # The key does not exist.
            return(False)
        else:
            # Something else has gone wrong.
            raise
    else:
        # The key does exist.
        return(True)

def copy_among_buckets(from_bucket, from_key, to_bucket, to_key):
    if not key_exists(to_bucket, to_key):
        s3_resource.meta.client.copy({'Bucket': from_bucket, 'Key': from_key}, 
                                        to_bucket, to_key)        
        print(f'File {to_key} saved to S3 bucket {to_bucket}')
    else:
        print(f'File {to_key} already exists in S3 bucket {to_bucket}')

copy_among_buckets(from_bucket='noaa-ghcn-pds', from_key='csv/2019.csv',to_bucket='open-data-analytics-noaa', to_key='NOAA_2019.csv')
copy_among_buckets(from_bucket='noaa-ghcn-pds', from_key='csv/1919.csv',to_bucket='open-data-analytics-noaa', to_key='NOAA_1919.csv')

In [4]:
#Renaming Columns
df_2019.columns = ["ID","Date","Element","Element_Value","MFlag","QFlag","SFlag","Obs_Time"]
#Checking to see new columns
df_2019


Unnamed: 0,ID,Date,Element,Element_Value,MFlag,QFlag,SFlag,Obs_Time
0,US1FLSL0019,20190101,PRCP,0,,,N,
1,US1FLSL0019,20190101,SNOW,0,,,N,
2,US1NVNY0012,20190101,PRCP,0,,,N,
3,US1NVNY0012,20190101,SNOW,0,,,N,
4,US1ILWM0012,20190101,PRCP,163,,,N,
5,USC00141761,20190101,TMAX,-11,,,7,700.0
6,USC00141761,20190101,TMIN,-139,,,7,700.0
7,USC00141761,20190101,TOBS,-133,,,7,700.0
8,USC00141761,20190101,PRCP,0,,,7,700.0
9,USC00141761,20190101,SNOW,0,,,7,


In [5]:
df_1919.columns = ["ID","Date","Element","Element_Value","MFlag","QFlag","SFlag","Obs_Time"]
df_1919

Unnamed: 0,ID,Date,Element,Element_Value,MFlag,QFlag,SFlag,Obs_Time
0,ASN00037003,19190101,PRCP,0,,,a,
1,USC00242347,19190101,TMAX,-94,,,6,
2,USC00242347,19190101,TMIN,-183,,,6,
3,USC00242347,19190101,TOBS,-183,,,6,
4,USC00242347,19190101,PRCP,58,,,6,
5,USC00242347,19190101,SNOW,102,,,6,
6,USC00242347,19190101,SNWD,102,,,6,
7,ASN00058046,19190101,PRCP,0,,,a,
8,ASN00054014,19190101,PRCP,0,,,a,
9,ASN00066060,19190101,PRCP,0,,,a,


In [14]:
df_2019 = preview_csv_dataset(bucket='noaa-ghcn-pds', key='csv/2019.csv', rows = 1000)
df_2019.columns= ["ID","Date","Element","Element_Value","MFlag","QFlag","SFlag","Obs_Time"]
#of S-flags=blank in data set (No source for records)


df_1919 = preview_csv_dataset(bucket='noaa-ghcn-pds', key='csv/1919.csv', rows = 1000)
df_1919.columns = ["ID","Date","Element","Element_Value","MFlag","QFlag","SFlag","Obs_Time"]
#THIS IS THE END OF BORROWED CODE FROM ELIZABETH BRYANT

In [31]:
#We want to find a summary of all the maximum temperatures in the dataset for 2019, but I do not want a graph.
#I want just a simple numeric answer. 

df_2019Hot= df_2019[df_2019['Element']=='TMAX']
#Now for the mean, meadian, and mode. 
print("The median Max temperature is:", df_2019Hot["Element_Value"].median())
print("The average Max temperature is:", df_2019Hot["Element_Value"].mean())
print("The mode Max temperature is:", df_2019Hot["Element_Value"].mode())

print("------")

#Now for minimum temperature:
df_2019Cold= df_2019[df_2019['Element']=='TMIN']

print("The median Min temperature is:", df_2019Cold["Element_Value"].median())
print("The average Min temperature is:", df_2019Cold["Element_Value"].mean())
print("The mode Min temperature is:", df_2019Cold["Element_Value"].mode())


The median Max temperature is: 44.0
The average Max temperature is: 67.80152671755725
The mode Max temperature is: 0     44
1    167
dtype: int64
------
The median Min temperature is: -30.0
The average Min temperature is: -39.985074626865675
The mode Min temperature is: 0   -44
1   -33
2    17
3    22
4    28
5    44
dtype: int64


In [32]:
#Now for 1919:
#First max temperature:
df_1919Hot= df_1919[df_1919['Element']=='TMAX']

print("The median Max temperature is:", df_1919Hot["Element_Value"].median())
print("The average Max temperature is:", df_1919Hot["Element_Value"].mean())
print("The mode Max temperature is:", df_1919Hot["Element_Value"].mode())

print("------")
#Now for minimum temperature:
df_1919Cold= df_1919[df_2019['Element']=='TMIN']

print("The median Min temperature is:", df_1919Cold["Element_Value"].median())
print("The average Min temperature is:", df_1919Cold["Element_Value"].mean())
print("The mode Min temperature is:", df_1919Cold["Element_Value"].mode())


The median Max temperature is: 0.0
The average Max temperature is: 23.310606060606062
The mode Max temperature is: 0    0
dtype: int64
------
The median Min temperature is: 0.0
The average Min temperature is: 4.074626865671642
The mode Min temperature is: 0    0
dtype: int64


In [None]:
#The average is basically at every country in the dataset, and each region.
#As you can see, it appears that the maximum average world temperature has gone up since 1919:
#It has gone up from 23.31 to 67.8 Celcius 
#Average minimum temperature has also gone down from -40 to 4.07 degrees Celcius. 
#Why this is is entirely speculative, since this is only looking at two years, we don't know when the turning point for temperature change was.
#It could have been during World War II when carbon emissions increased globally, it could have been during the sixties and seventies.
#Its unknown when the turning point was. 
#What is clear is that world temperatures are reaching higher extremes. 