In [1]:
import boto3
import botocore
import pandas as pd
from IPython.display import display, Markdown

In [2]:
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

In [3]:
def create_bucket(bucket):
    import logging

    try:
        s3.create_bucket(Bucket=bucket)
    except botocore.exceptions.ClientError as e:
        logging.error(e)
        return 'Bucket ' + bucket + ' could not be created.'
    return 'Created or already exists ' + bucket + ' bucket.'

In [4]:
create_bucket('noaa-bl')

'Created or already exists noaa-bl bucket.'

In [8]:
def list_buckets(match=''):
    response = s3.list_buckets()
    if match:
        print(f'Existing buckets containing "{match}" string:')
    else:
        print('All existing buckets:')
    for bucket in response['Buckets']:
        if match:
            if match in bucket["Name"]:
                print(f'  {bucket["Name"]}')

In [9]:
list_buckets(match='noaa')

Existing buckets containing "noaa" string:
  noaa-bl


In [10]:
def list_bucket_contents(bucket, match='', size_mb=0):
    bucket_resource = s3_resource.Bucket(bucket)
    total_size_gb = 0
    total_files = 0
    match_size_gb = 0
    match_files = 0
    for key in bucket_resource.objects.all():
        key_size_mb = key.size/1024/1024
        total_size_gb += key_size_mb
        total_files += 1
        list_check = False
        if not match:
            list_check = True
        elif match in key.key:
            list_check = True
        if list_check and not size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')
        elif list_check and key_size_mb <= size_mb:
            match_files += 1
            match_size_gb += key_size_mb
            print(f'{key.key} ({key_size_mb:3.0f}MB)')

    if match:
        print(f'Matched file size is {match_size_gb/1024:3.1f}GB with {match_files} files')            
    
    print(f'Bucket {bucket} total size is {total_size_gb/1024:3.1f}GB with {total_files} files')

In [11]:
list_bucket_contents(bucket='noaa-ghcn-pds', match='.csv', size_mb= 1000)

csv.gz/1763.csv.gz (  0MB)
csv.gz/1764.csv.gz (  0MB)
csv.gz/1765.csv.gz (  0MB)
csv.gz/1766.csv.gz (  0MB)
csv.gz/1767.csv.gz (  0MB)
csv.gz/1768.csv.gz (  0MB)
csv.gz/1769.csv.gz (  0MB)
csv.gz/1770.csv.gz (  0MB)
csv.gz/1771.csv.gz (  0MB)
csv.gz/1772.csv.gz (  0MB)
csv.gz/1773.csv.gz (  0MB)
csv.gz/1774.csv.gz (  0MB)
csv.gz/1775.csv.gz (  0MB)
csv.gz/1776.csv.gz (  0MB)
csv.gz/1777.csv.gz (  0MB)
csv.gz/1778.csv.gz (  0MB)
csv.gz/1779.csv.gz (  0MB)
csv.gz/1780.csv.gz (  0MB)
csv.gz/1781.csv.gz (  0MB)
csv.gz/1782.csv.gz (  0MB)
csv.gz/1783.csv.gz (  0MB)
csv.gz/1784.csv.gz (  0MB)
csv.gz/1785.csv.gz (  0MB)
csv.gz/1786.csv.gz (  0MB)
csv.gz/1787.csv.gz (  0MB)
csv.gz/1788.csv.gz (  0MB)
csv.gz/1789.csv.gz (  0MB)
csv.gz/1790.csv.gz (  0MB)
csv.gz/1791.csv.gz (  0MB)
csv.gz/1792.csv.gz (  0MB)
csv.gz/1793.csv.gz (  0MB)
csv.gz/1794.csv.gz (  0MB)
csv.gz/1795.csv.gz (  0MB)
csv.gz/1796.csv.gz (  0MB)
csv.gz/1797.csv.gz (  0MB)
csv.gz/1798.csv.gz (  0MB)
csv.gz/1799.csv.gz (  0MB)
c

In [7]:
def preview_csv_dataset(bucket, key):
#def preview_csv_dataset(bucket, key, rows=10):


    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )

    #data = pd.read_csv(url, nrows=rows)
    data = pd.read_csv(url) # read all rows
    return data

In [23]:
df_1919_all = preview_csv_dataset(bucket='noaa-ghcn-pds', key='csv/1919.csv')

In [8]:
def key_exists(bucket, key):
    try:
        s3_resource.Object(bucket, key).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # The key does not exist.
            return(False)
        else:
            # Something else has gone wrong.
            raise
    else:
        # The key does exist.
        return(True)

def copy_among_buckets(from_bucket, from_key, to_bucket, to_key):
    if not key_exists(to_bucket, to_key):
        s3_resource.meta.client.copy({'Bucket': from_bucket, 'Key': from_key}, 
                                        to_bucket, to_key)        
        print(f'File {to_key} saved to S3 bucket {to_bucket}')
    else:
        print(f'File {to_key} already exists in S3 bucket {to_bucket}') 

In [12]:
copy_among_buckets(from_bucket='noaa-ghcn-pds', from_key='csv/2019.csv',to_bucket='noaa-bl', to_key='NOAA_2019.csv')

File NOAA_2019.csv already exists in S3 bucket noaa-bl


In [13]:
copy_among_buckets(from_bucket='noaa-ghcn-pds', from_key='csv/1919.csv',to_bucket='noaa-bl', to_key='NOAA_1919.csv')

File NOAA_1919.csv already exists in S3 bucket noaa-bl


## Olivia

In [14]:
#Create a new function that reads the whole file
def preview_csv_dataset_all(bucket, key):

    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )
    data = pd.read_csv(url) # read all rows
    return data

In [15]:
# read 1919 dataset
df_1919_all = preview_csv_dataset_all(bucket='noaa-ghcn-pds', key='csv/1919.csv')

In [16]:
#Renaming Columns
df_1919_all.columns = ["ID","Date","Element","Element_Value","MFlag","QFlag","SFlag","Obs_Time"]

#Checking to see new columns
df_1919_all.head()

Unnamed: 0,ID,Date,Element,Element_Value,MFlag,QFlag,SFlag,Obs_Time
0,USC00242347,19190101,TMAX,-94,,,6,
1,USC00242347,19190101,TMIN,-183,,,6,
2,USC00242347,19190101,TOBS,-183,,,6,
3,USC00242347,19190101,PRCP,58,,,6,
4,USC00242347,19190101,SNOW,102,,,6,


In [17]:
df_1919_all.describe()

Unnamed: 0,Date,Element_Value,Obs_Time
count,9465082.0,9465082.0,280499.0
mean,19190660.0,52.93921,1407.433716
std,347.5456,117.054,615.539591
min,19190100.0,-1050.0,600.0
25%,19190330.0,0.0,800.0
50%,19190630.0,0.0,1700.0
75%,19191000.0,89.0,1800.0
max,19191230.0,22860.0,2400.0


In [18]:
from datetime import datetime, timedelta

In [19]:
def toDatetime(x):
    x = str(x) # convert the integer type to string.
    date = datetime(year=int(x[0:4]), month=int(x[4:6]), day=int(x[6:8])) # cut the string into pieces and convert them to datetime type.
    return date

In [20]:
df_1919_pd = pd.DataFrame(df_1919_all) 
df_1919_pd['Date']=df_1919_pd['Date'].apply(toDatetime)

In [21]:
df_1919_date=df_1919_pd

In [22]:
df_1919_date.head(20)

Unnamed: 0,ID,Date,Element,Element_Value,MFlag,QFlag,SFlag,Obs_Time
0,USC00242347,1919-01-01,TMAX,-94,,,6,
1,USC00242347,1919-01-01,TMIN,-183,,,6,
2,USC00242347,1919-01-01,TOBS,-183,,,6,
3,USC00242347,1919-01-01,PRCP,58,,,6,
4,USC00242347,1919-01-01,SNOW,102,,,6,
5,USC00242347,1919-01-01,SNWD,102,,,6,
6,ASN00058046,1919-01-01,PRCP,0,,,a,
7,ASN00054014,1919-01-01,PRCP,0,,,a,
8,ASN00066060,1919-01-01,PRCP,0,,,a,
9,USC00363526,1919-01-01,PRCP,76,,,6,


In [23]:
# Set index
df_1919_date.set_index('Date', inplace=True)

In [26]:
df_1919TMAX= df_1919_date[df_1919_date['Element']=='TMAX']
df_1919date=df_1919TMAX.groupby('Date')
df_1919date["Element_Value"].mean()

Date
1919-01-01    15.108004
1919-01-02     1.982838
1919-01-03   -12.692454
1919-01-04    -7.371561
1919-01-05    -1.551394
1919-01-06    22.033986
1919-01-07    38.292650
1919-01-08    52.380656
1919-01-09    48.297451
1919-01-10    56.431831
1919-01-11    56.592754
1919-01-12    56.043205
1919-01-13    65.478946
1919-01-14    63.861656
1919-01-15    62.596721
1919-01-16    70.426507
1919-01-17    79.092249
1919-01-18    91.865757
1919-01-19    90.257198
1919-01-20    87.411427
1919-01-21    88.330233
1919-01-22    88.953800
1919-01-23    88.586743
1919-01-24    84.009254
1919-01-25    78.549196
1919-01-26    83.614059
1919-01-27    82.900435
1919-01-28    82.284434
1919-01-29    82.474470
1919-01-30    79.311991
                ...    
1919-12-02    19.616799
1919-12-03    20.012342
1919-12-04    34.089462
1919-12-05    40.519220
1919-12-06    49.507290
1919-12-07    49.526606
1919-12-08    19.067857
1919-12-09     0.173674
1919-12-10     1.459043
1919-12-11    11.697144
1919-12-12 

In [None]:
plot= df_1919date.plot()

plot.set_ylabel('Element Value')
plot.set_title('Average Measure of Elements - 1919')

## Betty

Take the function for reading from an S3 bucket that's in the template notebook, and then write a loop that iterates through a list of csv files you want to read and each time in the loop it would apply that function you had

In [5]:
#Create a new function that reads the whole file
def preview_csv_dataset_all(bucket, key):

    data_source = {
            'Bucket': bucket,
            'Key': key
        }
    # Generate the URL to get Key from Bucket
    url = s3.generate_presigned_url(
        ClientMethod = 'get_object',
        Params = data_source
    )
    data = pd.read_csv(url) # read all rows
    return data

In [7]:
frames = [ preview_csv_dataset_all(f) for f in files ]
result = pd.concat(frames)

NameError: name 'files' is not defined

## Conclusion

The research question we want to demonstrate is how climate has changed over 100 years, from 1919 to 2019. We first create unique s3 buckets since the datasets are huge, it is impossible for us to actually read in the dataset directly; so, we have to use s3 buckets as a storage tool and copy the datasets we need into the buckets then do some processing. After cleaning the dataset and giving column names, we can compare the two plots of 1919 and 2019 with average measure of each element recorded directly and we can see that there is an approximately 10 degrees Celsius difference in average Temperature, indicating the climate has changed greatly over years.

However, after selecting the specific columns that we need and comparing the maximum and minimum temperature on average, we find out that there is actually a slight decrease of 0.5 degrees Celsius in maximum temperature and a 1 degree Celsius increase in minimum temperature. This does not fit the original assumption of great change in climate, so it raises a question about why the average temperature changes so much while the maximum even decreases a little.

In this case, we would like to take a closer look and see the difference in the distribution of the maximum temperature through the whole year. However, because of the limitation of memory, we can only see the change of average maximum temperature plot for 1919. Yet, we can still conclude that although the average maximum and minimum temperature do not seem to change much, the climate warming is happening since the increase in average temperature indicates an increase in hotter days in a year.

After finishing the notebook, we choose a theme and several extensions to make the webpage more reader friendly. Then, we use the code jupyter convert in the terminal of SageMaker to convert it into an html. Then we download the file and upload to the s3 bucket, setting the permissions to be open to the public.

From our work, people can get an idea of how climate changes in 100 years. They should also realize if they keep improving the technology but not at the same time protect the environment, there will be more hotter days and the minimum temperature will also rise. If they want to do analysis to see how other indexes of climate change, they can also view our project as a demo, as they can learn how to access data from s3 bucket from our work, which is a useful way to process large samples of data.

To improve our work, as people may want to see more directly from the graph of how the temperature changes each month correspondingly over time, we can use a notebook instance with a larger memory so that we can graph the plot of 2019 and compare it to the plot of 1919. Also, our conclusion can be more comprehensive if we can process more datasets and have graphs with larger time series.
