# Team Green Project:  Scrub Global Temperature File
### Note:  Temperatures are in Celsius

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
# Raise pandas' display limits (up to 500 rows/columns, 1000 characters wide)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Location of the raw global temperature CSV
temp_data_to_load = "data/GlobalTemperatures.csv"

# Load the raw temperature records into a DataFrame
temp_data = pd.read_csv(filepath_or_buffer=temp_data_to_load)

In [3]:
# Preview the first five rows of the raw data
temp_data.head(n=5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [4]:
# Keep only the date plus the combined land-and-ocean average temperature
# and its uncertainty; every other column is left behind.
temp_data_1 = temp_data.loc[:, [
    'dt',
    'LandAndOceanAverageTemperature',
    'LandAndOceanAverageTemperatureUncertainty',
]]
temp_data_1

Unnamed: 0,dt,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,,
1,1750-02-01,,
2,1750-03-01,,
3,1750-04-01,,
4,1750-05-01,,
...,...,...,...
3187,2015-08-01,17.589,0.057
3188,2015-09-01,17.049,0.058
3189,2015-10-01,16.290,0.062
3190,2015-11-01,15.252,0.063


In [6]:
# Discard every row in which any of the retained columns is NaN
temp_data_2 = temp_data_1.dropna(axis=0, how='any')
temp_data_2

Unnamed: 0,dt,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
1200,1850-01-01,12.833,0.367
1201,1850-02-01,13.588,0.414
1202,1850-03-01,14.043,0.341
1203,1850-04-01,14.667,0.267
1204,1850-05-01,15.507,0.249
...,...,...,...
3187,2015-08-01,17.589,0.057
3188,2015-09-01,17.049,0.058
3189,2015-10-01,16.290,0.062
3190,2015-11-01,15.252,0.063


In [11]:
# Create a Year column from the first four characters of the date string
# (dt values look like 'YYYY-MM-DD').
# assign() returns a new DataFrame instead of writing into a frame derived
# from another one, so there is no SettingWithCopyWarning to silence and the
# cell produces the same result every time it is re-run.
# The column is accessed as ['dt'] rather than .dt so it is not mistaken for
# the pandas datetime accessor.
temp_data_2 = temp_data_2.assign(Year=temp_data_2['dt'].str[:4])
temp_data_2

Unnamed: 0,dt,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,Year
1200,1850-01-01,12.833,0.367,1850
1201,1850-02-01,13.588,0.414,1850
1202,1850-03-01,14.043,0.341,1850
1203,1850-04-01,14.667,0.267,1850
1204,1850-05-01,15.507,0.249,1850
...,...,...,...,...
3187,2015-08-01,17.589,0.057,2015
3188,2015-09-01,17.049,0.058,2015
3189,2015-10-01,16.290,0.062,2015
3190,2015-11-01,15.252,0.063,2015


In [21]:
# Group by Year and average the two numerical columns.
# The columns are selected explicitly so the string column 'dt' is never
# passed to mean(): recent pandas versions no longer drop non-numeric
# columns silently and raise a TypeError instead.
temp_data_3 = (
    temp_data_2
    .groupby('Year')[[
        'LandAndOceanAverageTemperature',
        'LandAndOceanAverageTemperatureUncertainty',
    ]]
    .mean()
    .reset_index()
)
temp_data_3

Unnamed: 0,Year,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1850,14.867167,0.308167
1,1851,14.991833,0.312083
2,1852,15.0065,0.316417
3,1853,14.955167,0.283833
4,1854,14.991,0.276417
5,1855,15.021083,0.291167
6,1856,14.879333,0.260167
7,1857,14.75825,0.25975
8,1858,14.8815,0.251833
9,1859,14.929917,0.25825


In [35]:
# Cast the Year labels to integers so they compare numerically,
# then keep only the rows for 1990 and later.
temp_data_3['Year'] = temp_data_3['Year'].astype(str).astype(int)
target_years = temp_data_3['Year'].ge(1990)
temp_data_3 = temp_data_3.loc[target_years]
temp_data_3

Unnamed: 0,Year,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
140,1990,15.629333,0.05725
141,1991,15.598,0.055083
142,1992,15.453,0.058167
143,1993,15.466417,0.059167
144,1994,15.535,0.058583
145,1995,15.637833,0.060417
146,1996,15.524667,0.0595
147,1997,15.713833,0.059167
148,1998,15.826,0.063
149,1999,15.600333,0.063333


In [36]:
# Write the scrubbed yearly averages to a .csv file (header row, no index column).
# NOTE(review): the raw file is read from 'data/' (lowercase) while this writes
# to 'Data/'; on a case-sensitive filesystem those are different directories.
# Confirm which spelling is intended.
temp_data_3.to_csv(path_or_buf=r'Data/temp_data.csv', header=True, index=False)