In [1]:
# Import dependencies
import pandas as pd
import datetime

In [2]:
# Read in csv
df = pd.read_csv("GlobalLandTemperaturesByState.csv")
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
0,1855-05-01,25.544,1.171,Acre,Brazil
1,1855-06-01,24.228,1.103,Acre,Brazil
2,1855-07-01,24.371,1.044,Acre,Brazil
3,1855-08-01,25.427,1.073,Acre,Brazil
4,1855-09-01,25.675,1.014,Acre,Brazil


In [3]:
# Filter Country for only US
df = df[df.Country=="United States"]
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
7458,1743-11-01,10.722,2.898,Alabama,United States
7459,1743-12-01,,,Alabama,United States
7460,1744-01-01,,,Alabama,United States
7461,1744-02-01,,,Alabama,United States
7462,1744-03-01,,,Alabama,United States


In [4]:
# Check data types
df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
State                             object
Country                           object
dtype: object

In [5]:
# Convert dt to datetime
df["dt"] = pd.to_datetime(df["dt"])

In [6]:
df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
State                                    object
Country                                  object
dtype: object

In [7]:
# Filter for temperatures after 1953

df = df[(df["dt"] >= "1953-01-01")]
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
9968,1953-01-01,9.709,0.156,Alabama,United States
9969,1953-02-01,9.626,0.127,Alabama,United States
9970,1953-03-01,14.707,0.096,Alabama,United States
9971,1953-04-01,15.922,0.162,Alabama,United States
9972,1953-05-01,23.291,0.19,Alabama,United States


In [8]:
# Check if any na columns
df.isna().sum()

dt                               0
AverageTemperature               2
AverageTemperatureUncertainty    2
State                            0
Country                          0
dtype: int64

In [9]:
# Drop Na's
df = df.dropna()
df.isna().sum()

dt                               0
AverageTemperature               0
AverageTemperatureUncertainty    0
State                            0
Country                          0
dtype: int64

In [10]:
#Covert temps to Fahrenheit
# Code taken from https://www.tutorialspoint.com/write-a-program-in-python-pandas-to-convert-a-dataframe-celsius-data-column-into-fahrenheit
df = df.assign(AverageTemperatureF = lambda x: (9/5)*x["AverageTemperature"]+32)
df = df.assign(AverageTemperatureUncertaintyF = lambda x: (9/5)*x["AverageTemperatureUncertainty"]+32)
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country,AverageTemperatureF,AverageTemperatureUncertaintyF
9968,1953-01-01,9.709,0.156,Alabama,United States,49.4762,32.2808
9969,1953-02-01,9.626,0.127,Alabama,United States,49.3268,32.2286
9970,1953-03-01,14.707,0.096,Alabama,United States,58.4726,32.1728
9971,1953-04-01,15.922,0.162,Alabama,United States,60.6596,32.2916
9972,1953-05-01,23.291,0.19,Alabama,United States,73.9238,32.342


In [11]:
# Drop old temp columns

df = df.drop(["AverageTemperature", "AverageTemperatureUncertainty"], axis=1)
df.head()

Unnamed: 0,dt,State,Country,AverageTemperatureF,AverageTemperatureUncertaintyF
9968,1953-01-01,Alabama,United States,49.4762,32.2808
9969,1953-02-01,Alabama,United States,49.3268,32.2286
9970,1953-03-01,Alabama,United States,58.4726,32.1728
9971,1953-04-01,Alabama,United States,60.6596,32.2916
9972,1953-05-01,Alabama,United States,73.9238,32.342


In [12]:
# Export to csv
df.to_csv("temperature_data_clean.csv", index = False)