In [153]:
# Dependencies
import pandas as pd
import numpy as np

In [154]:
# Location of the raw per-state land temperature file
temp_path = "data/raw_data/GlobalLandTemperaturesByState.csv"

# Load the raw temperature records into a DataFrame
temp_df = pd.read_csv(filepath_or_buffer=temp_path, encoding="utf-8")

In [155]:
# Keep only rows recorded for the United States
is_us = temp_df["Country"].eq("United States")
temp_df = temp_df.loc[is_us]

# Keep only 1998-01-01 .. 2012-12-31 (both ends inclusive).
# dt is still a string here; ISO dates compare correctly as text.
in_window = temp_df["dt"].between("1998-01-01", "2012-12-31")
temp_df = temp_df.loc[in_window]

In [156]:
# Preview the first rows of the filtered temperature frame
temp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,State,Country
10508,1998-01-01,9.114,0.188,Alabama,United States
10509,1998-02-01,9.828,0.147,Alabama,United States
10510,1998-03-01,12.261,0.187,Alabama,United States
10511,1998-04-01,16.744,0.165,Alabama,United States
10512,1998-05-01,23.599,0.121,Alabama,United States


In [157]:
# Non-null row count per column after the country/date filters
temp_df.count()

dt                               9180
AverageTemperature               9180
AverageTemperatureUncertainty    9180
State                            9180
Country                          9180
dtype: int64

In [158]:
# Number of distinct State values left in the filtered data
temp_df["State"].nunique()

51

In [159]:
# States excluded from the analysis.
# NOTE(review): presumably these have no usable rows in the honey production
# data -- confirm and record the actual reason here.
states_to_drop = ["Alaska", "Connecticut", "Delaware", "District Of Columbia", "Massachusetts", "New Hampshire", "Rhode Island"]

# Remove the excluded states. The explicit .copy() makes temp_df an
# independent frame, so column assignments in later cells do not operate on a
# filtered slice of the original data (which triggers SettingWithCopyWarning).
temp_df = temp_df[~temp_df["State"].isin(states_to_drop)].copy()

# Sanity check: number of states remaining after the exclusion
print(temp_df["State"].nunique())

44


In [160]:
# Non-null row count per column after removing the excluded states
temp_df.count()

dt                               7920
AverageTemperature               7920
AverageTemperatureUncertainty    7920
State                            7920
Country                          7920
dtype: int64

In [161]:
# Inspect column dtypes before converting the dt column
temp_df.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
State                             object
Country                           object
dtype: object

In [162]:
# List the remaining state names
temp_df.State.unique()

array(['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Florida', 'Georgia (State)', 'Hawaii', 'Idaho', 'Illinois',
       'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
       'Maryland', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
       'Montana', 'Nebraska', 'Nevada', 'New Jersey', 'New Mexico',
       'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
       'Oregon', 'Pennsylvania', 'South Carolina', 'South Dakota',
       'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [163]:
# Replace Georgia (State) to just Georgia
# Done as a single vectorised, whole-value replacement instead of a Python
# loop over iterrows(); assign() returns a new frame, so no filtered slice is
# mutated in place and the cell can be re-run safely.
temp_df = temp_df.assign(
    State=temp_df["State"].replace({"Georgia (State)": "Georgia"})
)

In [164]:
# Re-list the state names to confirm the Georgia label was normalised
temp_df.State.unique()

array(['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [165]:
# Parse the YYYY-MM-DD strings in dt into real timestamps
parsed_dates = pd.to_datetime(temp_df["dt"], format="%Y-%m-%d")
temp_df["dt"] = parsed_dates

In [166]:
# Check the dtypes again after the datetime conversion
temp_df.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
State                                    object
Country                                  object
dtype: object

In [167]:
# True if any cell anywhere in the frame is missing
temp_df.isnull().values.any()

False

In [168]:
# Reduce each timestamp to its calendar year.
# The dt column is overwritten, so from here on it holds years, not dates.
year_only = temp_df["dt"].dt.year
temp_df["dt"] = year_only

In [169]:
# Keep only the columns needed for the yearly per-state summary
keep_cols = ["dt", "AverageTemperature", "State"]
temp_df = temp_df[keep_cols]

In [170]:
# One group per (state, year) pair
group_df = temp_df.groupby(["State", "dt"])

# Summarise each group's temperatures; reset_index() turns the group keys
# back into ordinary columns. The result has two-level column labels.
summary_stats = ["min", "median", "mean", "max"]
temp_clean = group_df.agg(summary_stats).reset_index()

In [171]:
# Inspect the column index produced by the multi-function aggregation
temp_clean.columns

MultiIndex(levels=[['AverageTemperature', 'dt', 'State'], ['min', 'median', 'mean', 'max', '']],
           labels=[[2, 1, 0, 0, 0, 0], [4, 4, 0, 1, 2, 3]])

In [172]:
# Preview the aggregated per-state, per-year statistics
temp_clean.head()

Unnamed: 0_level_0,State,dt,AverageTemperature,AverageTemperature,AverageTemperature,AverageTemperature
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,median,mean,max
0,Alabama,1998,9.114,18.1855,18.786833,28.137
1,Alabama,1999,8.662,18.83,18.271083,28.234
2,Alabama,2000,4.224,17.243,17.911333,28.149
3,Alabama,2001,5.358,17.43,17.67775,27.066
4,Alabama,2002,7.669,19.875,18.022167,27.441


In [173]:
# Flatten the two-level column index by keeping only the inner level
# (i.e. drop level 0)
temp_clean.columns = temp_clean.columns.get_level_values(1)

In [174]:
# Final, flat column names for the temperature summary (positional rename)
summary_columns = [
    "state_name",
    "year",
    "tmin",
    "tmedian",
    "tmean",
    "tmax",
]
temp_clean.columns = summary_columns

In [175]:
# Lookup table: full state/territory name -> two-letter abbreviation.
# Built from (name, abbreviation) pairs; insertion order is alphabetical by name.
states_hash = dict([
    ("Alabama", "AL"),
    ("Alaska", "AK"),
    ("American Samoa", "AS"),
    ("Arizona", "AZ"),
    ("Arkansas", "AR"),
    ("California", "CA"),
    ("Colorado", "CO"),
    ("Connecticut", "CT"),
    ("Delaware", "DE"),
    ("District Of Columbia", "DC"),
    ("Federated States Of Micronesia", "FM"),
    ("Florida", "FL"),
    ("Georgia", "GA"),
    ("Guam", "GU"),
    ("Hawaii", "HI"),
    ("Idaho", "ID"),
    ("Illinois", "IL"),
    ("Indiana", "IN"),
    ("Iowa", "IA"),
    ("Kansas", "KS"),
    ("Kentucky", "KY"),
    ("Louisiana", "LA"),
    ("Maine", "ME"),
    ("Marshall Islands", "MH"),
    ("Maryland", "MD"),
    ("Massachusetts", "MA"),
    ("Michigan", "MI"),
    ("Minnesota", "MN"),
    ("Mississippi", "MS"),
    ("Missouri", "MO"),
    ("Montana", "MT"),
    ("Nebraska", "NE"),
    ("Nevada", "NV"),
    ("New Hampshire", "NH"),
    ("New Jersey", "NJ"),
    ("New Mexico", "NM"),
    ("New York", "NY"),
    ("North Carolina", "NC"),
    ("North Dakota", "ND"),
    ("Northern Mariana Islands", "MP"),
    ("Ohio", "OH"),
    ("Oklahoma", "OK"),
    ("Oregon", "OR"),
    ("Palau", "PW"),
    ("Pennsylvania", "PA"),
    ("Puerto Rico", "PR"),
    ("Rhode Island", "RI"),
    ("South Carolina", "SC"),
    ("South Dakota", "SD"),
    ("Tennessee", "TN"),
    ("Texas", "TX"),
    ("Utah", "UT"),
    ("Vermont", "VT"),
    ("Virgin Islands", "VI"),
    ("Virginia", "VA"),
    ("Washington", "WA"),
    ("West Virginia", "WV"),
    ("Wisconsin", "WI"),
    ("Wyoming", "WY"),
])

In [176]:
# Add an empty-string placeholder column that will hold the abbreviation
temp_clean = temp_clean.assign(state="")

In [177]:
# Preview the frame with the new, still empty, state column
temp_clean.head()

Unnamed: 0,state_name,year,tmin,tmedian,tmean,tmax,state
0,Alabama,1998,9.114,18.1855,18.786833,28.137,
1,Alabama,1999,8.662,18.83,18.271083,28.234,
2,Alabama,2000,4.224,17.243,17.911333,28.149,
3,Alabama,2001,5.358,17.43,17.67775,27.066,
4,Alabama,2002,7.669,19.875,18.022167,27.441,


In [178]:
# Fill the state column with each row's abbreviation from states_hash.
# Series.map does the dictionary lookup for the whole column at once, instead
# of scanning every dictionary entry for every row in Python.
# Names missing from states_hash map to NaN; fillna("") keeps them as the
# empty string, matching the blank placeholder value.
temp_clean["state"] = temp_clean["state_name"].map(states_hash).fillna("")

In [179]:
# Preview the frame to check the abbreviations were filled in
temp_clean.head()

Unnamed: 0,state_name,year,tmin,tmedian,tmean,tmax,state
0,Alabama,1998,9.114,18.1855,18.786833,28.137,AL
1,Alabama,1999,8.662,18.83,18.271083,28.234,AL
2,Alabama,2000,4.224,17.243,17.911333,28.149,AL
3,Alabama,2001,5.358,17.43,17.67775,27.066,AL
4,Alabama,2002,7.669,19.875,18.022167,27.441,AL


In [180]:
# Round every temperature statistic to one decimal place
stat_columns = ["tmin", "tmedian", "tmean", "tmax"]
temp_clean = temp_clean.round(dict.fromkeys(stat_columns, 1))

In [181]:
# Write the cleaned temperature summary to disk, without the row index
temp_out_path = "data/clean_data/temp_1998_2012.csv"
temp_clean.to_csv(temp_out_path, index=False, encoding="utf-8")

In [182]:
# Location of the raw honey production file
honey_path = "data/raw_data/honeyproduction.csv"

# Load the file, order rows by state then year, and renumber the index 0..n-1
honey_df = (
    pd.read_csv(honey_path, encoding="utf-8")
    .sort_values(by=["state", "year"])
    .reset_index(drop=True)
)

In [183]:
# Preview the sorted honey production data
honey_df.head()

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998
1,AL,17000.0,68,1156000.0,185000.0,0.56,647000.0,1999
2,AL,16000.0,78,1248000.0,187000.0,0.59,736000.0,2000
3,AL,14000.0,73,1022000.0,235000.0,0.72,736000.0,2001
4,AL,12000.0,86,1032000.0,103000.0,1.18,1218000.0,2002


In [184]:
# Non-null row count per column in the honey data
honey_df.count()

state          626
numcol         626
yieldpercol    626
totalprod      626
stocks         626
priceperlb     626
prodvalue      626
year           626
dtype: int64

In [185]:
# Join honey production with the temperature summary on (state, year).
# Inner join: only state/year pairs present in both frames are kept.
merged_df = honey_df.merge(temp_clean, how="inner", on=["state", "year"])

In [186]:
# Preview the merged honey + temperature frame
merged_df.head()

Unnamed: 0,state,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,state_name,tmin,tmedian,tmean,tmax
0,AL,16000.0,71,1136000.0,159000.0,0.72,818000.0,1998,Alabama,9.1,18.2,18.8,28.1
1,AL,17000.0,68,1156000.0,185000.0,0.56,647000.0,1999,Alabama,8.7,18.8,18.3,28.2
2,AL,16000.0,78,1248000.0,187000.0,0.59,736000.0,2000,Alabama,4.2,17.2,17.9,28.1
3,AL,14000.0,73,1022000.0,235000.0,0.72,736000.0,2001,Alabama,5.4,17.4,17.7,27.1
4,AL,12000.0,86,1032000.0,103000.0,1.18,1218000.0,2002,Alabama,7.7,19.9,18.0,27.4


In [187]:
# Write the merged honey + temperature data to disk, without the row index
merged_out_path = "data/clean_data/honey_temp_1998_2012.csv"
merged_df.to_csv(merged_out_path, index=False, encoding="utf-8")

In [188]:
# Pairwise correlation between the numeric columns.
# merged_df also holds text columns (state, state_name). Older pandas silently
# skipped them, but pandas >= 2.0 raises when corr() meets non-numeric data,
# so the numeric columns are selected explicitly to give the same table on
# every version.
merged_df.select_dtypes(include="number").corr()

Unnamed: 0,numcol,yieldpercol,totalprod,stocks,priceperlb,prodvalue,year,tmin,tmedian,tmean,tmax
numcol,1.0,0.243515,0.953594,0.825929,-0.232701,0.912796,0.008709,-0.119068,-0.127629,-0.132249,-0.099587
yieldpercol,0.243515,1.0,0.396252,0.367812,-0.358646,0.278977,-0.232092,0.176676,0.260232,0.226306,0.175801
totalprod,0.953594,0.396252,1.0,0.87883,-0.264499,0.907236,-0.055556,-0.142884,-0.10681,-0.126047,-0.081457
stocks,0.825929,0.367812,0.87883,1.0,-0.305867,0.72856,-0.119602,-0.28013,-0.245699,-0.262522,-0.177148
priceperlb,-0.232701,-0.358646,-0.264499,-0.305867,1.0,-0.089567,0.693984,0.027347,0.05389,0.050694,0.094775
prodvalue,0.912796,0.278977,0.907236,0.72856,-0.089567,1.0,0.139451,-0.155445,-0.127518,-0.147811,-0.085078
year,0.008709,-0.232092,-0.055556,-0.119602,0.693984,0.139451,1.0,-0.045318,-0.008069,-0.019342,0.03818
tmin,-0.119068,0.176676,-0.142884,-0.28013,0.027347,-0.155445,-0.045318,1.0,0.878495,0.928113,0.654322
tmedian,-0.127629,0.260232,-0.10681,-0.245699,0.05389,-0.127518,-0.008069,0.878495,1.0,0.982939,0.833108
tmean,-0.132249,0.226306,-0.126047,-0.262522,0.050694,-0.147811,-0.019342,0.928113,0.982939,1.0,0.835726
