In [8]:
import pandas as pd
import numpy as np


### Defining functions

In [9]:
# Determining seasons by month
def get_season (month, hemisphere = None):
    northern_hemisphere_seasons = {1: 'Winter', 2: 'Winter', 3: 'Winter', 4: 'Spring', 5: 'Spring', 6: 'Spring', 7: 'Summer', 8: 'Summer', 9: 'Summer', 10: 'Fall', 11: 'Fall', 12: 'Fall'}
    southern_hemisphere_seasons = {7: 'Winter', 8: 'Winter', 9: 'Winter', 10: 'Spring', 11: 'Spring', 12: 'Spring', 1: 'Summer', 2: 'Summer', 3: 'Summer', 4: 'Fall', 5: 'Fall', 6: 'Fall'}

    if hemisphere.lower() == "northern":
        season = northern_hemisphere_seasons[month]
    elif hemisphere.lower() == "southern":
        season = southern_hemisphere_seasons[month]
    
    return season

In [10]:
# Converting date fraction to datetime
def convert_date_fraction_series_to_datetime(series = None):
    
    # Separate month decimal and year
    month_decimal = np.mod(series, 1)
    year = (series - month_decimal).astype(int)
    
    # Convert month decimal to month integer (1 = January, 2 = February, etc.)
    month = np.round(12 * month_decimal + 0.5).astype(int)
    
    # Concatenate the values together into a string
    date = year.astype(str) + "-" + month.astype(str) + "-01"
    
    # Convert the date strings into datetime values
    series = pd.to_datetime(date, yearfirst = True)
    
    return series

## Importing weather station data

In [11]:
# Import weather stations
station_data = pd.read_csv("../datasets/capstone/downloaded data/berkeley_earth_stations--site_detail.txt", 
                               delimiter = "\t", 
                               skiprows =148, 
                               names = ["Station ID", 
                                        "Station Name", 
                                        "Latitude", 
                                        "Longitude", 
                                        "Elevation (m)", 
                                        "Lat. Uncertainty", 
                                        "Long. Uncertainty", 
                                        "Elev. Uncertainty (m)", 
                                        "Country", 
                                        "State / Province Code", 
                                        "County", 
                                        "Time Zone", 
                                        "WMO ID", 
                                        "Coop ID", 
                                        "WBAN ID", 
                                        "ICAO ID", 
                                        "# of Relocations", 
                                        "# Suggested Relocations", 
                                        "# of Sources", 
                                        "Hash"])


## Formatting weather station data

In [12]:
# Select only relevant columns
station_data = station_data[["Station ID", "Station Name", "Latitude", "Longitude", "Elevation (m)", "Lat. Uncertainty", "Long. Uncertainty", "Elev. Uncertainty (m)", "Country"]]

# Convert values in numerical columns
numeric_columns_in_stations_data = ["Latitude", "Longitude", "Elevation (m)", "Lat. Uncertainty", "Long. Uncertainty", "Elev. Uncertainty (m)"]
station_data.loc[:, numeric_columns_in_stations_data] = station_data[numeric_columns_in_stations_data].apply(pd.to_numeric, errors = 'coerce')

# Remove whitespace from non-numerical columns
station_data.loc[:, "Station Name"] = station_data["Station Name"].str.strip()
station_data.loc[:, "Country"] = station_data["Country"].str.strip()

# Find when countries produce coffee
coffee_harvest_schedule = pd.read_csv("../datasets/capstone/coffee harvest schedule.csv", index_col = 0)

# Concentrate on the stations in countries that produce coffee
stations_in_coffee_producing_countries = station_data[station_data["Country"].isin(coffee_harvest_schedule["Producing Country"])]

# Determing if most of each country's stations are in the northern or southern hemisphere
hemisphere_dictionary = (station_data.groupby(by = "Country")["Latitude"].mean() > 0).map({True: "Northern", False: "Southern"}).to_dict()

# Arabica grows best in elevations 548 m – 1100 m for latitudes between 16° and 24°, or 1097 m – 1920 m for latitudes less that ±10°
arabica_growing_conditions_criteria = (stations_in_coffee_producing_countries["Elevation (m)"] >= 548) & (stations_in_coffee_producing_countries["Elevation (m)"] <= 1100) & (stations_in_coffee_producing_countries["Latitude"].abs() > 16) & (stations_in_coffee_producing_countries["Latitude"].abs() <= 24)
arabica_growing_conditions_criteria = arabica_growing_conditions_criteria | ((stations_in_coffee_producing_countries["Elevation (m)"] >= 1097) & (stations_in_coffee_producing_countries["Elevation (m)"] <= 1920) & (stations_in_coffee_producing_countries["Latitude"].abs() <= 16))

# Robusta grows best in elevations 0 m – 914 m in latitudes between ±10°
robusta_growing_conditions_criteria = (stations_in_coffee_producing_countries["Elevation (m)"] <= 914) & (stations_in_coffee_producing_countries["Latitude"].abs() <= 10)

# Select the stations in the ideal coffee growing regions
stations_in_arabica_conditions = stations_in_coffee_producing_countries[arabica_growing_conditions_criteria]["Station ID"]
stations_in_robusta_conditions = stations_in_coffee_producing_countries[robusta_growing_conditions_criteria]["Station ID"]

stations_in_arabica_conditions_dictionary = dict.fromkeys(stations_in_arabica_conditions.values, True)
stations_in_robusta_conditions_dictionary = dict.fromkeys(stations_in_robusta_conditions.values, True)


## Importing temperature data

In [13]:
# Import raw temperature data from Berkeley Earth
temperatures_for_all_stations = pd.read_csv("../datasets/capstone/downloaded data/berkeley_earth -- data.txt", 
            delimiter = "\t",  
            skiprows = 111, 
            names = ["Station ID", 
                     "Series Number", 
                     "Date", 
                     "Temperature (C)", 
                     "Uncertainty (C)", 
                     "Observations", 
                     "Time of Observation"])



## Formatting temperature station data

In [14]:
# Remove unnecessary labels
temperatures_for_all_stations.drop(labels = ["Series Number", "Uncertainty (C)", "Observations", "Time of Observation"], axis = 1, inplace = True)

# Select the temperature data for stations in coffee growing regions, and add country names
temperatures_for_coffee_producing_countries = temperatures_for_all_stations[temperatures_for_all_stations["Station ID"].isin(stations_in_coffee_producing_countries["Station ID"])]
temperatures_for_coffee_producing_countries = stations_in_coffee_producing_countries[["Station ID", "Country"]].merge(temperatures_for_coffee_producing_countries, on = "Station ID")

# Add columns indicating each station's hemisphere (for seasonality calculations later)
temperatures_for_coffee_producing_countries["Hemisphere"] = temperatures_for_coffee_producing_countries["Country"].map(hemisphere_dictionary)

# Designate stations in areas that grow arabica and robusta coffee
temperatures_for_coffee_producing_countries["Arabica Production"] = temperatures_for_coffee_producing_countries["Station ID"].map(stations_in_arabica_conditions_dictionary)
temperatures_for_coffee_producing_countries["Robusta Production"] = temperatures_for_coffee_producing_countries["Station ID"].map(stations_in_robusta_conditions_dictionary)
temperatures_for_coffee_producing_countries = temperatures_for_coffee_producing_countries.fillna(False)

# Keep the stations in areas that grow arabica or robusta coffee
temperature_data = temperatures_for_coffee_producing_countries[
    temperatures_for_coffee_producing_countries["Arabica Production"]
    | temperatures_for_coffee_producing_countries["Robusta Production"]]

# Drop the Station ID column since it's no longer needed
temperature_data.drop("Station ID", axis = 1, inplace = True)

# Convert dates to datetime values
temperature_data.loc[:, "Date"] = convert_date_fraction_series_to_datetime(temperature_data["Date"])

# Determine if an observation occurs within the harvest season for each country. 
# It's significantly easier to perform this before reindexing by time.
# for country in temperature_data["Country"].unique():
#     for row in temperature_data[temperature_data["Country"] == country].head().iterrows():
#         crop = ("Robusta", "Arabica")[row[1]["Arabica Production"]]
#         month = row[1]["Date"].month
#         harvest_schedule_range = coffee_harvest_schedule[(coffee_harvest_schedule["Producing Country"] == country) & (coffee_harvest_schedule[crop])][["Harvest Begins", "Harvest Ends"]].values.tolist()
#         if len(harvest_schedule_range) != 0:
#             harvest_schedule = harvest_schedule_range[0]
#             if harvest_schedule[0] < harvest_schedule[1]:
#                 temperature_data.ix[row[0], "Harvest Season"] = (harvest_schedule[0] <= month <= harvest_schedule[1])
#             elif harvest_schedule[0] > harvest_schedule[1]:
#                 temperature_data.ix[row[0], "Harvest Season"] = (harvest_schedule[0] <= month + 12) and (month <= harvest_schedule[1])
                
# temperature_data["Harvest Season"] = temperature_data["Harvest Season"].map({True: True, False: False, np.NaN: False})
                
# Index by date
temperature_data.set_index("Date", inplace = True)
# temperature_data = temperature_data.sort_index()


# Add seasons columns
temperature_data["Season"] = temperature_data.index.month
temperature_data.ix[temperature_data["Hemisphere"] == "Northern", "Season"] = temperature_data[temperature_data["Hemisphere"] == "Northern"]["Season"].apply(lambda x: get_season(x, hemisphere = "Northern"))
temperature_data.ix[temperature_data["Hemisphere"] == "Southern", "Season"] = temperature_data[temperature_data["Hemisphere"] == "Southern"]["Season"].apply(lambda x: get_season(x, hemisphere = "Southern"))

# Add frost likelihood
temperature_data["Frost likelihood"] = temperature_data["Temperature (C)"] ** -2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Exporting final temperatures dataframe

In [16]:
temperature_data.to_csv("../datasets/capstone/temperature-in-coffee-growing-regions--from-berkeley-earth.csv")
