# Temperature Data

- Replace country names with 2-letter ISO 3166-1 alpha-2 codes using fuzzy matching
- Aggregate the last 20 years of historic temperature data to country / month level (mean, min, max)

In [1]:
import pandas as pd
import pycountry
import datetime

In [2]:
# Resolve a country name to its 2-letter ISO 3166-1 alpha-2 code via fuzzy matching.
def get_country_code(country_name):
    """Return the alpha-2 code of the best fuzzy match for ``country_name``.

    Parameters
    ----------
    country_name : str
        Free-text country name to look up.

    Returns
    -------
    str
        ``alpha_2`` of the first result of ``pycountry.countries.search_fuzzy``,
        or the sentinel ``"no match"`` when the input is not a non-blank string
        or the search finds nothing.
    """
    # Missing values (None / NaN) and blank strings cannot be matched
    # meaningfully, so short-circuit instead of querying pycountry.
    if not isinstance(country_name, str) or not country_name.strip():
        return "no match"
    try:
        return pycountry.countries.search_fuzzy(country_name)[0].alpha_2
    except LookupError:
        # Only the "nothing found" case is mapped to the sentinel; catching
        # LookupError (which also covers IndexError) instead of a bare except
        # keeps KeyboardInterrupt and genuine bugs visible.
        return "no match"

In [3]:
# load the raw city-level temperature records from CSV
df_temp = pd.read_csv(
    filepath_or_buffer="./data/GlobalLandTemperaturesByCity.csv",
)

In [4]:
# summarise the raw frame: row count, column dtypes and memory usage
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [5]:
# preview the first rows of the raw data
df_temp.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [6]:
# parse the 'dt' strings into datetime values so the .dt accessor can be used below

df_temp['dt'] = df_temp['dt'].pipe(pd.to_datetime)

In [7]:
# keep only rows whose year falls within the 20 most recent calendar years in the data
# (strictly greater than "latest year - 20")

df_temp_last20y = df_temp.loc[
    df_temp['dt'].dt.year.gt(df_temp['dt'].max().year - 20)
]

In [8]:
# aggregate to one row per country and calendar month:
# mean, min and max of the AverageTemperature values in each group

df_temp_agg = (
    df_temp_last20y
    .groupby(['Country', df_temp_last20y['dt'].dt.month])[['AverageTemperature']]
    .agg(['mean', 'min', 'max'])
    .reset_index()
)

In [9]:
# preview the aggregated frame (columns are still a two-level header at this point)
df_temp_agg.head()

Unnamed: 0_level_0,Country,dt,AverageTemperature,AverageTemperature,AverageTemperature
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,min,max
0,Afghanistan,1,1.7142,-7.7,8.589
1,Afghanistan,2,3.961238,-4.263,11.794
2,Afghanistan,3,9.753187,2.872,18.52
3,Afghanistan,4,15.361531,9.273,23.552
4,Afghanistan,5,21.18115,14.518,29.882


In [10]:
# flatten the two-level column header into single names of the form "<level0>_<level1>";
# columns whose second level is empty keep a trailing underscore (e.g. "Country_", "dt_")

df_temp_agg.columns = df_temp_agg.columns.map('_'.join)

In [11]:
# display the flattened frame (pandas truncates the middle rows in the rendered output)
df_temp_agg

Unnamed: 0,Country_,dt_,AverageTemperature_mean,AverageTemperature_min,AverageTemperature_max
0,Afghanistan,1,1.714200,-7.700,8.589
1,Afghanistan,2,3.961238,-4.263,11.794
2,Afghanistan,3,9.753187,2.872,18.520
3,Afghanistan,4,15.361531,9.273,23.552
4,Afghanistan,5,21.181150,14.518,29.882
...,...,...,...,...,...
1903,Zimbabwe,8,18.153864,15.109,20.649
1904,Zimbabwe,9,21.581586,18.888,23.111
1905,Zimbabwe,10,23.366481,19.573,25.828
1906,Zimbabwe,11,23.657406,20.880,25.529


In [12]:
# add the ISO alpha-2 code for every country name
#
# Each country name repeats once per month, so resolve every distinct name a
# single time and map the result back onto the rows; this yields the same
# values as a row-wise apply while running far fewer fuzzy searches.
country_code_by_name = {
    name: get_country_code(name)
    for name in df_temp_agg["Country_"].unique()
}
df_temp_agg['country_id'] = df_temp_agg["Country_"].map(country_code_by_name)

In [15]:
# drop the country name column now that 'country_id' carries the ISO code

df_temp_agg = df_temp_agg.drop(columns=["Country_"])

In [16]:
# preview the frame after replacing the country name with 'country_id'
df_temp_agg.head()

Unnamed: 0,dt_,AverageTemperature_mean,AverageTemperature_min,AverageTemperature_max,country_id
0,1,1.7142,-7.7,8.589,AF
1,2,3.961238,-4.263,11.794,AF
2,3,9.753187,2.872,18.52,AF
3,4,15.361531,9.273,23.552,AF
4,5,21.18115,14.518,29.882,AF


In [17]:
# keep only rows whose country name could be resolved to an ISO code

df_temp_agg = df_temp_agg.loc[df_temp_agg['country_id'].ne('no match')]

In [18]:
# give the flattened columns their final, descriptive names
df_temp_agg = df_temp_agg.rename(
    columns=dict(
        dt_='month',
        AverageTemperature_mean='temperature_mean',
        AverageTemperature_min='temperature_min',
        AverageTemperature_max='temperature_max',
    )
)

In [19]:
# write the country/month temperature table to the staging area as a
# semicolon-separated file without the index
# NOTE(review): an ISO code of "NA" (Namibia) would be read back as a missing value
# by pandas.read_csv defaults — confirm the consumer handles this (e.g. keep_default_na=False)
(
    df_temp_agg
    .loc[:, ['country_id', 'month', 'temperature_mean', 'temperature_min', 'temperature_max']]
    .to_csv('../staging/temperature.csv', sep=";", index=False)
)