# EDA 1

> "Which country has the most severe overall temperature change from decade to decade?   
>    Please elaborate on how you interpret severe change and decade to decade."

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the per-country temperature records, parsing `dt` as datetimes.
# Rows with a missing value in any column are dropped.
# (`infer_datetime_format` is deprecated in pandas 2.x and has no effect there,
# so it is omitted.)
pdf = (
    pd.read_csv('./data/GlobalLandTemperaturesByCountry.csv', parse_dates=['dt'])
    .dropna()
)

In [9]:
# NOTE(review): this cell carries execution count 9 and its output already
# shows the `decade` column that a later cell adds, so it was run out of
# order; on a fresh top-to-bottom run that column will not appear here.
pdf.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,decade
0,1743-11-01,4.384,2.294,Åland,1740
5,1744-04-01,1.53,4.68,Åland,1740
6,1744-05-01,6.702,1.789,Åland,1740
7,1744-06-01,11.609,1.577,Åland,1740
8,1744-07-01,15.342,1.41,Åland,1740


Define a decade purely by the calendar: each year is floored to the start of its decade (e.g. 1995 → 1990).

In [4]:
# Floor each record's year to the start of its calendar decade (1995 -> 1990).
# Vectorised through the datetime accessor instead of a per-row lambda.
pdf['decade'] = pdf['dt'].dt.year // 10 * 10

In [5]:
def get_longest(dates, datediff=10):
    """Return the positions of the longest run of consecutive dates.

    Two neighbouring entries belong to the same run when the difference
    between them is at most ``datediff``. When several runs share the
    maximum length, the earliest one is returned.

    Parameters
    ----------
    dates : array-like
        Numeric date values (here: decade start years). Presumably sorted
        ascending; ``decade_diff`` sorts by decade before calling.
    datediff : int, default 10
        Largest gap between neighbours that still counts as consecutive.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``dates`` covering the longest run. A single
        date yields ``[0]``; an empty input yields an empty array.
    """
    dates = np.asarray(dates)

    # Nothing to scan: return an empty index array instead of failing
    if dates.size == 0:
        return np.array([], dtype=np.intp)

    # A run starts at position 0 and right after every gap wider than datediff
    breaks = np.flatnonzero(np.diff(dates) > datediff) + 1
    starts = np.concatenate(([0], breaks))
    ends = np.concatenate((breaks, [dates.size]))

    # argmax keeps the first maximum, so ties go to the earliest run
    longest = np.argmax(ends - starts)

    # Return positions directly rather than matching values back with in1d
    return np.arange(starts[longest], ends[longest])

def _dec_diff(date_col, target_col):
    """Build a per-group reducer bound to the given column names.

    The returned callable takes a DataFrame, finds the longest consecutive
    run in ``date_col`` via ``get_longest`` and returns a tuple
    ``(std of target_col over that run, first date of run, last date of run)``.
    """

    def meth(df):

        decades = df[date_col].values

        # Positions of the longest consecutive sequence of dates
        run_idx = get_longest(decades)
        run_decades = decades[run_idx]

        # Spread of the target values restricted to that sequence
        spread = np.std(df[target_col].values[run_idx])

        return (spread, run_decades.min(), run_decades.max())

    return meth

def decade_diff(df, date_range=None):
    """Rank countries by the spread of their decade-mean temperatures.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'dt'``, ``'Country'``, ``'decade'`` and
        ``'AverageTemperature'``.
    date_range : sequence of two elements, optional
        Inclusive ``(start, end)`` bounds compared against ``df['dt']``.
        ``None`` or an empty sequence disables the filter.

    Returns
    -------
    pandas.DataFrame
        One row per country with the columns ``Country``, ``temp_change``
        (standard deviation of the decade means over the longest consecutive
        run of decades), ``min_decade`` and ``max_decade`` (first and last
        decade of that run), sorted by ``temp_change`` in descending order.
    """

    # Columns to process
    target_col = 'AverageTemperature'
    date_col = 'decade'
    grouping = ['Country']

    # Per-country reducer bound to the columns above
    dec_diff = _dec_diff(date_col, target_col)

    # Filter to the date range (if given). No copy is needed in the other
    # branch because nothing below mutates the input frame.
    if date_range is not None and len(date_range) > 0:
        in_range = (df['dt'] >= date_range[0]) & (df['dt'] <= date_range[1])
        pdf_ = df[in_range]
    else:
        pdf_ = df

    # First, get the average per country and decade, ordered by decade so
    # each country's rows reach the reducer in chronological order
    decade_means = (
        pdf_.groupby(grouping + [date_col])[target_col].mean()
        .reset_index()
        .sort_values(date_col)
        .reset_index(drop=True)
    )

    # Second, reduce each country to a (std, first decade, last decade) tuple
    stats = decade_means.groupby(grouping).apply(dec_diff)

    # Expand the tuples into named columns in a single pass
    result = pd.DataFrame(
        stats.tolist(),
        index=stats.index,
        columns=['temp_change', 'min_decade', 'max_decade'],
    ).reset_index()

    return result.sort_values('temp_change', ascending=False) \
        .reset_index(drop=True)


Procedure

First aggregation:

1. Group on Country and decade
2. Get the mean temp for each decade
3. Save the result

Second aggregation: 

1. Group on Country 
2. Get the longest consecutive sequence of decades
3. Compute the standard deviation of the per-decade mean temperatures over that sequence,  
    and record its first and last decade
    
Sort the results based on temperature change.

In [6]:
# Run the two-step aggregation on the full frame (no date-range filter)
pdfg = decade_diff(pdf)

In [7]:
# decade_diff sorts by temp_change descending, so these are the ten largest
pdfg.head(10)

Unnamed: 0,Country,temp_change,min_decade,max_decade
0,Kuwait,2.99388,1830,2010
1,Georgia,1.369465,1770,2010
2,Afghanistan,1.115474,1830,2010
3,Uzbekistan,1.069989,1810,2010
4,Canada,0.937557,1810,2010
5,Turkmenistan,0.93073,1810,2010
6,Greenland,0.854224,1820,2010
7,Azerbaijan,0.849617,1800,2010
8,Armenia,0.848696,1780,2010
9,Denmark,0.843686,1820,2010


In [8]:
# The ten countries with the smallest temp_change
pdfg.tail(10)

Unnamed: 0,Country,temp_change,min_decade,max_decade
232,Macau,0.380985,1840,2010
233,Cayman Islands,0.373169,1820,2010
234,Cuba,0.365728,1820,2010
235,Bahamas,0.35386,1820,2010
236,Oceania,0.351914,1850,2010
237,Australia,0.350612,1850,2010
238,Papua New Guinea,0.348855,1880,2010
239,Guam,0.345019,1900,2010
240,Northern Mariana Islands,0.345019,1900,2010
241,Timor Leste,0.331007,1870,2010
