# Backend for Peaking Analysis Dashboard

In [10]:
#Import data science libraries
import pandas as pd
import numpy as np
import simplejson
#Calculate current year for use in peaking criteria 
from datetime import date
current_year = date.today().year
#Import libraries for sending email updates
import smtplib
from email.mime.text import MIMEText
#Set display options
pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows',1000)

In [12]:
def read_in_data_from_2017_peaking_analysis(path):
    """
    Reads in data from 2017 peaking analysis and reshapes for use in programme.
    INPUT: Excel file (.xlsx)
    OUTPUT: Pandas DataFrame
    """
    df = pd.read_excel(path, sheet_name = 'Mastersheet', header = 2)
    df = df.iloc[0:80,0:51] #select relevant rows and columns 
    df.drop('Protocol', axis = 1, inplace = True) #drop protocol column 
    #df = df[df['Peaked? (Criteria 1, 2, 3 and 4 all met)']!='Do not use'] #filter out records marked as "do not use"
    df = df.iloc[0:80,0:31] #select relevant columns (ignore 'peaked criteria and helper columns')
    df = pd.melt(df, id_vars=["City", "Data Source"], var_name="Year", value_name="Emissions").sort_values(['City', 'Year']).reset_index(drop=True) #reshape dataframe
    df['Data Source'] = df['Data Source'].apply(clean_data_source) #clean data source columns [TEMPORARY FIX]
    df.dropna(subset=['Emissions'], inplace = True) #Drop rows where emissions is zero 
    return df

In [13]:
def clean_data_source(x):
    """
    Temporary fix while data sources are not updated in peaking analysis
    """
    if x == 'A':
        return 1
    if x == 'B':
        return 2
    if x == 'GHG' or x == 'CO2 only':
        return 7
    if pd.isnull(x):
        return 2
    if x == 2:
        return 1 
    else:
        return x

In [14]:
def read_in_data_from_gpc_tracker(path):
    """
    Reads in data from GPC Tracker and reformats for use in programme.
    INPUT: Excel file (.xlsx)
    OUTPUT: Pandas DataFrame
    """
    df = pd.read_excel(path, sheet_name = 'GPC data_Live', header = 2) #Read in gpc tracker data
    df = df[['Use for GPC Dashboard?', 'City', 'Year_calendar', 'BASIC']] #Select relevant columns 
    df = df[df['Use for GPC Dashboard?'] == 'Yes'].reset_index(drop=True) #Filter rows marked for use in GPC Dashboard
    df.rename(columns={'Year_calendar': 'Year', 'BASIC':'Emissions'}, inplace=True) #Rename to align with peaking analysis
    df['Data Source'] = 1 #Add data source column and code all data points as data source 1
    df.dropna(subset=['Emissions'], inplace = True) #Drop rows where emissions is zero 
    df = df[['City','Data Source','Year','Emissions']].sort_values(['City','Year']) #Reorder columns and sort
    return df

In [15]:
def join_data_from_peaking_analysis_and_gpc_tracker(df1, df2):
    """
    Combines data from 2017 Peaking Analysis and GPC Tracker into a single DataFrame and drops duplicate rows
    INPUT: 2 DataFrame
    OUTPUT: 1 DataFrame
    """
    df = df1.append(df2).drop_duplicates().sort_values(['City', 'Data Source', 'Year']).reset_index(drop=True) 
    df = df[df['City']!='Basel'] #Remove Basel from the DataFrame as it is no longer a C40 city
    return df

In [16]:
def calculate_peak_emissions(df, current_year):
    """
    Analyses city GHG emissions to determine if they have peaked
    INPUT: DataFrame containing peaking analysis and GPC Tracker GHG emissions
    OUTPUT: DataFrame with assessment of whether each city has peaked
    """
    
    def reshape_data(df):
        """
        Reshapes DataFrame so that each city and datasource has a unique row with years as a column header
        INPUT: DataFrame
        OUTPUT: Reshaped DataFrame
        """
        df = df.pivot_table(values='Emissions', index=['City','Data Source'], columns='Year', aggfunc='first').reset_index().fillna(0)
        df.columns.name = None #set columns index name to none 
        return df

    def calculate_peaking_parameters(df):
        """
        Caclulatea parameters used to assess whether city has peaked emissions
        INPUT: DataFrame
        OUTPUT: DataFrame with peakign parameters as 4 additional columns
        """
        cols = df.columns.difference(['City','Data Source'])
        df['Num data points'] = df[cols].gt(0).sum(axis=1)
        df['Max emissions'] = df[cols].max(axis =1)
        df['Max emissions year'] = df[cols].idxmax(axis =1)
        df['Recent emissions'] = df[cols].apply(lambda x: x.iloc[x.nonzero()].iloc[-1], axis=1)
        df['Recent emissions year'] = df[cols].apply(lambda x: x.iloc[x.nonzero()].index[-1], axis=1)
        return df
    
    def apply_peaking_criteria(df, current_year):
        """
        Calculates pearking criteria using peaking parameters
        INPUT: DataFrame
        OUTPUT: DataFrame with Boolean assessment for each peaking criteria
        """
        #Peaking Criteria 1: At least 3 years of data available?
        df['PC1'] = (current_year - df['Recent emissions year'] >= 3)
        #Peaking Criteria 2: Max emissions >5 years before recent inventory?
        df['PC2'] = (df['Recent emissions year'] - df['Max emissions year'] >= 5)
        #Peaking Criteria 3: Recent inventory < 5 years old
        df['PC3'] = (current_year - df['Recent emissions year'] <= 5)
        #Peaking Criteria 4: Max emissions >10% higher than recent inventory
        df['PC4'] = ((df['Max emissions']-df['Recent emissions'])/df['Recent emissions']) >= 0.1
        return df
    
    def calculate_peak_emissions_status(df):
        """
        Analyses peaking criteria to assess whether city has peaked
        INPUT: DataFrame
        OUTPUT: DataFrame with peaking assessment as additional columne
        """
        
        def f(x):
            if x['PC1'] & x['PC2'] & x['PC3'] & x['PC4']: #If all peaking criteria are TRUE returns 'PEAKED'
                return 'PEAKED'
            elif x['PC1'] & (not x['PC2']) & x['PC3'] & (not x['PC4']): #If first 3 peaking critiera are TRUE returns 'NOT PEAKED'
                return 'NOT PEAKED'
            else: #Otherwise returns 'CANNOT TELL'
                return 'UNKNOWN'
        
        df['Peak Status'] = df.apply(f, axis = 1) #Calculates columns by applying above function across rows 
        return df
        
    def rename_columns(df):
        """
        Renames columns for ease of understanding
        """
        df.rename(columns ={
            'PC1':'PC1: At least 3 year of years of data available',
            'PC2':'PC2: Max emissions >5 years before recent inventory',
            'PC3':'PC3: Recent inventory <5 years old',
            'PC4':'PC4: Max emissions <10% higher than recent inventory'},
            inplace = True)
        return df 
  
    df = reshape_data(df)
    df = calculate_peaking_parameters(df)
    df = apply_peaking_criteria(df, current_year)
    df = calculate_peak_emissions_status(df)
    df = rename_columns(df)
    return df

In [17]:
def select_duplicate_cities_to_use_in_dashboard(df):
    """
    Sorts input dataframe by city, peak status, number data points and data source and removes duplicates keeping the
    first record. Duplicate records are retained in this order: Status = PEAKED > Status = NOT PEAKED > Highest number
    data points > lowest numerical data source code. 

    INPUT: DataFrame 
    
    OUTPUT: DataFrame with sorted cities conaining NO duplicates for use in peaking analysis dashboard 

    """
    df['Peak Status'] = pd.Categorical(df['Peak Status'], ['PEAKED', 'NOT PEAKED', "UNKNOWN"]) #Make peak status a categorical variable
    df = df.sort_values(['City', 'Peak Status', 'Num data points', 'Data Source'], ascending = [True, True, False, True])

    #Initialise values for loop 
    ref_city = df['City'][0]
    df.loc[0, 'Use for dashboard?'] = 'Y'
    
    for index, row in df.iterrows(): #iterate through rows in DataFrame
        city = row['City']
        if city != ref_city:
            df.loc[index,'Use for dashboard?'] = 'Y'
            ref_city = city
        else:
            df.loc[index, 'Use for dashboard?'] = 'N'
    return df

In [18]:
def reshape_data_for_dashboard(df):
    """
    Reshapes data for use in the Qlik dashboatd
    INPUT: DataFrame (Years as rows)
    OUTPUT: DataFrame (Years as column)
    """
    df = df[df['Use for dashboard?'] == 'Y'] #filter rows to be used in dashboard 
    cols = df.columns.difference(['Num data points', 'Max emissions', 'Recent emissions', 'Recent emissions year', 'PC1: At least 3 year of years of data available', 'PC2: Max emissions >5 years before recent inventory', 'PC3: Recent inventory <5 years old', 'PC4: Max emissions <10% higher than recent inventory', 'Use for dashboard?'])
    df = df[cols] #Select columns for use in dashboard
    df = pd.melt(df, id_vars=["City", "Data Source", "Peak Status", "Max emissions year"], var_name="Year", value_name="Emissions").sort_values(['City', 'Year']).reset_index(drop=True)
    df['Peak year'] = df.apply(lambda x : 1 if x['Max emissions year'] == x['Year'] and x['Peak Status'] == 'PEAKED' else 0, axis=1)
    df.drop('Max emissions year', axis = 1, inplace = True)
    df = df[['City', 'Data Source', 'Year', 'Emissions', 'Peak Status', 'Peak year']]
    
    #def map_cities_to_regions():
        
    
    
    #def create_regional_rankings():
        
    
    return df

In [19]:
def generate_dataframes(path_1, path_2):
    """
    Generates DataFrames used in the programme by calling above functions
    INPUT: File paths to 2017 Peaking Analysis and GPC Tracker
    OUTPUT: Tuple of 6 DataFrames
    """
    df1 = read_in_data_from_2017_peaking_analysis(path_1)
    df2 = read_in_data_from_gpc_tracker(path_2)
    df3 = join_data_from_peaking_analysis_and_gpc_tracker(df1, df2)
    df4 = calculate_peak_emissions(df3,current_year)
    df5 = select_duplicate_cities_to_use_in_dashboard(df4)
    df6 = reshape_data_for_dashboard(df5)
    return (df1, df2, df3, df4, df5, df6)

In [20]:
def write_to_excel(results):
    """
    Writes dataframes to Excel
    INPUT: Tuple of 6 DataFrames
    OUTPUT: None
    """
    #Create a Pandas Excel writer using XlsxWriter as the engine
    writer = pd.ExcelWriter('/Users/oliverwills/desktop/peaking_emissions_dashboard_final.xlsx', engine='xlsxwriter')
    #Write each dataframe to a different worksheet
    results[4].to_excel(writer, sheet_name='MASTER_Peak_Emissions', index = False)
    results[5].to_excel(writer, sheet_name='DASHBOARD_Peak_Emissions', index = False)
    #Close the Pandas Excel writer and output the Excel file
    writer.save()

In [21]:
def run_programme(path_1, path_2):
    results = generate_dataframes(path_1, path_2)
    write_to_excel(results)
    return results

In [22]:
path_1 = '/Users/oliverwills/Box/C40 (internal)/Regions and Cities (internal)/M&P/04_Analytics/03_Other analytics/01_Peaking analysis/00_GCAS/00 Master_Peaking analysis data review_20180905 NR.xlsx'
path_2 = '/Users/oliverwills/Box/C40 (internal)/Regions and Cities (internal)/M&P/04_Analytics/00_Raw data/01_Emissions/Live tracker/01_GPC Inventory Tracker.xlsx'

In [23]:
results = run_programme(path_1, path_2)

In [24]:
results[0]

Unnamed: 0,City,Data Source,Year,Emissions
25,Accra,1,2015,2321904
82,Amman,1,2014,7431422
84,Amman,1,2016,8015906
87,Amsterdam,2,1990,3011000
105,Amsterdam,2,2008,4008518
106,Amsterdam,2,2009,4045686
107,Amsterdam,2,2010,4026047
108,Amsterdam,2,2011,3979708
109,Amsterdam,2,2012,4072819
110,Amsterdam,2,2013,4044060


In [25]:
results[1]

Unnamed: 0,City,Data Source,Year,Emissions
0,Accra,1,2015,2321904
1,Amman,1,2014,7431422
2,Amsterdam,1,2015,4804359
5,Athens,1,2014,4062001
4,Athens,1,2015,4012305
3,Athens,1,2016,3453312
13,Auckland,1,1990,5643291
12,Auckland,1,2009,7551989
11,Auckland,1,2010,7396110
10,Auckland,1,2011,7176723


In [26]:
results[2]

Unnamed: 0,City,Data Source,Year,Emissions
0,Accra,1,2015,2321904
1,Amman,1,2014,7431422
2,Amman,1,2016,8015906
3,Amsterdam,1,2015,4804359
4,Amsterdam,2,1990,3011000
5,Amsterdam,2,2008,4008518
6,Amsterdam,2,2009,4045686
7,Amsterdam,2,2010,4026047
8,Amsterdam,2,2011,3979708
9,Amsterdam,2,2012,4072819


In [27]:
results[3]

Unnamed: 0,City,Data Source,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,Num data points,Max emissions,Max emissions year,Recent emissions,Recent emissions year,PC1: At least 3 year of years of data available,PC2: Max emissions >5 years before recent inventory,PC3: Recent inventory <5 years old,PC4: Max emissions <10% higher than recent inventory,Peak Status
0,Accra,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2321904,0,0,1,2321904,2015,2321904,2015,True,False,True,False,UNKNOWN
1,Amman,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7431422,0,8015906,0,2,8015906,2016,8015906,2016,True,False,True,False,UNKNOWN
2,Amsterdam,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4804359,0,0,1,4804359,2015,4804359,2015,True,False,True,False,UNKNOWN
3,Amsterdam,2,3011000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4008518,4045686,4026047,3979708,4072819,4044060,4015035,4136792,3948153,3932562,11,4136792,2015,3932562,2017,False,False,True,False,UNKNOWN
4,Athens,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5066334,5054790,4391579,0,3,5066334,2014,4391579,2016,True,False,True,True,UNKNOWN
5,Auckland,1,5643291,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7551989,7396110,7176723,7739018,7703362,7581652,7750281,0,0,8,7750281,2015,7750281,2015,True,False,True,False,UNKNOWN
6,Austin,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13001831,0,0,12527277,0,2,13001831,2013,12527277,2016,True,False,True,False,UNKNOWN
7,Austin,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13037962,0,0,14050168,0,0,13739417,0,0,0,0,3,14050168,2010,13739417,2013,True,False,False,False,UNKNOWN
8,Bangkok,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50360919,0,0,0,0,1,50360919,2013,50360919,2013,True,False,False,False,UNKNOWN
9,Bangkok,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42650000,0,0,0,0,0,43870000,0,0,0,0,2,43870000,2013,43870000,2013,True,False,False,False,UNKNOWN


In [28]:
results[4]

Unnamed: 0,City,Data Source,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,Num data points,Max emissions,Max emissions year,Recent emissions,Recent emissions year,PC1: At least 3 year of years of data available,PC2: Max emissions >5 years before recent inventory,PC3: Recent inventory <5 years old,PC4: Max emissions <10% higher than recent inventory,Peak Status,Use for dashboard?
0,Accra,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2321904,0,0,1,2321904,2015,2321904,2015,True,False,True,False,UNKNOWN,N
1,Amman,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7431422,0,8015906,0,2,8015906,2016,8015906,2016,True,False,True,False,UNKNOWN,Y
3,Amsterdam,2,3011000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4008518,4045686,4026047,3979708,4072819,4044060,4015035,4136792,3948153,3932562,11,4136792,2015,3932562,2017,False,False,True,False,UNKNOWN,Y
2,Amsterdam,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4804359,0,0,1,4804359,2015,4804359,2015,True,False,True,False,UNKNOWN,N
4,Athens,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5066334,5054790,4391579,0,3,5066334,2014,4391579,2016,True,False,True,True,UNKNOWN,Y
5,Auckland,1,5643291,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7551989,7396110,7176723,7739018,7703362,7581652,7750281,0,0,8,7750281,2015,7750281,2015,True,False,True,False,UNKNOWN,Y
7,Austin,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13037962,0,0,14050168,0,0,13739417,0,0,0,0,3,14050168,2010,13739417,2013,True,False,False,False,UNKNOWN,Y
6,Austin,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13001831,0,0,12527277,0,2,13001831,2013,12527277,2016,True,False,True,False,UNKNOWN,N
9,Bangkok,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42650000,0,0,0,0,0,43870000,0,0,0,0,2,43870000,2013,43870000,2013,True,False,False,False,UNKNOWN,Y
8,Bangkok,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50360919,0,0,0,0,1,50360919,2013,50360919,2013,True,False,False,False,UNKNOWN,N


In [29]:
results[5]

Unnamed: 0,City,Data Source,Year,Emissions,Peak Status,Peak year
0,Amman,1,1990,0,UNKNOWN,0
1,Amman,1,1991,0,UNKNOWN,0
2,Amman,1,1992,0,UNKNOWN,0
3,Amman,1,1993,0,UNKNOWN,0
4,Amman,1,1994,0,UNKNOWN,0
5,Amman,1,1995,0,UNKNOWN,0
6,Amman,1,1996,0,UNKNOWN,0
7,Amman,1,1997,0,UNKNOWN,0
8,Amman,1,1998,0,UNKNOWN,0
9,Amman,1,1999,0,UNKNOWN,0


In [30]:
# def record_cities_that_have_peaked(df):
#     df = df[df['Use for dashboard?'] == 'Y']
#     df = df[df['Peak Status'] == 'PEAKED']
#     cities = list(df['City'].drop_duplicates())
    
#     f = open('/Users/oliverwills/desktop/output.txt', 'r+')
#     previous_cities = simplejson.load(f)
#     f.truncate(0)
#     simplejson.dump(cities, f)
#     f.close()
    
#     cities_to_recently_peak = []

#     for city in cities:
#         if city not in previous_cities:
#             cities_to_recently_peak.append(city)
    
#     if len(cities_to_recently_peak) == 0:
        
#         def send_email(cities_to_recently_peak):
#             msg = MIMEMultipart()
#             msg['Subject'] = 'Our family reunion'
#             me = 'owills@c40.org'
#             family = 'owills@c40.org'
#             msg['From'] = me
#             msg['To'] = family
#             msg.preamble = 'Our family reunion'
            
#         send_email(cities_to_recently_peak)
    
#     return cities_to_recently_peak
        
# cities = record_cities_that_have_peaked(results[4])
# cities