In [148]:
import pandas as pd
import numpy as  np


In [149]:
#Load the energy data from the file
energy = pd.read_excel("Energy Indicators.xls")


In [150]:
#exluding header and footer
energy = energy[17:244]


In [151]:
#drop first two unneccessary columns
energy.drop(columns=['Unnamed: 0', 'Unnamed: 1'], inplace=True)

In [152]:
#rename columns
energy = energy.rename(columns={'Unnamed: 2':'Country', 
                        'Unnamed: 3':'Energy Supply', 
                        'Unnamed: 4':'Energy Supply per Capita', 
                        'Unnamed: 5':'% Renewable'})

In [153]:
#Convert Energy Supply to gigajoules 
energy['Energy Supply'] = energy['Energy Supply'] * 1000000


In [154]:
#mark missing data as NaN
energy['Energy Supply'] = energy["Energy Supply"].apply(lambda val: val if type(val) == int else np.nan)


In [155]:
#Rename the following list of countries

energy["Country"].replace({'Republic of Korea': 'South Korea',
'United States of America': 'United States',
'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
'China, Hong Kong Special Administrative Region': 'Hong Kong'}, inplace=True)

In [156]:
import re

In [157]:

energy_list_countries = energy['Country']

# function remove numbers and/or parenthesis in their name
def remove_num_and_paran(country):
    try:
        return re.findall(
            '^[^(]*[^\0-9]',
            (energy_list_countries[country] if country in energy_list_countries.keys() else country)
        )[0]
    except:
        return np.nan

In [158]:
energy['Country'] = energy['Country'].apply(remove_num_and_paran)

In [159]:
#GDP from 1960 to 2015 from World Bank.

GDP = pd.read_csv("world_bank_GDP.csv", encoding='utf-8', engine='python',  skiprows=3, sep=',')
GDP.rename(columns={'Country Name': 'Country'}, inplace=True)

In [160]:
#GDP from 1960 to 2015 
GDP.drop(columns= ['2016', '2017', '2018', '2019', '2020', 'Unnamed: 65'], inplace=True)

In [161]:
#ename the list of countries
GDP["Country"].replace({'Korea, Rep.': 'South Korea',
                            'Iran, Islamic Rep.': 'Iran',
                            'Hong Kong SAR, China' : 'Hong Kong'
                            }, inplace=True)

In [162]:
#Load DataFrame "ScimEn" from 'Sciamgo Journal and Country Rank data for Energy Engineering and Power Technology'
ScimEn = pd.read_excel('scimagojr.xlsx')

In [163]:
# merge Energy, GDP, and ScimEn dataframes
joined_e_to_G = energy.merge(GDP, on="Country", how="inner")
joined_dataframes = joined_e_to_G.merge(ScimEn, on="Country", how="inner").set_index('Country')

In [164]:
joined_dataframes.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code', '1960', '1961',
                                '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
                                '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', 
                                '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
                                '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
                                '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', 
                                '2002', '2003', '2004', '2005', 'Region'], inplace=True)

# Q1


In [165]:
joined_dataframes = joined_dataframes[['Rank', 'Documents', 'Citable documents','Citations', 'Self-citations', 
                                  'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita',
                                  '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
                                  '2014', '2015']]

In [176]:
def answer_one():
    '''-return the resulted DataFrame (20 columns and 15 entries)'''
    return joined_dataframes.sort_values(by='Rank').iloc[:15]

In [None]:
answer_one()

# Q2

In [174]:
def answer_two():
    '''Show average GDP over the last 10 years for list of countries
    -->return a Series named avgGDP with 15 countries and their average GDP'''
    Top15 = answer_one()
    avgGDP = (
        Top15[['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']]
            .T
            .describe()
            .mean()
            .sort_values(ascending=False)
    )
    return avgGDP

In [None]:
answer_two()

# Q3

In [173]:
def answer_three():
    '''Show how much had the GDP changed over the 10 year span for the country with the 6th largest average GDP
    -->return a single number'''
    Top15 = answer_one() 
    country = answer_two().index[5]
    ratio = Top15.loc[country]['2006'] / Top15.loc[country]['2015']
    return ratio

In [None]:
answer_three()

# Q4

In [171]:
def answer_four():
    '''Function show the maximum value of ratio of Self-Citations to Total Citations
    -->return a tuple with the name of the country and the ratio'''
    Top15 = answer_one()
    ratio_citations = Top15['Self-citations'] / Top15['Citations']
    ratio_citations = ratio_citations.sort_values(ascending=False)
    return (ratio_citations.index[0], ratio_citations[0])

In [172]:
answer_four()

('China', 0.6912289816173135)

# Q5

In [141]:
def answer_five():
    '''Show the third most populous country by specified list
    -->return a single string value'''
    Top15 = answer_one()
    country = (Top15['Energy Supply'] / Top15['Energy Supply per Capita']).sort_values(ascending=False).index[2]
    return country

In [142]:
answer_five()

'Brazil'

# Q6

In [143]:
def answer_six():
    '''Show correlation between the number of citable documents per capita and the energy supply per capita
    -->return a single number'''
    Top15 = answer_one()
    Top15['Population'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['Documents per Capita'] = Top15['Citable documents'] / Top15['Population']
    correlation = Top15[['Documents per Capita', 'Energy Supply per Capita']].astype(float).corr().iloc[0][1]
    return correlation

In [144]:
answer_six()

0.7392980085965914


# Q7

In [145]:
# dictionary of Countries by Continent
ContinentDict  = {
    'China':'Asia', 
    'United States':'North America', 
    'Japan':'Asia', 
    'United Kingdom':'Europe', 
    'Russian Federation':'Europe', 
    'Canada':'North America', 
    'Germany':'Europe', 
    'India':'Asia',
    'France':'Europe', 
    'South Korea':'Asia', 
    'Italy':'Europe', 
    'Spain':'Europe', 
    'Iran':'Asia',
    'Australia':'Australia', 
    'Brazil':'South America',
    'Turkey': 'Asia',
    'Norway': 'Europe'
}

def answer_seven():
    '''Group the Countries by Continent show size (the number of countries in each continent), sum, mean,
    and std deviation for the estimated population of each continent
    -->return a DataFrame with index named Continent and columns ['size', 'sum', 'mean', 'std']'''
    Top15 = answer_one().reset_index()
    
    Top15['Continent'] = Top15['Country'].apply(lambda val: ContinentDict[val])
    Top15['Population'] = (Top15['Energy Supply'] / Top15['Energy Supply per Capita']).astype(float)    
    grouped_df = Top15.groupby(['Continent'])['Population'].agg(['size', 'sum', 'mean', 'std'])
    return grouped_df

In [146]:
answer_seven()

Unnamed: 0_level_0,size,sum,mean,std
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,6,2975182000.0,495863700.0,641205000.0
Australia,1,23316020.0,23316020.0,
Europe,6,399142900.0,66523820.0,45488610.0
North America,1,35239860.0,35239860.0,
South America,1,205915300.0,205915300.0,
