In [36]:
import pandas as pd
import numpy as np
import json
import os

In [37]:
# Relative paths to the two pre-edited input CSVs (energy data, world population data).
file1, file2 = (
    'edited-data/Energy-Data-Edited.csv',
    'edited-data/worldPopulationData-Edited.csv',
)

In [38]:
# Load both CSVs with default parsing options, in the same order as before (file1, then file2).
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

In [39]:
# Preview the first five rows of the energy dataset.
df1.head(n=5)

Unnamed: 0,Country,Year,pop,Region,SubRegion,biodiesel_cons_kboed,biodiesel_cons_pj,biodiesel_prod_kboed,biodiesel_prod_pj,biofuels_cons_ej,...,ren_power_ej,ren_power_twh,ren_power_twh_net,renewables_ej,solar_ej,solar_twh,solar_twh_net,wind_ej,wind_twh,wind_twh_net
0,Algeria,1980,18.739378,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Algeria,1981,19.351357,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Algeria,1982,20.000096,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Algeria,1983,20.682111,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Algeria,1984,21.39353,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Preview the first five rows of the population dataset.
df2.head(n=5)

Unnamed: 0,Country,Year,Population
0,Sri Lanka,1980,14943645.0
1,Sri Lanka,1981,15198918.0
2,Sri Lanka,1982,15438753.0
3,Sri Lanka,1983,15658442.0
4,Sri Lanka,1984,15872577.0


In [41]:
# Split df1's columns by dtype: float64/int64 columns vs. object-dtype columns.
numerical_cols = [name for name, dtype in df1.dtypes.items() if dtype in ('float64', 'int64')]
categorical_cols = [name for name, dtype in df1.dtypes.items() if dtype == 'object']

# Number of null entries in each numeric column.
df1[numerical_cols].isna().sum()

Year                    0
pop                     0
biodiesel_cons_kboed    0
biodiesel_cons_pj       0
biodiesel_prod_kboed    0
                       ..
solar_twh               0
solar_twh_net           0
wind_ej                 0
wind_twh                0
wind_twh_net            0
Length: 92, dtype: int64

In [42]:
# Print, for every column of df1, the share of null entries as a whole-number percentage.
# The null mask is averaged column-wise once, then each (column, fraction) pair is reported.
for col, pct_missing in df1.isnull().mean().items():
    print('{} - {}%'.format(col, round(pct_missing*100)))


Country - 0%
Year - 0%
pop - 0%
Region - 0%
SubRegion - 0%
biodiesel_cons_kboed - 0%
biodiesel_cons_pj - 0%
biodiesel_prod_kboed - 0%
biodiesel_prod_pj - 0%
biofuels_cons_ej - 0%
biofuels_cons_kbd - 0%
biofuels_cons_kboed - 0%
biofuels_cons_pj - 0%
biofuels_prod_kbd - 0%
biofuels_prod_kboed - 0%
biofuels_prod_pj - 0%
biogeo_ej - 0%
biogeo_twh - 0%
biogeo_twh_net - 0%
co2_combust_mtco2 - 0%
co2_combust_pc - 0%
co2_combust_per_ej - 0%
co2_mtco2 - 0%
coalcons_ej - 0%
coalprod_ej - 0%
coalprod_mt - 0%
cobalt_kt - 0%
cobaltres_kt - 0%
diesel_gasoil_cons_kbd - 0%
elect_twh - 0%
electbyfuel_coal - 0%
electbyfuel_gas - 0%
electbyfuel_hydro - 0%
electbyfuel_nuclear - 0%
electbyfuel_oil - 0%
electbyfuel_other - 0%
electbyfuel_ren_power - 0%
electbyfuel_total - 0%
ethanol_cons_kboed - 0%
ethanol_cons_pj - 0%
ethanol_prod_kboed - 0%
ethanol_prod_pj - 0%
fuel_oil_cons_kbd - 0%
gascons_bcfd - 0%
gascons_bcm - 0%
gascons_ej - 0%
gasflared_bcm - 0%
gasflared_mtco2 - 0%
gasoline_cons_kbd - 0%
gasprod_b

In [43]:
# Replace nulls in the numeric columns with 0 (the original note reasons that
# Year is never 0 in this data), then re-check the per-column null counts.
df1[numerical_cols] = df1[numerical_cols].fillna(value=0)
df1[numerical_cols].isna().sum()

Year                    0
pop                     0
biodiesel_cons_kboed    0
biodiesel_cons_pj       0
biodiesel_prod_kboed    0
                       ..
solar_twh               0
solar_twh_net           0
wind_ej                 0
wind_twh                0
wind_twh_net            0
Length: 92, dtype: int64

### Operations

In [47]:
def addNewCol(df1, df2, col1, col2, new_col, operation):
  """Return a copy of ``df1`` with ``new_col`` set to ``df1[col1] <operation> df2[col2]``.

  Neither input frame is modified. When ``df1`` and ``df2`` are different
  frames, pandas aligns the two columns by index label before combining them.

  Parameters
  ----------
  df1 : pandas.DataFrame
      Frame that supplies the left operand and receives the new column (on a copy).
  df2 : pandas.DataFrame
      Frame that supplies the right operand.
  col1 : str
      Column of ``df1`` used as the left operand.
  col2 : str
      Column of ``df2`` used as the right operand.
  new_col : str
      Name of the column to create (or overwrite) in the returned copy.
  operation : str
      One of "+", "-", "*", "/".

  Returns
  -------
  pandas.DataFrame
      Copy of ``df1`` including ``new_col``.

  Raises
  ------
  ValueError
      If ``operation`` is not one of the supported symbols. Previously such a
      call silently returned a copy without ``new_col``.
  """
  # Division adds a small epsilon to the divisor so a divisor of exactly 0 does
  # not produce inf/NaN; note that this slightly perturbs every quotient.
  eps = 1e-7
  operations = {
    "+": lambda left, right: left + right,
    "-": lambda left, right: left - right,
    "*": lambda left, right: left * right,
    "/": lambda left, right: left / (right + eps),
  }

  if operation not in operations:
    raise ValueError(
      "Unsupported operation {!r}; expected one of {}".format(operation, sorted(operations))
    )

  df3 = df1.copy()
  df3[new_col] = operations[operation](df1[col1], df2[col2])
  return df3
  


In [52]:
# Example: add df1's "wind_twh_net" and "wind_twh" columns into a new column on a copy of df1.
tmp_df = addNewCol(
    df1=df1,
    df2=df1,
    col1="wind_twh_net",
    col2="wind_twh",
    new_col="new_col",
    operation="+",
)


### Augmentation of two datasets
Assumption: both datasets have the same number of rows, and the join keys are 'Country' and 'Year'.

In [53]:
def augmentData(df1, df2):
  """Inner-join ``df1`` and ``df2`` on the 'Country' and 'Year' columns.

  Only (Country, Year) pairs present in both frames appear in the result;
  the merged frame is returned and neither input is modified.
  """
  join_keys = ['Country', 'Year']
  return pd.merge(left=df1, right=df2, how='inner', on=join_keys)


In [54]:
# Merge the energy data with the population data on (Country, Year).
df3 = augmentData(df1=df1, df2=df2)

In [56]:
# Preview the first five rows of the merged dataset.
df3.head(n=5)

Unnamed: 0,Country,Year,pop,Region,SubRegion,biodiesel_cons_kboed,biodiesel_cons_pj,biodiesel_prod_kboed,biodiesel_prod_pj,biofuels_cons_ej,...,ren_power_twh,ren_power_twh_net,renewables_ej,solar_ej,solar_twh,solar_twh_net,wind_ej,wind_twh,wind_twh_net,Population
0,Algeria,1980,18.739378,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18739378.0
1,Algeria,1981,19.351357,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19351357.0
2,Algeria,1982,20.000096,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000096.0
3,Algeria,1983,20.682111,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20682111.0
4,Algeria,1984,21.39353,Africa,Northern Africa,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21393530.0
