In [10]:
import pandas as pd
import os
import simpledbf
from simpledbf import Dbf5

In [127]:
%run variables_functions.ipynb

In [12]:
month_to_number= {'JAN':1,
		'FEB':2,
		'MAR':3,
		'APR':4,
		'MAY':5,
		'JUN':6,
		'JUL':7,
		'AUG':8,
		'SEP':9,
		'OCT':10,
		'NOV':11,
		'DEC':12}

In [None]:
def read_unknown_header(path):
    header = 0
    df_year = pd.read_excel(path, header = header)
    while df_year.columns[0][:6] != 'Entity':
        header +=1
        df_year = pd.read_excel(path, header = header)
    return(df_year)


df_capacity = pd.DataFrame()
for folder in os.listdir('Data/Capacity_Complete/'):
    path = 'Data/Capacity_Complete/' + folder
    df_year = read_unknown_header(path)
    month = month_to_number[folder.split('_')[0][:3].upper()]
    year = float(folder.split('.')[0][-4:])
    print(year, month)
    df_year['Year'] = year
    df_year['Month'] = month
    df_year.columns = df_year.columns.str.replace('\n', '')
    df_year.columns = df_year.columns.str.replace(' ', '')
    df_tool = df_year.rename(columns={'PlantID':'Plant_Id','PlantState':'State','PrimeMoverCode':'Mover_Code',
                   'EnergySourceCode':'Source_Code','NetSummerCapacity(MW)':'Capacity_MW'})
    df_interest = df_tool[['Year','Month','Plant_Id','State','Mover_Code','Source_Code','Capacity_MW']]
    df_capacity = df_capacity.append(df_interest,ignore_index=True,sort=False)

In [56]:
def to_float(x):
    try:
        return(float(x))
    except:
        return(0)
    

df_capacity.Capacity_MW = df_capacity.Capacity_MW.apply(lambda x: to_float(x))

df_capacity.to_csv('treated_data/Capacity/capacity_complete.csv',sep=';')

# Capacity Coal Interpolation

In [147]:
df_capacity['Technology'] = df_capacity.Source_Code.apply(lambda x: convert_to_technology(x))
df_tool = df_capacity.where(df_capacity.Technology=='Coal').dropna()[['Year','Month','Plant_Id','Capacity_MW']]
coal_capacity = df_tool.groupby(['Year','Plant_Id','Month']).sum()

In [212]:
time_series = pd.date_range(start='01/2015', end='12/2021', freq = 'M')
df_tool_ = pd.Series(range(len(time_series)), index = time_series)
df_tool = df_tool_.reset_index()

df_time = pd.DataFrame()
df_time['Year'] = df_tool['index'].apply(lambda x: x.year)
df_time['Month'] = df_tool['index'].apply(lambda x: x.month)

df_time_capacity = df_time.where(df_time.Year>2014).dropna()

In [215]:
df_plants_alltimes = pd.DataFrame()

for plant_id in coal_capacity.reset_index().Plant_Id.unique():
    df_plant = pd.DataFrame()
    df_plant[['Year','Month']] = df_time_capacity[['Year','Month']]
    df_plant['Plant_Id'] = plant_id
    df_plants_alltimes = df_plants_alltimes.append(df_plant)

In [217]:
df_final = df_plants_alltimes.join(coal_capacity, on = ['Year','Plant_Id','Month'], how='left')

In [218]:
print(f'df_coal: {len(coal_capacity)} alltimes: {len(df_plants_alltimes)}, result:{len(df_final)} ')

df_coal: 24267 alltimes: 39342, result:39342 


In [220]:
df_interpolated = df_final.interpolate(mehod='linear')

In [222]:
print(f' antes: {len(df_interpolated)} depois: {len(df_interpolated.dropna())}')

 antes: 39342 depois: 39336


In [223]:
df_interpolated.dropna().to_csv('treated_data/Capacity/capacity_coal_interpolated.csv',sep=';')

# Old Capacity Data

In [225]:
old_capacity = pd.DataFrame()

In [268]:
energies = ['Onshore Wind Turbine','Natural Gas Steam Turbine', 'Conventional Steam Coal', 'Solar Photovoltaic']

df_capacity = pd.DataFrame()

for year in year_infos:
    path, energy_source, capacity = year_infos[year][0],year_infos[year][1], year_infos[year][2]
    state, plant_id, month = year_infos[year][3],year_infos[year][4], year_infos[year][5]
    print(year)

    if year <2008:
        continue

    elif year<=2010:
        df_tool = pd.read_excel(path,header = 0)
    
    else: 
        df_tool = pd.read_excel(path,header = 1)
    
    df_tool = df_tool.rename(columns = {energy_source:'Source_code',capacity:'Capacity_MW',
                                       state:'State',plant_id:'Plant_Id',month: 'Month'})
    df_tool['Year']=year
    df_year = df_tool[['Year','Month','Plant_Id','State','Source_code','Capacity_MW']]
    df_capacity = df_capacity.append(df_year, ignore_index=True)

2010
2017
2019
2018
2020
2016
2011
2002
2005
2004
2003
2014
2013
2012
2015
2008
2006
2001
2007
2009


In [271]:
df_capacity['Technology'] = df_capacity.Source_code.apply(lambda x: convert_to_technology(x))
df_tool_coal = df_capacity.where(df_capacity.Technology=='Coal').dropna()[['Year','Month','Plant_Id','Capacity_MW']]
coal_capacity = df_tool.groupby(['Year','Plant_Id','Month']).sum()

In [274]:
df_capacity.to_csv('treated_data/Capacity/capacity_general.csv')

In [299]:
time_series = pd.date_range(start='12/2008', end='12/2021', freq = 'M')
df_tool_ = pd.Series(range(len(time_series)), index = time_series)
df_tool = df_tool_.reset_index()

df_time = pd.DataFrame()
df_time['Year'] = df_tool['index'].apply(lambda x: x.year)
df_time['Month'] = df_tool['index'].apply(lambda x: x.month)

In [301]:
df_plants_alltimes = pd.DataFrame()

for plant_id in coal_capacity.reset_index().Plant_Id.unique():
    df_plant = pd.DataFrame()
    df_plant[['Year','Month']] = df_time[['Year','Month']]
    df_plant['Plant_Id'] = plant_id
    df_plants_alltimes = df_plants_alltimes.append(df_plant)

In [303]:
df_final = df_plants_alltimes.join(coal_capacity, on = ['Year','Plant_Id','Month'], how='left')

In [304]:
print(f'df_coal: {len(coal_capacity)} alltimes: {len(df_plants_alltimes)}, result:{len(df_final)} ')

df_coal: 10939 alltimes: 108108, result:108108 


In [305]:
df_interpolated = df_final.interpolate(mehod='linear')

In [306]:
df_interpolated.to_csv('treated_data/Capacity/capacity_coal_interpolated_2008_2021.csv',sep = ';')

In [308]:
df_capacity.Technology.unique()

array(['Hidroeletric', 'Coal', 'Natural Gas', 'Petroleum Products',
       'Nuclear', 'Solar', 'Geothermical', 'Wind',
       'Wood and Wood-Delivered Fuels', 'Other', 'Other Biomass',
       'Not in the Appendix', 'Other Gases'], dtype=object)

In [310]:
for technology in ['Coal','Natural Gas','Solar','Wind']:
    df_tool = df_capacity.where(df_capacity.Technology==technology).dropna()[['Year','Month','Plant_Id','Capacity_MW']]
    energy_capacity = df_tool.groupby(['Year','Plant_Id','Month']).sum()
    
    df_plants_alltimes = pd.DataFrame()

    for plant_id in energy_capacity.reset_index().Plant_Id.unique():
        df_plant = pd.DataFrame()
        df_plant[['Year','Month']] = df_time[['Year','Month']]
        df_plant['Plant_Id'] = plant_id
        df_plants_alltimes = df_plants_alltimes.append(df_plant)
        
    df_tool2 = df_plants_alltimes.join(coal_capacity, on = ['Year','Plant_Id','Month'], how='left')
    df_interpolated = df_tool2.interpolate(mehod='linear')
    df_interpolated.to_csv('treated_data/Capacity/capacity_'+technology.lower()+'_interpolated_2008_2021.csv',sep = ';')