In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# Optional: uncomment the next line to hide code cells by default in the exported HTML
# (the script only toggles when the page body is not the live notebook app).
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# Emit a "Toggle code" button that shows/hides the input areas and prompts in the HTML export.
di.display_html(
    '''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''',
    raw=True,
)

### Clean Development Mechanism (CDM) data cleaning project part 1
Data source: http://www.cdmpipeline.org/cdm-projects-region.htm

Objective: clean data and reorganize so that team can more easily find relevant data and information.

The team was interested in types of CDM projects, number of CERs (certified emission reduction units) per project, trends by region and country, approving organizations, and failed projects. 

In [2]:
import pandas as pd
import numpy as np
import openpyxl
import xlrd
import xlwt
import xlsxwriter
import math

In [3]:
# Raw CDM workbook; the path is relative to the notebook's working directory.
tallies_file = "CDMStatesAndProvinces.xlsx"
# Open the workbook once so that individual sheets can be parsed by name later on.
tallies = pd.ExcelFile(tallies_file)
# Sheet count minus one. The consolidation loop skips sheet_names[0] in the same way,
# so this is presumably the number of country sheets.
# NOTE(review): n is not referenced by any other cell shown in this notebook.
n = len(tallies.sheet_names) - 1

#### Example of one sheet inside raw Excel file
The raw data was organized such that each country had its own sheet. Each sheet held counts of the number of CDM projects of each type by county within that country. The team was not interested in counties, so I extracted the totals for each country and made a new Excel file with one sheet. Each country got a row listing the total number of CDM projects of each type.

In [6]:
# Preview one raw country sheet to show its layout before any cleaning.
tallies.parse('Albania')

Unnamed: 0,"This workbook was produced by Jørgen Fenhann, UNEP DTU Partnership from the CDMPipeline of 1st October 2018, jqfe@dtu.dk, Phone (+45)40202789",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29
,,,,,,,,,,,...,,,,,,,,,,
Albania Counties,Afforestation,Agriculture,Biomass energy,Cement,CO2 capture,Coal bed/mine methane,Energy distribution,EE households,EE industry,EE own generation,...,N2O,PFCs and SF6,Reforestation,Solar,Tidal,Transport,Wind,Total,GDP/cap,Population (millions)
Berat,,,,,,,,,,,...,,,,,,,,0,,
Diber,,,,,,,,,,,...,,,1,,,,,1,,
Durres,,,,,,,,,,,...,,,,,,,,0,,
Elbasan,,,,,,,,,,,...,,,1,,,,,2,,
Fier,,,,,,,,,,,...,,,,,,,,0,,
Gjirokaster,,,,,,,,,,,...,,,,,,,,0,,
Korce,,,,,,,,,,,...,,,1,,,,,2,,
Kukes,,,,,,,,,,,...,,,1,,,,,1,,


In [7]:
# Prototype of the per-country extraction, run on the Albania sheet only.
# NOTE(review): the 'Total' lookup below relies on the row labels (county names, 'Total')
# being the frame's index after parse() — confirm this holds on your pandas version.
albania_sheet = tallies.parse('Albania')

# drop() removes rows by label, here the label of the first (blank) row.
albania_rows = albania_sheet.drop(albania_sheet.index[0])

# The first remaining row carries the real column names (project types).
header = albania_rows.iloc[0]

test = (
    albania_rows[1:]                  # everything below the header row
    .rename(columns=header)           # map placeholder column names to project types
    .loc['Total', :]                  # keep only the country-level totals row
    .to_frame()
    .transpose()                      # back to a one-row DataFrame
    .rename({'Total': 'Albania'})     # label the row with the country name
    .iloc[:, 0:27]                    # first 27 columns; the raw sheet ends with Total, GDP/cap, Population
)

In [8]:
# Column names taken from the Albania prototype; every country row is relabelled with them.
columns = test.columns.tolist()

# One single-row DataFrame per country sheet, in workbook order.
tallies_sheets = []

# The first sheet of the workbook is skipped; every other sheet is treated as a country.
for sheet in tallies.sheet_names[1:]:
    sheet_frame = tallies.parse(sheet)

    # drop() removes rows by label, here the label of the first (blank) row.
    sheet_rows = sheet_frame.drop(sheet_frame.index[0])

    # The first remaining row carries the real column names (project types).
    header = sheet_rows.iloc[0]

    df = (
        sheet_rows[1:]
        .rename(columns=header)
        .loc['Total', :]              # country-level totals row
        .to_frame()
        .transpose()                  # back to a one-row DataFrame
        .rename({'Total': sheet})     # label the row with the sheet (country) name
        .iloc[:, 0:27]                # same 27-column cut as the Albania prototype
    )

    # Force identical column labels so the rows line up when concatenated.
    df.columns = columns
    tallies_sheets.append(df)

In [9]:
# Stack the one-row-per-sheet frames into a single table indexed by sheet (country) name.
cdm_df = pd.concat(tallies_sheets)

In [10]:
# Row-wise sum of the per-type project counts for each country.
# Any existing 'Total' column is excluded from the sum so that re-running this cell
# does not add the previous total to itself (errors='ignore' covers the first run,
# when the column does not exist yet).
cdm_df['Total'] = cdm_df.drop(columns='Total', errors='ignore').sum(axis=1)

In [11]:
# Move the 'Total' column to the front, keeping the other columns in their current order.
# The column is selected by name rather than by position so that re-running this cell
# is a no-op instead of rotating whichever column happens to be last to the front.
cols = ['Total'] + [col for col in cdm_df.columns if col != 'Total']
cdm_df = cdm_df[cols]

#### Final spreadsheet
Below are the first 5 rows (countries) of the new spreadsheet. Notice that I also moved the total column to the front, so it would be easier to get that information if that is all the team was looking for. 

In [12]:
# Display the first five rows (countries) of the consolidated table.
cdm_df.head()

Unnamed: 0,Total,Afforestation,Agriculture,Biomass energy,Cement,CO2 capture,Coal bed/mine methane,Energy distribution,EE households,EE industry,...,Landfill gas,Methane avoidance,Mixed renewables,N2O,PFCs and SF6,Reforestation,Solar,Tidal,Transport,Wind
Albania,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,0,0,0,0
Algeria,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Angola,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Argentina,53.0,1,0,6,0,1,0,0,0,2,...,12,8,0,0,1,1,2,0,0,11
Armenia,6.0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [18]:
# Write the consolidated table to a new single-sheet workbook using the XlsxWriter engine.
# ExcelWriter is used as a context manager: the file is saved and closed when the block
# exits, even if to_excel raises. This replaces the explicit writer.save() call, which is
# deprecated in pandas 1.5 and no longer available in pandas 2.x.
with pd.ExcelWriter('cdm_consolidated.xlsx', engine='xlsxwriter') as writer:
    # The index (country names) is written as the first column of the sheet.
    cdm_df.to_excel(writer, sheet_name='Sheet1')