This notebook performs basic cleaning of the "commodity_transaction" column: it splits the column into separate, tidy columns and corrects some spelling errors. The cleaned dataframe should be easier to use for summary statistics!

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the raw energy statistics CSV into a dataframe.
raw_csv_path = "../input/international-energy-statistics/all_energy_statistics.csv"
df = pd.read_csv(raw_csv_path)

In [2]:
# Dimensions of the loaded dataframe as (rows, columns).
df.shape

(1189482, 7)

In [3]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,Austria,Additives and Oxygenates - Exports,1996,"Metric tons, thousand",5.0,,additives_and_oxygenates
1,Austria,Additives and Oxygenates - Exports,1995,"Metric tons, thousand",17.0,,additives_and_oxygenates
2,Belgium,Additives and Oxygenates - Exports,2014,"Metric tons, thousand",0.0,,additives_and_oxygenates
3,Belgium,Additives and Oxygenates - Exports,2013,"Metric tons, thousand",0.0,,additives_and_oxygenates
4,Belgium,Additives and Oxygenates - Exports,2012,"Metric tons, thousand",35.0,,additives_and_oxygenates


In [4]:
# The category column does not appear to need any cleaning.
# Raise the display limit so the full frequency table can be shown.
pd.set_option("display.max_rows", 100)
df["category"].value_counts()

total_electricity                                              133916
gas_oil_diesel_oil                                              97645
fuel_oil                                                        75132
natural_gas_including_lng                                       64161
liquified_petroleum_gas                                         62156
motor_gasoline                                                  53198
fuelwood                                                        52032
electricity_net_installed_capacity_of_electric_power_plants     50229
other_kerosene                                                  43466
hard_coal                                                       42307
kerosene_type_jet_fuel                                          34558
heat                                                            29656
charcoal                                                        28000
conventional_crude_oil                                          27352
other_bituminous_coa

In [5]:
# commodity_transaction appears to pack multiple pieces of information into
# each value; look at the 50 least frequent values to see the variety.
df["commodity_transaction"].value_counts().tail(50)

Bitumen - Consumption by mining and quarrying                                                         1
Coking coal - consumption not elsewhere specified (transport)                                         1
Coking coal - consumption by wood and wood products industry                                          1
Charcoal - Consumption by machinery                                                                   1
Lubricants - Consumption by food and tobacco                                                          1
Other liquid biofuels - Transformation in heat plants - autoproducers                                 1
Naphtha - Transformation in heat plants - main activity producers                                     1
Charcoal - Transformation in electricity plants - main activity producers                             1
Lubricants - Consumption by machinery                                                                 1
Paraffin waxes - Consumption by wood and wood products          

In [6]:
# First few raw commodity_transaction values.
df["commodity_transaction"].head()

0    Additives and Oxygenates - Exports
1    Additives and Oxygenates - Exports
2    Additives and Oxygenates - Exports
3    Additives and Oxygenates - Exports
4    Additives and Oxygenates - Exports
Name: commodity_transaction, dtype: object

In [7]:
# How many " - " (hyphen) separators does each row contain?
# NOTE(review): only the ASCII hyphen separator is counted here, while the
# split further down also splits on " – " (en dash) — confirm whether the
# en-dash rows should be included in this count.
hyphen_counts = df["commodity_transaction"].str.count(" - ")
hyphen_counts.value_counts()

# Next step: split the pandas column on that separator string.

1    1112309
0      41416
2      35757
Name: commodity_transaction, dtype: int64

In [8]:
# (Interactive `?str.split` help lookup removed: it is IPython-only syntax and
# it documented the builtin str.split, whereas the split in the next cell uses
# pandas' Series.str.split — see the pandas docs for its `pat` / `expand` arguments.)

In [9]:
# Break commodity_transaction into separate pieces, splitting on either
# " - " (hyphen) or " – " (en dash); expand=True puts each piece in its own column.
split_commodities = df["commodity_transaction"].str.split(" - | – ", expand=True)
split_commodities.head()

Unnamed: 0,0,1,2
0,Additives and Oxygenates,Exports,
1,Additives and Oxygenates,Exports,
2,Additives and Oxygenates,Exports,
3,Additives and Oxygenates,Exports,
4,Additives and Oxygenates,Exports,


In [10]:
# Replace the positional column labels (0, 1, 2) with descriptive names.
split_commodities.columns = [
    "commodity",
    "transaction_type",
    "additional_transaction_info",
]
split_commodities.head()

Unnamed: 0,commodity,transaction_type,additional_transaction_info
0,Additives and Oxygenates,Exports,
1,Additives and Oxygenates,Exports,
2,Additives and Oxygenates,Exports,
3,Additives and Oxygenates,Exports,
4,Additives and Oxygenates,Exports,


In [11]:
# Inspect the lower-cased commodity names: they already look pretty good.
pd.set_option("display.max_rows", 100)
split_commodities["commodity"].str.lower().value_counts()

electricity                                                  165140
gas oil/ diesel oil                                           97441
fuel oil                                                      75132
natural gas (including lng)                                   64161
motor gasoline                                                53198
fuelwood                                                      52032
liquefied petroleum gas (lpg)                                 49078
other kerosene                                                43466
hard coal                                                     40214
kerosene-type jet fuel                                        34558
charcoal                                                      28000
other bituminous coal                                         23138
other oil products n.e.c.                                     22081
lubricants                                                    21374
conventional crude oil                          

In [12]:
# Cleaning transaction_type
pd.set_option("display.max_rows", 250)

# Normalise case and surrounding whitespace, then apply spelling corrections:
#   "transformatin"   -> "transformation"
#   "non energy uses" -> "consumption for non-energy uses"
#   " /" and "/ "     -> "/"
#
# regex=False is passed explicitly so every replacement is a literal substring
# replacement, independent of the pandas version's default for `regex`.
#
# NOTE(review): the "non energy uses" replacement is substring-based, so a
# value that already reads "consumption for non energy uses" would end up with
# a doubled "consumption for" prefix — check the value counts below to confirm
# no such value exists.
split_commodities["transaction_type"] = (
    split_commodities["transaction_type"]
    .str.lower()
    .str.strip()
    .str.replace("transformatin", "transformation", regex=False)
    .str.replace("non energy uses", "consumption for non-energy uses", regex=False)
    .str.replace(" /", "/", regex=False)
    .str.replace("/ ", "/", regex=False)
)

split_commodities["transaction_type"].value_counts()

total energy supply                                                                    89806
final consumption                                                                      73149
final energy consumption                                                               68195
imports                                                                                61151
production                                                                             61129
consumption by other                                                                   43549
exports                                                                                36052
transformation                                                                         35725
stock changes                                                                          32816
consumption by households                                                              32367
production from refineries                                            

In [13]:
# Inspect the third split column (lower-cased) to make sure it looks good.
split_commodities["additional_transaction_info"].str.lower().value_counts()

main activity producers    18462
autoproducers              15788
electricity plants          5887
chp plants                  3974
heat plants                 1854
total                       1507
Name: additional_transaction_info, dtype: int64

In [14]:
# Add the cleaned columns to the output dataframe.
# assign() is used instead of pd.concat(..., axis=1) so that re-running this
# cell overwrites the three columns rather than appending duplicate copies.
# Values are aligned on the row index, and the new columns are appended in the
# order given here.
df = df.assign(
    commodity=split_commodities["commodity"].str.lower(),
    transaction_type=split_commodities["transaction_type"],
    additional_transaction_info=split_commodities["additional_transaction_info"].str.lower(),
)

In [15]:
# Preview the dataframe with the three cleaned columns appended.
df.head()

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category,commodity,transaction_type,additional_transaction_info
0,Austria,Additives and Oxygenates - Exports,1996,"Metric tons, thousand",5.0,,additives_and_oxygenates,additives and oxygenates,exports,
1,Austria,Additives and Oxygenates - Exports,1995,"Metric tons, thousand",17.0,,additives_and_oxygenates,additives and oxygenates,exports,
2,Belgium,Additives and Oxygenates - Exports,2014,"Metric tons, thousand",0.0,,additives_and_oxygenates,additives and oxygenates,exports,
3,Belgium,Additives and Oxygenates - Exports,2013,"Metric tons, thousand",0.0,,additives_and_oxygenates,additives and oxygenates,exports,
4,Belgium,Additives and Oxygenates - Exports,2012,"Metric tons, thousand",35.0,,additives_and_oxygenates,additives and oxygenates,exports,


In [16]:
# Write the cleaned dataframe (including its index, as before) to disk.
# Passing the path straight to to_csv avoids building the entire CSV as one
# in-memory string first, and lets pandas handle the file's encoding
# (UTF-8 by default) and newlines instead of the platform's text-mode defaults.
df.to_csv("cleaned_energy_data.csv")