In [1]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import pickle
from collections import Counter
from sklearn.preprocessing import Imputer
%matplotlib inline

In [2]:
# Read the livestock and crop variable-description metadata tables from CSV.
# NOTE: both names are rebound by the pickle load in the following cell, so
# these CSV-backed frames are not the ones used by the rest of the notebook.
livestock_desc, crops_desc = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/metadata/var_desc_livestock.csv",
        "data/metadata/var_desc_crops.csv",
    )
)

In [3]:
# Load the cleaned variable-description tables from pickled files.
# `with` guarantees each file handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project.
with open('data/clean/var_desc_livestock.p', 'rb') as pickle_file:
    livestock_desc = pickle.load(pickle_file)
with open('data/clean/var_desc_crops.p', 'rb') as pickle_file:
    crops_desc = pickle.load(pickle_file)

In [4]:
# Coerce the descriptions to plain strings so they can be tokenised with
# str.split() further down. Missing descriptions are replaced by '' first:
# the 'string' dtype alias is version-dependent (pandas >= 1.0 maps it to the
# nullable StringDtype, which keeps missing values as pd.NA, and pd.NA cannot
# be split), whereas fillna('').astype(str) always yields real strings.
livestock_desc['Description'] = livestock_desc['Description'].fillna('').astype(str)
crops_desc['Description'] = crops_desc['Description'].fillna('').astype(str)

In [5]:
# Preview the first rows of the livestock description table
livestock_desc.head()

Unnamed: 0,Item Code,Item,Description,HS Code,HS07 Code,HS12 Code,CPC Code
0,2946,Animal fats,,,,,
1,2941,Animal Products,,,,,
2,2769,"Aquatic Animals, Others","Default composition: 1587 Aqutc Anim F, 1588 A...",,,,
3,2775,Aquatic Plants,"Default composition: 1594 Aquatic plants, fres...",,,,
4,2961,"Aquatic Products, Other",,,,,


In [6]:
# Preview the first rows of the crops description table
crops_desc.head()

Unnamed: 0,Item Code,Item,Description,HS Code,HS07 Code,HS12 Code,CPC Code
0,2924,Alcoholic Beverages,,,,,
1,2617,Apples and products,"Default composition: 515 Apples, 518 Juice, ap...",,,,
2,2615,Bananas,Default composition: 486 Bananas,,,,
3,2513,Barley and products,"Default composition: 44 Barley, 45 Barley, pot...",,,,
4,2546,Beans,"Default composition: 176 Beans, dry",,,,


We would like to assess whether multiple items contain a given code in their descriptions, which would result in double counting that code.

In [13]:
def _collect_codes(desc_df):
    """Tally the numeric codes mentioned in each item's description.

    A "code" is any whitespace-separated token of the description that
    consists only of digits.

    Parameters
    ----------
    desc_df : pandas.DataFrame
        Table with an 'Item' column and a 'Description' column.

    Returns
    -------
    code_counter : collections.Counter
        Maps each code to the number of items whose description mentions it.
    item_codes : dict
        Maps each item name to the list of codes found in its description
        (in order of appearance). Items without any code are omitted; if two
        rows share an item name, the later row wins.
    """
    code_counter = Counter()
    item_codes = {}
    # iterate the two columns together instead of re-indexing the frame per row
    for item, description in zip(desc_df['Item'], desc_df['Description']):
        # a missing description (NaN / pd.NA) has no codes to contribute
        if pd.isnull(description):
            continue
        codes = [int(token) for token in description.split() if token.isdigit()]
        if codes:
            # count each code once per item, so a code that is merely repeated
            # inside a single description is not reported as shared
            code_counter.update(set(codes))
            item_codes[item] = codes
    return code_counter, item_codes


def _redundant_codes(code_counter):
    """Return, in ascending order, the codes that appear in more than one item."""
    return sorted(code for code, n_items in code_counter.items() if n_items > 1)


# counters for each code in the descriptions, and item -> codes mappings
livestock_counter, livestock_dict = _collect_codes(livestock_desc)
crops_counter, crops_dict = _collect_codes(crops_desc)

# store redundant livestock codes
redundant_livestock_codes = _redundant_codes(livestock_counter)
# store redundant crop codes
redundant_crop_codes = _redundant_codes(crops_counter)

# report redundant item codes for livestock and crops
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print("Redundant Livestock codes:")
for code in redundant_livestock_codes:
    print(code)

print("Redundant Crop codes:")
for code in redundant_crop_codes:
    print(code)

Redundant Livestock codes:
Redundant Crop codes:
27
33
35
154
155
160
161
162
163
165
166
167
172
173
242
567
568


In [12]:
# List every (crop item, code) pair whose code is shared with another crop item.
# A set gives constant-time membership tests inside the nested loop.
redundant_crop_code_set = set(redundant_crop_codes)
for item, codes in crops_dict.items():
    for code in codes:
        if code in redundant_crop_code_set:
            # single-argument print(...) works on both Python 2 and Python 3
            print(item + ": " + str(code))

Vegetables, Other: 567
Vegetables, Other: 568
Fruits, Other: 567
Fruits, Other: 568
Sugar, Refined Equiv: 162
Sugar (Raw Equivalent): 162
Groundnuts (in Shell Eq): 242
Sugar, Raw Equivalent: 154
Sugar, Raw Equivalent: 155
Sugar, Raw Equivalent: 160
Sugar, Raw Equivalent: 161
Sugar, Raw Equivalent: 162
Sugar, Raw Equivalent: 163
Sugar, Raw Equivalent: 166
Sugar, Raw Equivalent: 167
Sugar, Raw Equivalent: 172
Sugar, Raw Equivalent: 173
Molasses: 165
Sugar non-centrifugal: 163
Sweeteners, Other: 154
Sweeteners, Other: 155
Sweeteners, Other: 160
Sweeteners, Other: 161
Sweeteners, Other: 165
Sweeteners, Other: 166
Sweeteners, Other: 167
Sweeteners, Other: 172
Sweeteners, Other: 173
Rice (Paddy Equivalent): 27
Rice (Paddy Equivalent): 33
Rice (Paddy Equivalent): 35
Groundnuts (Shelled Eq): 242
Rice (Milled Equivalent): 27
Rice (Milled Equivalent): 33
Rice (Milled Equivalent): 35


There were no redundancies for livestock.
For crops, redundancies were:

* 567: Watermelon
    * Vegetables, Other
    * Fruits, Other
* 568: Melon
    * Vegetables, Other
    * Fruits, Other
* 242: Groundnuts
    * Groundnuts (in Shell Eq)	
    * Groundnuts (Shelled Eq)	
    * Oilcrops Oil, Other
* 154: Fructose chemically pure
    * Sugar, Raw Equivalent	
    * Sweeteners, Other	
* 155: Maltose chemically pure
    * Sugar, Raw Equivalent
    * Sweeteners, Other
* 160: Maple sugar and syrups
    * Sugar, Raw Equivalent	
    * Sweeteners, Other	
* 161: Sugar crops, nes
    * Sugar, Raw Equivalent
    * Sweeteners, Other
* 162: Sugar Raw Centrifugal
    * Sugar (Raw Equivalent)
    * Sugar, Raw Equivalent	
    * Sugar, Refined Equiv	
* 163: Sugar non-centrifugal
    * Sugar non-centrifugal
    * Sugar, Raw Equivalent	
* 165: Molasses
    * Molasses
    * Sweeteners, Other
* 166: Fructose and syrup, other
    * Sugar, Raw Equivalent
    * Sweeteners, Other
* 167: Sugar, nes
    * Sugar, Raw Equivalent	
    * Sweeteners, Other	
* 172: Glucose and dextrose
    * Sugar, Raw Equivalent	
    * Sweeteners, Other	
* 173: Lactose
    * Sugar, Raw Equivalent
    * Sweeteners, Other	
* 27: Rice
    * Rice (Milled Equivalent)
    * Rice (Paddy Equivalent)
* 33: Gluten
    * Rice (Milled Equivalent)	
    * Rice (Paddy Equivalent)	
* 35: Bran, rice
    * Rice (Milled Equivalent)	
    * Rice (Paddy Equivalent)
* 242: Groundnuts
    * Groundnuts (in Shell Eq)	
    * Groundnuts (Shelled Eq)
    * Oilcrops Oil, Other	



* We will keep both `Vegetables, Other` and `Fruits, Other` because both have numerous other constituent crops besides watermelon and melon.
* We will drop `Groundnuts (in Shell Eq)` since `Groundnuts (Shelled Eq)` contains all crops in `Groundnuts (in Shell Eq)`
* We will drop `Sweeteners, Other` (the "others" in that category were already dropped because of many NaNs) and keep `Sugar, Raw Equivalent`: the two have a large number of subcategories in common, and `Sugar, Raw Equivalent` seems to contain the more relevant ones
* We will drop `Sugar, Refined Equiv` because it only contains one subcategory that is already contained in `Sugar, Raw Equivalent`
* We will keep `Sugar (Raw Equivalent)` because it only has one subcategory in common with `Sugar, Raw Equivalent` and has many additional subcategories that may be relevant
* `Rice (Milled Equivalent)` contains all the subcategories in `Rice (Paddy Equivalent)` plus some additional ones, so we will drop `Rice (Paddy Equivalent)`
* We will keep `Oilcrops Oil, Other` because it only has one subcategory in common with `Groundnuts (Shelled Eq)` and contains many unique subcategories