In [1]:
import numpy as np
import pandas as pd

class ReadData():
    """
    import pandas at first
    Usage:
    df = ReadData(file_name, url).createDF() 
    """
    def __init__(self, file_name, 
                 url = None,
                 org_name = 'good-food', 
                 repo_name = 'datamap2019', 
                 folder_path = '/data/', 
                 mode = 'wb'
                ):
        
        self.file_name = file_name
        self.org_name = org_name
        self.repo_name = repo_name
        self.folder_path = folder_path
        
        self.url = url
        self.mode = mode
        
        self.root_path = 'https://github.com/'
        self.raw_path = 'https://raw.githubusercontent.com/'
        self.branch = 'master'

    def getPath(self):
        if self.url == None:
            path = self.root_path + self.org_name + '/' + self.repo_name + '/blob/' + self.branch + self.folder_path + self.file_name
            raw_path = self.raw_path + self.org_name + '/' + self.repo_name + '/' + self.branch + self.folder_path + self.file_name
            print('Url: ', path)
            print('Raw data:', raw_path)
        else:
            print('Url: ', self.url)
            print('Raw data:', self.url)
        return raw_path

    def curl(self):
        import urllib.request, urllib.parse, urllib.error
        
        url = self.getPath()
        response = urllib.request.urlopen(url).read()
        with open(self.file_name, self.mode) as file_handle:
            file_handle.write(response)

    def delFile(self):
        import os
        if os.path.exists(self.file_name): os.remove(self.file_name)
        else: print('no such file:%s'%self.file_name)

    def createDF(self):
        import time
        print(self.file_name, 'is under dealing... needs 1 min')
        self.curl()
        time.sleep(10)
        df = pd.read_csv(self.file_name)
        self.delFile()
        return df

In [2]:
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

In [3]:
# input
item = 'product_livestock_cn'

# output
data_stock = item + '_stock.csv'
data_primary = item + '_primary.csv'

In [4]:
# If you haven't cloned the whole repository from 
# [Github](https://github.com/good-food/datamap2019)
# please uncomment and run the top 2 lines 
# or else, run the bottom 2 lines or clone the reporsitory at first.
# ------------------------------------------------------------------

# df_stock = ReadData(data_stock).createDF()
# df_primary = ReadData(data_primary).createDF()

df_stock = pd.read_csv('./data/'+data_stock)
df_primary = pd.read_csv('./data/'+data_primary)

In [5]:
df_prod = pd.concat([df_stock, df_primary], axis=0,join='outer')
print('shape of df_stock.shape is {}\nshape of df_primary.shape is {}\nshape of df_prod.shape is {}'.format(df_stock.shape, df_primary.shape, df_prod.shape))

shape of df_stock.shape is (1368, 14)
shape of df_primary.shape is (7090, 14)
shape of df_prod.shape is (8458, 14)


In [6]:
df_prod.head()

Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QA,Live Animals,351,China,5111,Stocks,1171,Animals live nes,1961,1961,Head,415450.0,A,"Aggregate, may include official, semi-official, estimated or calculated data"
1,QA,Live Animals,351,China,5111,Stocks,1171,Animals live nes,1962,1962,Head,587400.0,A,"Aggregate, may include official, semi-official, estimated or calculated data"
2,QA,Live Animals,351,China,5111,Stocks,1171,Animals live nes,1963,1963,Head,587665.0,A,"Aggregate, may include official, semi-official, estimated or calculated data"
3,QA,Live Animals,351,China,5111,Stocks,1171,Animals live nes,1964,1964,Head,510730.0,A,"Aggregate, may include official, semi-official, estimated or calculated data"
4,QA,Live Animals,351,China,5111,Stocks,1171,Animals live nes,1965,1965,Head,255065.0,A,"Aggregate, may include official, semi-official, estimated or calculated data"


In [7]:
df_prod.drop(columns = ['Domain Code', 'Area Code', 'Element Code', 'Item Code', 'Year Code', 'Flag'], inplace=True)
df_prod.head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
0,Live Animals,China,Stocks,Animals live nes,1961,Head,415450.0,"Aggregate, may include official, semi-official, estimated or calculated data"
1,Live Animals,China,Stocks,Animals live nes,1962,Head,587400.0,"Aggregate, may include official, semi-official, estimated or calculated data"
2,Live Animals,China,Stocks,Animals live nes,1963,Head,587665.0,"Aggregate, may include official, semi-official, estimated or calculated data"
3,Live Animals,China,Stocks,Animals live nes,1964,Head,510730.0,"Aggregate, may include official, semi-official, estimated or calculated data"
4,Live Animals,China,Stocks,Animals live nes,1965,Head,255065.0,"Aggregate, may include official, semi-official, estimated or calculated data"


In [8]:
df = df_prod.dropna()
df.describe(include='all')

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
count,7919,7919,7919,7919,7919.0,7919,7919.0,7919
unique,2,1,8,66,,9,,2
top,Livestock Primary,China,Production,"Meat, buffalo",,tonnes,,"Aggregate, may include official, semi-official, estimated or calculated data"
freq,6951,7919,3337,171,,2457,,5742
mean,,,,,1988.465084,,15139200.0,
std,,,,,16.057723,,62554010.0,
min,,,,,1961.0,,0.0,
25%,,,,,1975.0,,7500.0,
50%,,,,,1989.0,,144684.0,
75%,,,,,2002.0,,2002374.0,


In [9]:
df.groupby('Element').count()

Unnamed: 0_level_0,Domain,Area,Item,Year,Unit,Value,Flag Description
Element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Laying,57,57,57,57,57,57,57
Milk Animals,285,285,285,285,285,285,285
Prod Popultn,57,57,57,57,57,57,57
Producing Animals/Slaughtered,1038,1038,1038,1038,1038,1038,1038
Production,3337,3337,3337,3337,3337,3337,3337
Stocks,968,968,968,968,968,968,968
Yield,611,611,611,611,611,611,611
Yield/Carcass Weight,1566,1566,1566,1566,1566,1566,1566


In [10]:
df.groupby('Item').count()

Unnamed: 0_level_0,Domain,Area,Element,Year,Unit,Value,Flag Description
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Animals live nes,56,56,56,56,56,56,56
Asses,57,57,57,57,57,57,57
Beehives,57,57,57,57,57,57,57
Buffaloes,57,57,57,57,57,57,57
Camels,57,57,57,57,57,57,57
Cattle,57,57,57,57,57,57,57
Chickens,57,57,57,57,57,57,57
Ducks,57,57,57,57,57,57,57
"Eggs, hen, in shell",171,171,171,171,171,171,171
"Eggs, hen, in shell (number)",57,57,57,57,57,57,57


In [11]:
elements = df['Element'].unique()
elements

array(['Stocks', 'Laying', 'Yield', 'Production',
       'Producing Animals/Slaughtered', 'Prod Popultn',
       'Yield/Carcass Weight', 'Milk Animals'], dtype=object)

In [12]:
df = df.loc[df['Element'].isin(['Stocks', 'Producing Animals/Slaughtered', 'Production', 'Yield/Carcass Weight'])]
df.head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
0,Live Animals,China,Stocks,Animals live nes,1961,Head,415450.0,"Aggregate, may include official, semi-official, estimated or calculated data"
1,Live Animals,China,Stocks,Animals live nes,1962,Head,587400.0,"Aggregate, may include official, semi-official, estimated or calculated data"
2,Live Animals,China,Stocks,Animals live nes,1963,Head,587665.0,"Aggregate, may include official, semi-official, estimated or calculated data"
3,Live Animals,China,Stocks,Animals live nes,1964,Head,510730.0,"Aggregate, may include official, semi-official, estimated or calculated data"
4,Live Animals,China,Stocks,Animals live nes,1965,Head,255065.0,"Aggregate, may include official, semi-official, estimated or calculated data"


In [16]:
df = df.loc[df['Item'].isin([
    # stock
    'Cattle', 'Camels', 'Horses', 'Mules',                          
    'Goats', 'Sheep', 'Pigs',
    'Chickens', 'Ducks', 'Geese and guinea fowls',
    # meat
    'Meat, cattle', 'Meat, camel', 'Meat, horse', 'Meat, mules',
    'Meat, goat', 'Meat, sheep', 'Meat, pig',
    'Meat, chicken', 'Meat, duck', 'Meat, goose and guinea fowl',
    # egg
    'Eggs, hen, in shell', 'Eggs, other bird, in shell',
    # milk
    'Milk, whole fresh cow', 'Milk, whole fresh camel',
    'Milk, whole fresh goat', 'Milk, whole fresh sheep'])]
df.shape

(2451, 8)

In [17]:
df.groupby(['Element', 'Item']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Domain,Area,Year,Unit,Value,Flag Description
Element,Item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Producing Animals/Slaughtered,"Meat, camel",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, cattle",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, chicken",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, duck",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, goat",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, goose and guinea fowl",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, horse",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, pig",57,57,57,57,57,57
Producing Animals/Slaughtered,"Meat, sheep",57,57,57,57,57,57
Production,"Eggs, hen, in shell",57,57,57,57,57,57


In [18]:
df['Unit'].unique()

array(['Head', '1000 Head', 'tonnes', 'hg/An', '0.1g/An'], dtype=object)

In [19]:
df.loc[df['Unit']=='Head'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
285,Live Animals,China,Stocks,Camels,1961,Head,381000.0,"Aggregate, may include official, semi-official, estimated or calculated data"
286,Live Animals,China,Stocks,Camels,1962,Head,378000.0,"Aggregate, may include official, semi-official, estimated or calculated data"
287,Live Animals,China,Stocks,Camels,1963,Head,386000.0,"Aggregate, may include official, semi-official, estimated or calculated data"
288,Live Animals,China,Stocks,Camels,1964,Head,402000.0,"Aggregate, may include official, semi-official, estimated or calculated data"
289,Live Animals,China,Stocks,Camels,1965,Head,425000.0,"Aggregate, may include official, semi-official, estimated or calculated data"


In [20]:
df.loc[df['Unit']=='1000 Head'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
456,Live Animals,China,Stocks,Chickens,1961,1000 Head,540850.0,"Aggregate, may include official, semi-official, estimated or calculated data"
457,Live Animals,China,Stocks,Chickens,1962,1000 Head,554423.0,"Aggregate, may include official, semi-official, estimated or calculated data"
458,Live Animals,China,Stocks,Chickens,1963,1000 Head,570972.0,"Aggregate, may include official, semi-official, estimated or calculated data"
459,Live Animals,China,Stocks,Chickens,1964,1000 Head,586801.0,"Aggregate, may include official, semi-official, estimated or calculated data"
460,Live Animals,China,Stocks,Chickens,1965,1000 Head,590895.0,"Aggregate, may include official, semi-official, estimated or calculated data"


In [21]:
df.loc[df['Unit']=='tonnes'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
114,Livestock Primary,China,Production,"Eggs, hen, in shell",1961,tonnes,1211565.0,"Aggregate, may include official, semi-official, estimated or calculated data"
115,Livestock Primary,China,Production,"Eggs, hen, in shell",1962,tonnes,1229591.0,"Aggregate, may include official, semi-official, estimated or calculated data"
116,Livestock Primary,China,Production,"Eggs, hen, in shell",1963,tonnes,1247638.0,"Aggregate, may include official, semi-official, estimated or calculated data"
117,Livestock Primary,China,Production,"Eggs, hen, in shell",1964,tonnes,1282830.0,"Aggregate, may include official, semi-official, estimated or calculated data"
118,Livestock Primary,China,Production,"Eggs, hen, in shell",1965,tonnes,1313836.0,"Aggregate, may include official, semi-official, estimated or calculated data"


In [22]:
df.loc[df['Unit']=='hg/An'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
3649,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1961,hg/An,2200.0,Calculated data
3650,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1962,hg/An,2200.0,Calculated data
3651,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1963,hg/An,2200.0,Calculated data
3652,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1964,hg/An,2200.0,Calculated data
3653,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1965,hg/An,2200.0,Calculated data


In [23]:
df.loc[df['Unit']=='0.1g/An'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
3991,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1961,0.1g/An,10144.0,Calculated data
3992,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1962,0.1g/An,10141.0,Calculated data
3993,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1963,0.1g/An,10144.0,Calculated data
3994,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1964,0.1g/An,10146.0,Calculated data
3995,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1965,0.1g/An,10149.0,Calculated data


In [24]:
df.loc[df['Unit'] =='Head','Value']=df['Value']/10000
df.loc[df['Unit']=='Head', 'Unit']= '10000 Head'

df.loc[df['Unit'] =='1000 Head','Value']=df['Value']/10
df.loc[df['Unit']=='1000 Head', 'Unit']= '10000 Head'

df.loc[df['Unit'] =='tonnes','Value']=df['Value']/10000
df.loc[df['Unit']=='tonnes', 'Unit']= '10000 tonnes'

df.loc[df['Unit'] =='hg/An','Value']=df['Value']/10
df.loc[df['Unit']=='hg/An', 'Unit']= 'kg/An'

df.loc[df['Unit'] =='0.1g/An','Value']=df['Value']/10
df.loc[df['Unit']=='0.1g/An', 'Unit']= 'g/An'

In [25]:
df['Unit'].unique()

array(['10000 Head', '10000 tonnes', 'kg/An', 'g/An'], dtype=object)

In [26]:
df.loc[df['Unit']=='10000 Head'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
285,Live Animals,China,Stocks,Camels,1961,10000 Head,38.1,"Aggregate, may include official, semi-official, estimated or calculated data"
286,Live Animals,China,Stocks,Camels,1962,10000 Head,37.8,"Aggregate, may include official, semi-official, estimated or calculated data"
287,Live Animals,China,Stocks,Camels,1963,10000 Head,38.6,"Aggregate, may include official, semi-official, estimated or calculated data"
288,Live Animals,China,Stocks,Camels,1964,10000 Head,40.2,"Aggregate, may include official, semi-official, estimated or calculated data"
289,Live Animals,China,Stocks,Camels,1965,10000 Head,42.5,"Aggregate, may include official, semi-official, estimated or calculated data"


In [27]:
df.loc[df['Unit']=='10000 tonnes'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
114,Livestock Primary,China,Production,"Eggs, hen, in shell",1961,10000 tonnes,121.1565,"Aggregate, may include official, semi-official, estimated or calculated data"
115,Livestock Primary,China,Production,"Eggs, hen, in shell",1962,10000 tonnes,122.9591,"Aggregate, may include official, semi-official, estimated or calculated data"
116,Livestock Primary,China,Production,"Eggs, hen, in shell",1963,10000 tonnes,124.7638,"Aggregate, may include official, semi-official, estimated or calculated data"
117,Livestock Primary,China,Production,"Eggs, hen, in shell",1964,10000 tonnes,128.283,"Aggregate, may include official, semi-official, estimated or calculated data"
118,Livestock Primary,China,Production,"Eggs, hen, in shell",1965,10000 tonnes,131.3836,"Aggregate, may include official, semi-official, estimated or calculated data"


In [28]:
df.loc[df['Unit']=='kg/An'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
3649,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1961,kg/An,220.0,Calculated data
3650,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1962,kg/An,220.0,Calculated data
3651,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1963,kg/An,220.0,Calculated data
3652,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1964,kg/An,220.0,Calculated data
3653,Livestock Primary,China,Yield/Carcass Weight,"Meat, camel",1965,kg/An,220.0,Calculated data


In [29]:
df.loc[df['Unit']=='g/An'].head()

Unnamed: 0,Domain,Area,Element,Item,Year,Unit,Value,Flag Description
3991,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1961,g/An,1014.4,Calculated data
3992,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1962,g/An,1014.1,Calculated data
3993,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1963,g/An,1014.4,Calculated data
3994,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1964,g/An,1014.6,Calculated data
3995,Livestock Primary,China,Yield/Carcass Weight,"Meat, chicken",1965,g/An,1014.9,Calculated data


In [30]:
df.shape

(2451, 8)

In [33]:
df.to_csv('./data/product_livestock_cn_clean.csv', index=False)