# Livestock Production

原始数据来源与维护见 [READM.md](https://github.com/good-food/datamap2019/blob/master/README.md)

In [1]:
class ReadData():
    """
    # Usage:

    # input:
    file_name = 'file_name'
    url = None
    data = ReadGithubData(file_name, url)

    # output:
    rawpath = data.getRawPath()
    path = data.getPath()
    print('rawpath:', rawpath)
    print('path:', path)
    df = data.createDF()   
    """

    def __init__(self, file_name, 
                 url = None,
                 org_name = 'good-food', 
                 repo_name = 'datamap2019', 
                 folder_path = '/data/', 
                 mode = 'wb'
                ):
        
        self.file_name = file_name
        self.org_name = org_name
        self.repo_name = repo_name
        self.folder_path = folder_path
        
        self.url = url
        self.mode = mode
        
        self.root_path = 'https://github.com/'
        self.raw_path = 'https://raw.githubusercontent.com/'
        self.branch = 'master'

    def getPath(self):
        if self.url == None:
            url = self.root_path + self.org_name + '/' + self.repo_name + '/blob/' + self.branch + self.folder_path + self.file_name
        else:
            url = self.url
        return url

    def getRawPath(self):
        if self.url == None:
            url = self.raw_path + self.org_name + '/' + self.repo_name + '/' + self.branch + self.folder_path + self.file_name
        else:
            url = self.url
        return url

    def curl(self):
        import urllib.request, urllib.parse, urllib.error
        
        url = self.getRawPath()
        
        response = urllib.request.urlopen(url).read()
        with open(self.file_name, self.mode) as file_handle:
            file_handle.write(response)

    def delFile(self):
        import os

        if os.path.exists(self.file_name):
            os.remove(self.file_name)

        else:
            print('no such file:%s'%self.file_name)

    def createDF(self):
        import pandas as pd

        self.curl()

        df = pd.read_csv(self.file_name)
        self.delFile()
        return df

## Data Pre-processing

In [2]:
# input:
item = 'product_livestock'
year_now = 2019

# Read in data from GitHub repository

**Data Frame**

In [3]:
data_name = item + '.csv'
print(data_name, 'is under dealing...')

data = ReadData(data_name)

data_rawpath = data.getRawPath()
data_path = data.getPath()
df_data = data.createDF()

print('\n')
print('rawpath of data:', data_rawpath)
print('path of data:', data_path)
df_data.head()

product_livestock.csv is under dealing...


rawpath of data: https://raw.githubusercontent.com/good-food/datamap2019/master/data/product_livestock.csv
path of data: https://github.com/good-food/datamap2019/blob/master/data/product_livestock.csv


Unnamed: 0,AreaName,Year,Stock_largeAni,Stock_cattle,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Out_hog,Stock_hog,...,MeatYeild_gt_shp,MilkYeild,MilkYeild_cattle,WoolYeild_sheep,WoolYeild_sheep_fine,WoolYeild_sheep_semi,WoolYeild_goat_thick,WoolYeild_goat_cashmere,EggYeild,HoneyYeild
0,China,1996,13360.6,11031.8,871.5,944.4,478.0,34.9,41225.1,36283.6,...,181.0,735.8,629.4,298102.0,121020.0,74099.0,35255.0,9585.0,1965.2,18.4
1,China,1997,14541.8,12175.7,891.2,952.8,480.6,35.0,46483.7,40034.8,...,212.8,681.1,601.1,255059.0,116054.0,55683.0,25865.0,8626.0,1895.3,21.1
2,China,1998,14803.2,12441.9,898.1,955.8,473.9,33.5,50215.1,42256.3,...,234.6,745.4,662.9,277545.0,115752.0,68775.0,31417.0,9799.0,2021.3,20.7
3,China,1999,15024.75,12698.34,891.41,934.77,467.27,32.96,51977.2,43144.2,...,251.2643,806.9073,717.5939,283152.0,114103.0,73700.0,31849.0,10179.65,2134.667,22.9907
4,China,2000,15151.51,12866.34,876.58,922.73,453.03,32.62,52673.34,44681.54,...,273.958296,919.118908,827.43079,292502.0,117386.0,84921.0,33266.0,11057.0,2243.3,24.6


**Data Explanations**

In [4]:
doc_name = item + '_doc.csv'
print(doc_name, 'is under dealing...')

doc = ReadData(doc_name)

doc_rawpath = doc.getRawPath()
doc_path = doc.getPath()
df_doc = doc.createDF()

print('\n')
print('rawpath of documentation:', doc_rawpath)
print('path of documentation:', doc_path)
df_doc

product_livestock_doc.csv is under dealing...


rawpath of documentation: https://raw.githubusercontent.com/good-food/datamap2019/master/data/product_livestock_doc.csv
path of documentation: https://github.com/good-food/datamap2019/blob/master/data/product_livestock_doc.csv


Unnamed: 0,Feild name,中文解释,Explanation,单位: Unit
0,AreaName,地区 (省级),Area (provincial),Nan
1,Year,年份,Year,Nan
2,Stock_largeAni,大动物存栏量 (牛&马&驴&骡&骆驼),Stock of cattle&hores&donkey&mule&camel,万只: 10000 #
3,Stock_cattle,牛存栏量,Stock of cattle,万只: 10000 #
4,Stock_horse,马存栏量,Stock of horse,万只: 10000 #
5,Stock_donkey,驴存栏量,Stock of donkey,万只: 10000 #
6,Stock_mule,骡存栏量,Stock of mule,万只: 10000 #
7,Stock_camel,骆驼存栏量,Stock of camel,万只: 10000 #
8,Out_hog,猪出栏量,Output of hog,万只: 10000 #
9,Stock_hog,猪存栏量,Stock of hog,万只: 10000 #


## Data Outline

### Checking data shape

In [5]:
df_data.shape

(642, 30)

In [6]:
colunms = df_data.columns.values
print('Column names =',colunms)

Column names = ['AreaName' 'Year' 'Stock_largeAni' 'Stock_cattle' 'Stock_horse'
 'Stock_donkey' 'Stock_mule' 'Stock_camel' 'Out_hog' 'Stock_hog'
 'Stock_gt_shp' 'Stock_goat' 'Stock_sheep' 'Out_cattle' 'Out_gt_shp'
 'Out_poultry' 'MeatYeild' 'MeatYeild_hg_ct_gt_shp' 'MeatYeild_hog'
 'MeatYeild_cattle' 'MeatYeild_gt_shp' 'MilkYeild' 'MilkYeild_cattle'
 'WoolYeild_sheep' 'WoolYeild_sheep_fine' 'WoolYeild_sheep_semi'
 'WoolYeild_goat_thick' 'WoolYeild_goat_cashmere' 'EggYeild' 'HoneyYeild']


In [7]:
areas = df_data['AreaName'].unique()
print('AreaNames =',areas)

AreaNames = ['China' 'Beijing' 'Tianjin' 'Hebei' 'Shanxi' 'Inner Mongolia' 'Liaoning'
 'Jilin' 'Heilongjiang' 'Shanghai' 'Jiangsu' 'Zhejiang' 'Anhui' 'Fujian'
 'Jiangxi' 'Shandong' 'Henan' 'Hubei' 'Hunan' 'Guangdong' 'Guangxi'
 'Hainan' 'Chongqing' 'Sichuan' 'Guizhou' 'Yunnan' 'Tibet' 'Shaanxi'
 'Gansu' 'Qinghai' 'Ningxia' 'Xinjiang']


In [8]:
years = df_data['Year'].unique()
year_newest = years.max()

if year_newest+2 == year_now:
    print('The data is up to date to year', year_now)
else:
    print('The newest data of National Bureau of Statistics is updated to year', year_now)
    print('While our data is updated to year', year_newest+2)
    print('Pleas see the 1st line of this notebook to update data.')
    print('But you can still play with the old data.')
    
print('Year list =',years)

The data is up to date to year 2019
Year list = [1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
 2010 2011 2012 2013 2014 2015 2016 2017]


## Missing values

### Checking the missing valuse

In [9]:
# mask missing data with 'True'

df_null = df_data.isnull()
df_null.head()

Unnamed: 0,AreaName,Year,Stock_largeAni,Stock_cattle,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Out_hog,Stock_hog,...,MeatYeild_gt_shp,MilkYeild,MilkYeild_cattle,WoolYeild_sheep,WoolYeild_sheep_fine,WoolYeild_sheep_semi,WoolYeild_goat_thick,WoolYeild_goat_cashmere,EggYeild,HoneyYeild
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# find the columns where missing values exist
null_counts = {}
for column in df_data.columns.tolist():    
    null_count = df_null.groupby([column]).size()[False]
    if null_count < len(df_data.index.values): null_counts[column] = null_count
null_counts

{'Stock_horse': 561,
 'Stock_donkey': 529,
 'Stock_mule': 529,
 'Stock_camel': 158,
 'Stock_sheep': 544,
 'Out_cattle': 558,
 'Out_gt_shp': 558,
 'Out_poultry': 558,
 'MeatYeild_cattle': 633,
 'WoolYeild_sheep': 540,
 'WoolYeild_sheep_fine': 484,
 'WoolYeild_sheep_semi': 504,
 'WoolYeild_goat_thick': 559,
 'WoolYeild_goat_cashmere': 426,
 'HoneyYeild': 601}

### Dealing with missing values

**1. Except 'Out_cattle', 'Out_gt_shp', 'Out_poultry', the missing values in other columns because they are too small, replace them with zeros.**

In [11]:
columns = ['Stock_horse','Stock_donkey','Stock_mule','Stock_camel','Stock_sheep','MeatYeild_cattle',\
'WoolYeild_sheep','WoolYeild_sheep_fine','WoolYeild_sheep_semi','WoolYeild_goat_thick','WoolYeild_goat_cashmere',\
'HoneyYeild']

df_data[columns] = df_data[columns].fillna(0)
df_data[columns].describe(include='all')

Unnamed: 0,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Stock_sheep,MeatYeild_cattle,WoolYeild_sheep,WoolYeild_sheep_fine,WoolYeild_sheep_semi,WoolYeild_goat_thick,WoolYeild_goat_cashmere,HoneyYeild
count,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0
mean,44.209491,44.440794,19.939555,1.876654,960.496308,39.552913,23574.031009,8111.313219,7217.599774,2410.425487,994.260982,2.324778
std,123.495124,128.605836,60.566196,5.960201,2700.268734,104.787724,66989.492238,23963.64205,20118.954097,6589.1018,2916.48464,6.725113
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.224125,0.1925,0.160025,0.0,1.1075,5.30065,14.1325,0.0,3.66775,10.0,0.0,0.2
50%,4.48275,6.965,2.405,0.0,114.147458,13.07,3316.0,336.065,1148.045,405.5,34.7,0.638234
75%,40.23,32.25,13.425,0.0,612.221085,33.137062,14979.3475,3565.5,5200.75,1708.75,769.5514,1.27311
max,898.1,955.8,480.6,35.0,17088.22986,675.871024,413133.7695,132876.7011,137972.7359,40505.29747,18844.21205,55.528363


**2. The national total of 'Out_cattle', 'Out_gt_shp', 'Out_poultry' can be caculated by adding up the provices data after 1999 and before the newest year.**

In [12]:
import numpy as np

years = range(1999, year_newest)
areas = ['Beijing','Tianjin','Hebei','Shanxi','Inner Mongolia',\
          'Liaoning','Jilin','Heilongjiang',\
          'Shanghai','Jiangsu','Zhejiang','Anhui','Fujian','Jiangxi','Shandong',\
          'Henan','Hubei','Hunan','Guangdong','Guangxi','Hainan',\
          'Chongqing','Sichuan','Guizhou','Yunnan','Tibet',\
          'Shaanxi','Gansu','Qinghai','Ningxia','Xinjiang']

def cn_total(term):
    for year in years:
        vals = list()
        for area in areas:
            val = df_data[term][(df_data['Year']==year) & (df_data['AreaName']==area)].values.tolist()
            vals.extend(val)
        vals = np.array(vals)
        df_data[term][(df_data['Year']==year) & (df_data['AreaName']=='China')] = vals.sum()
    
cn_total('Out_cattle')
cn_total('Out_gt_shp')
cn_total('Out_poultry')

df_data[['AreaName', 'Year', 'Out_cattle','Out_gt_shp','Out_poultry']][(df_data['AreaName']=='China')]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,AreaName,Year,Out_cattle,Out_gt_shp,Out_poultry
0,China,1996,,,
1,China,1997,,,
2,China,1998,,,
3,China,1999,3766.19,18820.35,743165.05
4,China,2000,3964.81,20472.69,809857.07
5,China,2001,4118.37,21722.45,808834.8
6,China,2002,4401.12,23280.78,832894.09
7,China,2003,4703.03,25958.28,888587.77
8,China,2004,5018.9,28342.98,907021.77
9,China,2005,5287.64,30804.54,986491.81


**3. Before 1998 or in the newest year, the missing values in 'Out_cattle', 'Out_gt_shp', 'Out_poultry' because of the lacking of statistics , just leave them as NaN.**

## Save Clean Data

This data is clean, save it to a new file for data visualization.

In [13]:
df_data.to_csv('./data/product_livestock_clean.csv', index = False)