# Data Cleaning - Livestock Production - Areas

原始数据来源与维护见 [READM.md](https://github.com/good-food/datamap2019/blob/master/README.md)

In [1]:
import pandas as pd
import numpy as np

class ReadData():
    """
    # Usage:
    # df = ReadData(file_name, url).createDF() 
    """
    def __init__(self, file_name, 
                 url = None,
                 org_name = 'good-food', 
                 repo_name = 'datamap2019', 
                 folder_path = '/data/', 
                 mode = 'wb'
                ):
        
        self.file_name = file_name
        self.org_name = org_name
        self.repo_name = repo_name
        self.folder_path = folder_path
        
        self.url = url
        self.mode = mode
        
        self.root_path = 'https://github.com/'
        self.raw_path = 'https://raw.githubusercontent.com/'
        self.branch = 'master'

    def getPath(self):
        if self.url == None:
            path = self.root_path + self.org_name + '/' + self.repo_name + '/blob/' + self.branch + self.folder_path + self.file_name
            raw_path = self.raw_path + self.org_name + '/' + self.repo_name + '/' + self.branch + self.folder_path + self.file_name
            print('Url: ', path)
            print('Raw data:', raw_path)
        else:
            print('Url: ', self.url)
            print('Raw data:', self.url)
        return raw_path

    def curl(self):
        import urllib.request, urllib.parse, urllib.error
        
        url = self.getPath()
        response = urllib.request.urlopen(url).read()
        with open(self.file_name, self.mode) as file_handle:
            file_handle.write(response)

    def delFile(self):
        import os
        if os.path.exists(self.file_name): os.remove(self.file_name)
        else: print('no such file:%s'%self.file_name)

    def createDF(self):
        import pandas as pd
        import time
        print(self.file_name, 'is under dealing... needs 1 min')
        self.curl()
        time.sleep(10)
        df = pd.read_csv(self.file_name)
        self.delFile()
        return df

In [2]:
# input
item = 'product_livestock_areas'

data_name = item + '_raw.csv'
doc_name = item + '_doc.csv'

## Read Data from GitHub Repository
### Read Data and Explain Data Index

In [3]:
# If you haven't cloned the whole repository from [Github](https://github.com/good-food/datamap2019)
# please uncomment and the following cell to run.

# df_data = ReadData(data_name).createDF()
# df_doc = ReadData(doc_name).createDF()

df_data = pd.read_csv('./data/'+data_name)
# df_doc = pd.read_csv('./data/'+doc_name)

In [4]:
df_data.head()

Unnamed: 0,Area,Year,Stock_largeAni,Stock_cattle,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Out_pig,Stock_pig,...,MeatProd_gt_shp,MilkProd,MilkProd_cattle,WoolProd_sheep,WoolProd_sheep_fine,WoolProd_sheep_semi,WoolProd_goat_thick,WoolProd_goat_cashmere,EggProd,HoneyProd
0,China,1996,13360.6,11031.8,871.5,944.4,478.0,34.9,41225.1,36283.6,...,181.0,735.8,629.4,298102.0,121020.0,74099.0,35255.0,9585.0,1965.2,18.4
1,China,1997,14541.8,12175.7,891.2,952.8,480.6,35.0,46483.7,40034.8,...,212.8,681.1,601.1,255059.0,116054.0,55683.0,25865.0,8626.0,1895.3,21.1
2,China,1998,14803.2,12441.9,898.1,955.8,473.9,33.5,50215.1,42256.3,...,234.6,745.4,662.9,277545.0,115752.0,68775.0,31417.0,9799.0,2021.3,20.7
3,China,1999,15024.75,12698.34,891.41,934.77,467.27,32.96,51977.2,43144.2,...,251.2643,806.9073,717.5939,283152.0,114103.0,73700.0,31849.0,10179.65,2134.667,22.9907
4,China,2000,15151.51,12866.34,876.58,922.73,453.03,32.62,52673.34,44681.54,...,273.958296,919.118908,827.43079,292502.0,117386.0,84921.0,33266.0,11057.0,2243.3,24.6


### Checking data updates

In [5]:
import datetime

years = df_data['Year'].unique()
years.sort()
print('Year list =',years)

year_now = datetime.date.today().year
year_newest = years.max()
if year_newest+2 == year_now:
    print('The data is updated to year {}.'.format(year_now-1))
else:
    print('The newest data of National Bureau of Statistics is updated to year', year_now-1)
    print('While our data is updated to year', year_newest+1)
    print('Pleas check the 1st line of this notebook to update data.')
    print('But you can still play with the old data.')

Year list = [1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
 2010 2011 2012 2013 2014 2015 2016 2017]
The data is updated to year 2018.


### Data Outline

In [6]:
print('There are {} rows and {} columns in the data frame.'.format(df_data.shape[0],df_data.shape[1]))

There are 642 rows and 30 columns in the data frame.


In [9]:
feilds = df_data.columns.values.tolist()
print('Fields list =', feilds)

Fields list = ['AreaName', 'Year', 'Stock_largeAni', 'Stock_cattle', 'Stock_horse', 'Stock_donkey', 'Stock_mule', 'Stock_camel', 'Out_hog', 'Stock_hog', 'Stock_gt_shp', 'Stock_goat', 'Stock_sheep', 'Out_cattle', 'Out_gt_shp', 'Out_poultry', 'MeatProd', 'MeatProd_hg_ct_gt_shp', 'MeatProd_hog', 'MeatProd_cattle', 'MeatProd_gt_shp', 'MilkProd', 'MilkProd_cattle', 'WoolProd_sheep', 'WoolProd_sheep_fine', 'WoolProd_sheep_semi', 'WoolProd_goat_thick', 'WoolProd_goat_cashmere', 'EggProd', 'HoneyProd']


In [7]:
areas = df_data['Area'].unique()
print('Areas list =',areas)

Areas list = ['China' 'Beijing' 'Tianjin' 'Hebei' 'Shanxi' 'Inner Mongolia' 'Liaoning'
 'Jilin' 'Heilongjiang' 'Shanghai' 'Jiangsu' 'Zhejiang' 'Anhui' 'Fujian'
 'Jiangxi' 'Shandong' 'Henan' 'Hubei' 'Hunan' 'Guangdong' 'Guangxi'
 'Hainan' 'Chongqing' 'Sichuan' 'Guizhou' 'Yunnan' 'Tibet' 'Shaanxi'
 'Gansu' 'Qinghai' 'Ningxia' 'Xinjiang']


## Dealing with Missing Values 
### Check Missing Values

Print all the columns with missing values (length of which shorter than data frame's length).

In [8]:
# mask missing data with 'True'

df_null = df_data.isnull()
# print(df_null.head())

# Find the columns where missing values exist
null_counts = {}
for column in df_data.columns.tolist():    
    null_count = df_null.groupby([column]).size()[False]
    if null_count < df_data.shape[0]: null_counts[column] = null_count
null_counts

{'Stock_horse': 561,
 'Stock_donkey': 529,
 'Stock_mule': 529,
 'Stock_camel': 158,
 'Stock_sheep': 544,
 'Out_cattle': 558,
 'Out_gt_shp': 558,
 'Out_poultry': 558,
 'MeatProd_cattle': 633,
 'WoolProd_sheep': 540,
 'WoolProd_sheep_fine': 484,
 'WoolProd_sheep_semi': 504,
 'WoolProd_goat_thick': 559,
 'WoolProd_goat_cashmere': 426,
 'HoneyProd': 601}

### Missing Value Explanation and Cleaning
**1. Except 'Out_cattle', 'Out_gt_shp', 'Out_poultry', the missing values in other columns because they are too small, replace them with zeros.**

In [9]:
columns = ['Stock_horse','Stock_donkey','Stock_mule','Stock_camel','Stock_sheep','MeatProd_cattle',\
'WoolProd_sheep','WoolProd_sheep_fine','WoolProd_sheep_semi','WoolProd_goat_thick','WoolProd_goat_cashmere',\
'HoneyProd']

df_data[columns] = df_data[columns].fillna(0)
df_data[columns].describe(include='all')

Unnamed: 0,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Stock_sheep,MeatProd_cattle,WoolProd_sheep,WoolProd_sheep_fine,WoolProd_sheep_semi,WoolProd_goat_thick,WoolProd_goat_cashmere,HoneyProd
count,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0,642.0
mean,44.209491,44.440794,19.939555,1.876654,960.496308,39.552913,23574.031009,8111.313219,7217.599774,2410.425487,994.260982,2.324778
std,123.495124,128.605836,60.566196,5.960201,2700.268734,104.787724,66989.492238,23963.64205,20118.954097,6589.1018,2916.48464,6.725113
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.224125,0.1925,0.160025,0.0,1.1075,5.30065,14.1325,0.0,3.66775,10.0,0.0,0.2
50%,4.48275,6.965,2.405,0.0,114.147458,13.07,3316.0,336.065,1148.045,405.5,34.7,0.638234
75%,40.23,32.25,13.425,0.0,612.221085,33.137062,14979.3475,3565.5,5200.75,1708.75,769.5514,1.27311
max,898.1,955.8,480.6,35.0,17088.22986,675.871024,413133.7695,132876.7011,137972.7359,40505.29747,18844.21205,55.528363


**2. Between 1999 and the newest year, the national total of 'Out_cattle', 'Out_gt_shp', 'Out_poultry' can be caculated by adding up the provices data.**

In [11]:
import numpy as np

years = range(1999, year_newest)
areas = ['Beijing','Tianjin','Hebei','Shanxi','Inner Mongolia',\
          'Liaoning','Jilin','Heilongjiang',\
          'Shanghai','Jiangsu','Zhejiang','Anhui','Fujian','Jiangxi','Shandong',\
          'Henan','Hubei','Hunan','Guangdong','Guangxi','Hainan',\
          'Chongqing','Sichuan','Guizhou','Yunnan','Tibet',\
          'Shaanxi','Gansu','Qinghai','Ningxia','Xinjiang']

def cn_total(term):
    for year in years:
        vals = list()
        for area in areas:
            val = df_data[term][(df_data['Year']==year) & (df_data['Area']==area)].values.tolist()
            vals.extend(val)
        vals = np.array(vals)
        df_data[term][(df_data['Year']==year) & (df_data['Area']=='China')] = vals.sum()
    
cn_total('Out_cattle')
cn_total('Out_gt_shp')
cn_total('Out_poultry')

df_data[['Area', 'Year', 'Out_cattle','Out_gt_shp','Out_poultry']][(df_data['Area']=='China')]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Area,Year,Out_cattle,Out_gt_shp,Out_poultry
0,China,1996,,,
1,China,1997,,,
2,China,1998,,,
3,China,1999,3766.19,18820.35,743165.05
4,China,2000,3964.81,20472.69,809857.07
5,China,2001,4118.37,21722.45,808834.8
6,China,2002,4401.12,23280.78,832894.09
7,China,2003,4703.03,25958.28,888587.77
8,China,2004,5018.9,28342.98,907021.77
9,China,2005,5287.64,30804.54,986491.81


**3. Before 1999 or in the newest year, the missing values in 'Out_cattle', 'Out_gt_shp', 'Out_poultry' because of the lacking of statistics , just leave them as NaN.**

## Data Checking

In [15]:
print(df_data.describe(include='all'))

       AreaName         Year  Stock_largeAni  Stock_cattle  Stock_horse  \
count       642   642.000000      642.000000    642.000000   642.000000   
unique       32          NaN             NaN           NaN          NaN   
top       China          NaN             NaN           NaN          NaN   
freq         22          NaN             NaN           NaN          NaN   
mean        NaN  2007.465732      848.011590    737.702808    44.209491   
std         NaN     5.794417     2262.837114   1961.245434   123.495124   
min         NaN  1996.000000        1.230000      1.230000     0.000000   
25%         NaN  2002.000000      123.272883    111.035000     0.224125   
50%         NaN  2007.000000      454.940000    383.810000     4.482750   
75%         NaN  2012.000000      627.283775    541.169372    40.230000   
max         NaN  2017.000000    15737.761830  13781.822830   898.100000   

        Stock_donkey  Stock_mule  Stock_camel       Out_hog     Stock_hog  \
count     642.000000  

## Save Clean Data

This data is clean, save it to a new file for data visualization.

In [12]:
save_path = './data/' + item + '_clean.csv'
df_data.to_csv(save_path, index = False)