# Data Cleaning - Aquatic Production

原始数据来源与维护见 [README.md](https://github.com/good-food/datamap2019/blob/master/README.md)

In [1]:
import pandas as pd
import numpy as np

In [2]:
class ReadData():
    """
    Download a CSV file and load it into a pandas DataFrame.

    When `url` is None the file is fetched from the raw-content URL built
    from `org_name`, `repo_name`, the branch, `folder_path` and `file_name`;
    otherwise the given `url` is downloaded as-is.

    # Usage:
    # df = ReadData(file_name, url).createDF()
    """
    def __init__(self, file_name,
                 url = None,
                 org_name = 'good-food',
                 repo_name = 'datamap2019',
                 folder_path = '/data/',
                 mode = 'wb'
                ):
        """
        Parameters:
            file_name:   name of the remote file; also used as the name of the
                         temporary local copy.
            url:         explicit download URL; when None the URL is built from
                         the repository settings below.
            org_name:    organisation segment of the repository URL.
            repo_name:   repository segment of the repository URL.
            folder_path: folder inside the repository, with leading and
                         trailing slashes.
            mode:        mode used to open the local copy for writing.
        """
        self.file_name = file_name
        self.org_name = org_name
        self.repo_name = repo_name
        self.folder_path = folder_path

        self.url = url
        self.mode = mode

        self.root_path = 'https://github.com/'
        self.raw_path = 'https://raw.githubusercontent.com/'
        self.branch = 'master'

    def getPath(self):
        """
        Print the browsable and raw locations of the file and return the URL
        to download from.

        Returns:
            `self.url` when one was supplied, otherwise the raw-content URL
            built from the repository settings.
        """
        if self.url is not None:
            # An explicit URL is both the page and the download location.
            # (Previously this branch fell through to `return raw_path`
            # with `raw_path` unassigned and raised UnboundLocalError.)
            print('Url: ', self.url)
            print('Raw data:', self.url)
            return self.url

        repo = self.org_name + '/' + self.repo_name
        path = self.root_path + repo + '/blob/' + self.branch + self.folder_path + self.file_name
        raw_path = self.raw_path + repo + '/' + self.branch + self.folder_path + self.file_name
        print('Url: ', path)
        print('Raw data:', raw_path)
        return raw_path

    def curl(self):
        """Download the file at `getPath()` and write it to `self.file_name`."""
        import urllib.request

        url = self.getPath()
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        with open(self.file_name, self.mode) as file_handle:
            file_handle.write(payload)

    def delFile(self):
        """Remove the local copy, or report that it does not exist."""
        import os
        if os.path.exists(self.file_name):
            os.remove(self.file_name)
        else:
            print('no such file:%s'%self.file_name)

    def createDF(self):
        """
        Download the file, parse it as CSV and delete the local copy.

        Returns:
            pandas.DataFrame with the parsed file contents.
        """
        import pandas as pd
        print(self.file_name, 'is under dealing... needs 1 min')
        # The download is synchronous and the file is closed when curl()
        # returns, so no waiting is needed before reading it.
        self.curl()
        try:
            df = pd.read_csv(self.file_name)
        finally:
            # Always clean up the temporary copy, even if parsing fails.
            self.delFile()
        return df

In [3]:
# Dataset key: the common prefix of every file name this notebook touches.
item = 'product_aquatic'

# File names derived from the key: the raw data table and its field documentation.
data_name = ''.join([item, '_raw.csv'])
doc_name = ''.join([item, '_doc.csv'])

## Read Data from GitHub Repository
### Read Data and Explain Data Index

If you haven't cloned the whole repository from [Github](https://github.com/good-food/datamap2019), please uncomment and run the following 2 cells; otherwise, skip these 2 cells.

In [4]:
# df_data = ReadData(data_name).createDF()
# df_data.head()

In [5]:
# df_doc = ReadData(doc_name).createDF()
# df_doc

If you have cloned the whole repository from [Github](https://github.com/good-food/datamap2019), please run the following cells directly; otherwise, go back to the previous 2 cells or clone the whole repository to your local machine.

In [6]:
# Read the raw data table from the local ./data/ folder and preview its first rows.
data_path = './data/'+data_name
df_data = pd.read_csv(data_path)
df_data.head()

Unnamed: 0,AreaName,Year,Aqu_sum,AquSW,AquSW_grow,AquSW_cult,AquSW_fish,AquSW_crust,AquSW_shell,AquSW_algae,AquSW_other,AquFW,AquFW_grow,AquFW_cult,AquFW_fish,AuFW_crust,AquFW_shell,AquFW_other
0,CNADC,1998,27.41,27.41,27.41,0.0,25.11,0.2,0.1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CNADC,1999,18.3,18.3,18.3,0.0,9.51,0.32,8.47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CNADC,2000,17.5823,17.5823,17.5823,0.0,7.7602,0.2194,9.6027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CNADC,2001,18.2977,18.2977,18.2977,0.0,8.9539,0.225,9.1188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CNADC,2002,18.2177,18.2177,18.2177,,9.6,0.1,,,8.4604,,,,,,,


In [7]:
# Read the field documentation table from the local ./data/ folder and show it in full.
doc_path = './data/'+doc_name
df_doc = pd.read_csv(doc_path)
df_doc

Unnamed: 0,Field name,中文解释,Explanation,单位: Unit
0,AreaName,地区 (省级),Area (provincial),Nan
1,Year,年份,Year,Nan
2,Aqu_sum,水产总产量,Aquatic total yield (sea&fresh water),万吨: 10000 t
3,AquSW,海产品产量,Aquatic yield from sea water (fish&crust&shell...,万吨: 10000 t
4,AquSW_grow,海产品捕捞产量,Aquatic yield by catching from sea water,万吨: 10000 t
5,AquSW_cult,海产品养殖产量,Aquatic yield by aquaculture in sea water,万吨: 10000 t
6,AquSW_fish,海洋鱼类产量,Fish yield from sea water,万吨: 10000 t
7,AquSW_crust,海洋虾蟹产量,Crab&shrimp yield from sea water,万吨: 10000 t
8,AquSW_shell,海洋贝类产量,Shellfish yield from sea water,万吨: 10000 t
9,AquSW_algae,海洋藻类产量,Algae yield from sea water,万吨: 10000 t


### Checking data updates

In [10]:
import datetime

# Distinct values of the 'Year' column, sorted ascending in place.
years = df_data['Year'].unique()
years.sort()
print('Year list =',years)

# The data counts as current when its newest 'Year' value is exactly
# two less than the current calendar year.
year_now = datetime.date.today().year
year_newest = years.max()
if year_now - year_newest != 2:
    print('The newest data of National Bureau of Statistics is updated to year', year_now-1)
    print('While our data is updated to year', year_newest+1)
    print('Pleas check the 1st line of this notebook to update data.')
    print('But you can still play with the old data.')
else:
    print('The data is updated to year {}.'.format(year_now-1))

Year list = [1978 1980 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
 2011 2012 2013 2014 2015 2016 2017]
The data is updated to year 2018.


### Data Outline

In [30]:
# Overall size of the data frame.
n_rows, n_cols = df_data.shape
print('There are {} rows and {} columns in the data frame.\n'.format(n_rows, n_cols))

# Column names as a plain Python list.
feilds = df_data.columns.tolist()
print('Fields list =', feilds, '\n')

# Distinct values of the 'AreaName' column, in order of first appearance.
areas = df_data['AreaName'].unique()
print('Areas list =',areas, '\n')

There are 675 rows and 18 columns in the data frame.

Fields list = ['AreaName', 'Year', 'Aqu_sum', 'AquSW', 'AquSW_grow', 'AquSW_cult', 'AquSW_fish', 'AquSW_crust', 'AquSW_shell', 'AquSW_algae', 'AquSW_other', 'AquFW', 'AquFW_grow', 'AquFW_cult', 'AquFW_fish', 'AuFW_crust', 'AquFW_shell', 'AquFW_other'] 

Areas list = ['CNADC' 'China' 'Beijing' 'Tianjin' 'Hebei' 'Shanxi' 'InnerMongolia'
 'Liaoning' 'Jilin' 'Heilongjiang' 'Shanghai' 'Jiangsu' 'Zhejiang' 'Anhui'
 'Fujian' 'Jiangxi' 'Shandong' 'Henan' 'Hubei' 'Hunan' 'Guangdong'
 'Guangxi' 'Hainan' 'Chongqing' 'Sichuan' 'Guizhou' 'Yunnan' 'Tibet'
 'Shaanxi' 'Gansu' 'Qinghai' 'Ningxia' 'Xinjiang'] 



## Dealing with Missing Values 
### Check Missing Values

Print all the columns with missing values (columns whose count of non-missing values is smaller than the number of rows in the data frame).

In [21]:
# mask missing data with 'True'

df_null = df_data.isnull()
# print(df_null.head())

# Find the columns where missing values exist.
# NOTE: despite its name, `null_counts` maps each such column to its number
# of NON-missing values (a value below the row count means something is missing).
# DataFrame.count() is used instead of a per-column groupby on the null mask:
# the groupby lookup `size()[False]` raises KeyError when a column is entirely
# missing, whereas count() simply reports 0 for it.
n_rows = df_data.shape[0]
null_counts = {
    column: int(non_null)
    for column, non_null in df_data.count().items()
    if non_null < n_rows
}
null_counts

{'Aqu_sum': 668,
 'AquSW': 357,
 'AquSW_grow': 372,
 'AquSW_cult': 338,
 'AquSW_fish': 370,
 'AquSW_crust': 357,
 'AquSW_shell': 347,
 'AquSW_algae': 298,
 'AquSW_other': 356,
 'AquFW': 656,
 'AquFW_grow': 610,
 'AquFW_cult': 650,
 'AquFW_fish': 648,
 'AuFW_crust': 581,
 'AquFW_shell': 472,
 'AquFW_other': 530}

### Missing Value Explanation and Cleaning
**All the missing values in the data frame are missing because the amounts are too small, so replace them with zeros.**

In [23]:
# Names of the columns recorded in `null_counts`.
columns = list(null_counts)

# Replace every missing value in those columns with 0 and write the result
# back into the same columns, then summarise the filled columns.
filled = df_data[columns].fillna(0)
df_data[columns] = filled
df_data[columns].describe(include='all')

Unnamed: 0,Aqu_sum,AquSW,AquSW_grow,AquSW_cult,AquSW_fish,AquSW_crust,AquSW_shell,AquSW_algae,AquSW_other,AquFW,AquFW_grow,AquFW_cult,AquFW_fish,AuFW_crust,AquFW_shell,AquFW_other
count,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0,675.0
mean,346.973871,189.088358,98.12797,90.958528,72.827924,21.910302,75.494327,10.478144,7.435813,157.933323,15.785764,142.271453,139.827072,12.27901,3.344386,2.688333
std,934.820322,519.508999,265.099175,264.83042,196.620161,59.74567,216.298751,31.2027,22.380124,431.659325,41.618231,392.304726,379.457989,38.752682,9.23745,8.319867
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.73635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3476,0.60825,7.5022,7.9964,0.0593,0.0,0.0
50%,48.0863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.1472,3.0597,31.4,34.7962,0.73,0.1072,0.23
75%,310.7501,134.49475,66.24835,48.82885,42.6841,17.001,47.1148,1.41955,4.9943,127.16265,11.2314,113.62415,120.21365,6.6733,3.199,1.69575
max,6881.509,3470.4038,1533.9811,2000.6973,1255.335,396.0946,1481.4194,224.7814,176.0986,3430.8484,258.7307,3199.0055,2986.6531,335.8549,56.8961,75.1913


## Data Checking

In [24]:
# Summary statistics for every column (include='all' also covers the
# non-numeric AreaName column). Left as the cell's last expression so the
# notebook renders it as a single rich table instead of wrapped plain text.
df_data.describe(include='all')

       AreaName         Year      Aqu_sum        AquSW   AquSW_grow  \
count       675   675.000000   675.000000   675.000000   675.000000   
unique       33          NaN          NaN          NaN          NaN   
top       China          NaN          NaN          NaN          NaN   
freq         35          NaN          NaN          NaN          NaN   
mean        NaN  2007.097778   346.973871   189.088358    98.127970   
std         NaN     6.350722   934.820322   519.508999   265.099175   
min         NaN  1978.000000     0.000000     0.000000     0.000000   
25%         NaN  2002.000000    10.736350     0.000000     0.000000   
50%         NaN  2007.000000    48.086300     0.000000     0.000000   
75%         NaN  2012.000000   310.750100   134.494750    66.248350   
max         NaN  2017.000000  6881.509000  3470.403800  1533.981100   

         AquSW_cult   AquSW_fish  AquSW_crust  AquSW_shell  AquSW_algae  \
count    675.000000   675.000000   675.000000   675.000000   675.000000 

## Save Clean Data

The data is now clean; save it to a new file for data visualization.

In [25]:
# Write the cleaned frame to ./data/<item>_clean.csv, leaving out the row index.
save_path = ''.join(['./data/', item, '_clean.csv'])
df_data.to_csv(save_path, index=False)