In [1]:
class ReadData():
    """
    # Usage:

    # input:
    data = ReadData(file_name, url)

    # output:
    print('rawpath:', data.getRawPath())
    print('path:', data.getPath())
    df = data.createDF()   
    """

    def __init__(self, file_name, 
                 url = None,
                 org_name = 'good-food', 
                 repo_name = 'datamap2019', 
                 folder_path = '/data/', 
                 mode = 'wb'
                ):
        
        self.file_name = file_name
        self.org_name = org_name
        self.repo_name = repo_name
        self.folder_path = folder_path
        
        self.url = url
        self.mode = mode
        
        self.root_path = 'https://github.com/'
        self.raw_path = 'https://raw.githubusercontent.com/'
        self.branch = 'master'

    def getPath(self):
        if self.url == None:
            url = self.root_path + self.org_name + '/' + self.repo_name + '/blob/' + self.branch + self.folder_path + self.file_name
        else:
            url = self.url
        return url

    def getRawPath(self):
        if self.url == None:
            url = self.raw_path + self.org_name + '/' + self.repo_name + '/' + self.branch + self.folder_path + self.file_name
        else:
            url = self.url
        return url

    def curl(self):
        import urllib.request, urllib.parse, urllib.error
        
        url = self.getRawPath()
        response = urllib.request.urlopen(url).read()
        with open(self.file_name, self.mode) as file_handle:
            file_handle.write(response)

    def delFile(self):
        import os

        if os.path.exists(self.file_name):
            os.remove(self.file_name)
        else:
            print('no such file:%s'%self.file_name)

    def createDF(self):
        import pandas as pd

        self.curl()
        df = pd.read_csv(self.file_name)
        self.delFile()
        return df

In [2]:
item = 'product_livestock'

In [3]:
data_name = item + '_clean.csv'
print(data_name, 'is under dealing...')

data = ReadData(data_name)

print('rawpath of data:', data.getRawPath())
print('path of data:', data.getPath())
df_data = data.createDF()
df_data.head()

product_livestock_clean.csv is under dealing...
rawpath of data: https://raw.githubusercontent.com/good-food/datamap2019/master/data/product_livestock_clean.csv
path of data: https://github.com/good-food/datamap2019/blob/master/data/product_livestock_clean.csv


Unnamed: 0,AreaName,Year,Stock_largeAni,Stock_cattle,Stock_horse,Stock_donkey,Stock_mule,Stock_camel,Out_hog,Stock_hog,...,MeatYeild_gt_shp,MilkYeild,MilkYeild_cattle,WoolYeild_sheep,WoolYeild_sheep_fine,WoolYeild_sheep_semi,WoolYeild_goat_thick,WoolYeild_goat_cashmere,EggYeild,HoneyYeild
0,China,1996,13360.6,11031.8,871.5,944.4,478.0,34.9,41225.1,36283.6,...,181.0,735.8,629.4,298102.0,121020.0,74099.0,35255.0,9585.0,1965.2,18.4
1,China,1997,14541.8,12175.7,891.2,952.8,480.6,35.0,46483.7,40034.8,...,212.8,681.1,601.1,255059.0,116054.0,55683.0,25865.0,8626.0,1895.3,21.1
2,China,1998,14803.2,12441.9,898.1,955.8,473.9,33.5,50215.1,42256.3,...,234.6,745.4,662.9,277545.0,115752.0,68775.0,31417.0,9799.0,2021.3,20.7
3,China,1999,15024.75,12698.34,891.41,934.77,467.27,32.96,51977.2,43144.2,...,251.2643,806.9073,717.5939,283152.0,114103.0,73700.0,31849.0,10179.65,2134.667,22.9907
4,China,2000,15151.51,12866.34,876.58,922.73,453.03,32.62,52673.34,44681.54,...,273.958296,919.118908,827.43079,292502.0,117386.0,84921.0,33266.0,11057.0,2243.3,24.6


In [4]:
doc_name = item + '_doc.csv'
print(doc_name, 'is under dealing...')

doc = ReadData(doc_name)

print('rawpath of documentation:', doc.getRawPath())
print('path of documentation:', doc.getPath())
df_doc = doc.createDF()
df_doc

product_livestock_doc.csv is under dealing...
rawpath of documentation: https://raw.githubusercontent.com/good-food/datamap2019/master/data/product_livestock_doc.csv
path of documentation: https://github.com/good-food/datamap2019/blob/master/data/product_livestock_doc.csv


Unnamed: 0,Feild name,中文解释,Explanation,单位: Unit
0,AreaName,地区 (省级),Area (provincial),Nan
1,Year,年份,Year,Nan
2,Stock_largeAni,大动物存栏量 (牛&马&驴&骡&骆驼),Stock of cattle&hores&donkey&mule&camel,万只: 10000 #
3,Stock_cattle,牛存栏量,Stock of cattle,万只: 10000 #
4,Stock_horse,马存栏量,Stock of horse,万只: 10000 #
5,Stock_donkey,驴存栏量,Stock of donkey,万只: 10000 #
6,Stock_mule,骡存栏量,Stock of mule,万只: 10000 #
7,Stock_camel,骆驼存栏量,Stock of camel,万只: 10000 #
8,Out_hog,猪出栏量,Output of hog,万只: 10000 #
9,Stock_hog,猪存栏量,Stock of hog,万只: 10000 #


In [5]:
import pandas as pd
import numpy as np

## Columns what we are interested

In [56]:
col_stock = ['Stock_cattle','Stock_hog', 'Stock_gt_shp']
col_out = ['Out_cattle', 'Out_hog','Out_gt_shp', 'Out_poultry']
col_yeild = ['MeatYeild_cattle', 'MeatYeild_hog', 'MeatYeild_gt_shp', 'MilkYeild_cattle', 'EggYeild']

Areas = df_data['AreaName'].unique().tolist()

def dfCols(df, col_interested):
    cols = ['AreaName', 'Year']
    cols.extend(col_interested)
    df = df[cols]
    df.dropna(inplace=True)
    print('There are {} rows and {} columns.'.format(df.shape[0],df.shape[1]))
    print('Grouped by areas and count record in each area:', df.groupby(['AreaName']).count())
    
    print('Show the years in each area if the year has a record:')
    for area in Areas:
        years = df[df['AreaName'] == area]['Year'].unique().tolist()
        print('Statistic years in {} ='.format(area), years)
    return df

In [57]:
df_stock = dfCols(df_data, col_stock)

There are 642 rows and 5 columns.
Grouped by areas and count record in each area:                 Year  Stock_cattle  Stock_hog  Stock_gt_shp
AreaName                                                   
Anhui             20            20         20            20
Beijing           20            20         20            20
China             22            22         22            22
Chongqing         20            20         20            20
Fujian            20            20         20            20
Gansu             20            20         20            20
Guangdong         20            20         20            20
Guangxi           20            20         20            20
Guizhou           20            20         20            20
Hainan            20            20         20            20
Hebei             20            20         20            20
Heilongjiang      20            20         20            20
Henan             20            20         20            20
Hubei             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [58]:
df_out = dfCols(df_data, col_out)

There are 576 rows and 6 columns.
Grouped by areas and count record in each area:                 Year  Out_cattle  Out_hog  Out_gt_shp  Out_poultry
AreaName                                                          
Anhui             18          18       18          18           18
Beijing           18          18       18          18           18
China             18          18       18          18           18
Chongqing         18          18       18          18           18
Fujian            18          18       18          18           18
Gansu             18          18       18          18           18
Guangdong         18          18       18          18           18
Guangxi           18          18       18          18           18
Guizhou           18          18       18          18           18
Hainan            18          18       18          18           18
Hebei             18          18       18          18           18
Heilongjiang      18          18       18      

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [59]:
df_yeild = dfCols(df_data, col_yeild)

There are 642 rows and 7 columns.
Grouped by areas and count record in each area:                 Year  MeatYeild_cattle  MeatYeild_hog  MeatYeild_gt_shp  \
AreaName                                                                  
Anhui             20                20             20                20   
Beijing           20                20             20                20   
China             22                22             22                22   
Chongqing         20                20             20                20   
Fujian            20                20             20                20   
Gansu             20                20             20                20   
Guangdong         20                20             20                20   
Guangxi           20                20             20                20   
Guizhou           20                20             20                20   
Hainan            20                20             20                20   
Hebei             

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


## Data Exploration

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline