In [1]:
import pandas as pd

In [2]:
# Load the unified table and keep only the level-1 rows (the rendered output
# below shows these to be province-level entries).
coded = pd.read_csv('../data/unified.csv')
coded = coded[coded.level == 1]
# DataFrame.sort() no longer exists in current pandas; sort_values() is the
# replacement. Assigning the result (rather than sorting in place) also avoids
# mutating a filtered slice of the original frame.
coded = coded.sort_values('name_en')  # Sort so easier to read
# Drop columns that aren't useful for us
coded = coded.drop(['name_pinyin', 'level', 'latitude', 'longitude'], axis=1)
# Drop rows with any missing value (original intent: rows without an alpha)
coded = coded.dropna()
print(len(coded))
coded.head(40)

31


Unnamed: 0,code,name_zh,name_en,alpha
1120,340000,安徽省,Anhui,AH
0,110000,北京市,Beijing,BJ
2464,500000,重庆市,Chongqing,CQ
1258,350000,福建省,Fujian,FJ
3196,620000,甘肃省,Gansu,GS
2133,440000,广东省,Guangdong,GD
2295,450000,广西壮族自治区,Guangxi,GX
2728,520000,贵州省,Guizhou,GZ
2434,460000,海南省,Hainan,HI
38,130000,河北省,Hebei,HE


In [3]:
# Load the province outlines stored under key 'df' in the HDF file.
map_data = pd.read_hdf('../data/china_provinces_from_admin_1_shp.hdf', key='df')
# DataFrame.sort() no longer exists in current pandas; sort_values() is the
# replacement and returns a new, sorted frame.
map_data = map_data.sort_values('postal')
print(len(map_data))
map_data.head(40)

31


Unnamed: 0_level_0,postal,woe_name,xs,ys
FID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
633,AH,Anhui,"(116.8389978365808, 116.84773115386969, 116.90...","(34.38921906191021, 34.389761664269344, 34.400..."
631,BJ,Beijing,"(117.07273034133209, 117.0876131526249, 117.09...","(40.673538316340256, 40.675011094686226, 40.67..."
630,CQ,Chongqing,"(109.3359932795662, 109.57308475126024, 109.59...","(31.705248521402126, 31.730104885232265, 31.73..."
632,FJ,Fujian,"[117.43482506600012, 117.44778172300019, 117.4...","[23.750718492000075, 23.742251607000114, 23.75..."
634,GD,Guangdong,"[110.44263756600006, 110.44752037900005, 110.4...","[20.663519598, 20.658351955000015, 20.65477122..."
626,GS,Gansu,"(98.24581749937812, 98.51520674009896, 98.5715...","(40.52452932456873, 40.533546860898895, 40.540..."
628,GX,Guangxi,"[109.10523522200018, 109.10092207100016, 109.0...","[21.02448151200018, 21.021185614000146, 21.022..."
629,GZ,Guizhou,"(107.69257937973458, 107.74952680881512, 107.7...","(29.142691759517106, 29.150133164713765, 29.15..."
637,HA,Hainan,"(111.01050866000003, 111.01636803500008, 111.0...","(19.683783270000077, 19.678045966000127, 19.64..."
645,HB,Hebei,"[118.64873628100005, 118.59432056600005, 118.5...","[39.044651167, 38.972197399, 38.91944503100002..."


In [4]:
# Rename two regions in map_data, located by their 'postal' code, by
# overwriting 'woe_name'. Presumably the new names are the spellings used in
# unified.csv's 'name_en', which the later merge joins on -- verify if the
# source data changes.
for postal_code, english_name in (('XZ', 'Tibet'), ('NM', 'Inner Mongolia')):
    # index[0] picks the first row carrying this postal code and raises
    # IndexError when no such row exists.
    first_match = map_data[map_data.postal == postal_code].index[0]
    map_data.loc[first_match, 'woe_name'] = english_name

In [5]:
#map_data.head(40)

In [6]:
# Join the coded table to the map outlines on the English name (default inner
# join), then discard the map-side name and postal columns that are no longer
# needed once the rows are matched.
province_map_data = pd.merge(
    coded,
    map_data,
    left_on='name_en',
    right_on='woe_name',
).drop(['woe_name', 'postal'], axis=1)
print(len(province_map_data))
province_map_data.head(40)

31


Unnamed: 0,code,name_zh,name_en,alpha,xs,ys
0,340000,安徽省,Anhui,AH,"(116.8389978365808, 116.84773115386969, 116.90...","(34.38921906191021, 34.389761664269344, 34.400..."
1,110000,北京市,Beijing,BJ,"(117.07273034133209, 117.0876131526249, 117.09...","(40.673538316340256, 40.675011094686226, 40.67..."
2,500000,重庆市,Chongqing,CQ,"(109.3359932795662, 109.57308475126024, 109.59...","(31.705248521402126, 31.730104885232265, 31.73..."
3,350000,福建省,Fujian,FJ,"[117.43482506600012, 117.44778172300019, 117.4...","[23.750718492000075, 23.742251607000114, 23.75..."
4,620000,甘肃省,Gansu,GS,"(98.24581749937812, 98.51520674009896, 98.5715...","(40.52452932456873, 40.533546860898895, 40.540..."
5,440000,广东省,Guangdong,GD,"[110.44263756600006, 110.44752037900005, 110.4...","[20.663519598, 20.658351955000015, 20.65477122..."
6,450000,广西壮族自治区,Guangxi,GX,"[109.10523522200018, 109.10092207100016, 109.0...","[21.02448151200018, 21.021185614000146, 21.022..."
7,520000,贵州省,Guizhou,GZ,"(107.69257937973458, 107.74952680881512, 107.7...","(29.142691759517106, 29.150133164713765, 29.15..."
8,460000,海南省,Hainan,HI,"(111.01050866000003, 111.01636803500008, 111.0...","(19.683783270000077, 19.678045966000127, 19.64..."
9,130000,河北省,Hebei,HE,"[118.64873628100005, 118.59432056600005, 118.5...","[39.044651167, 38.972197399, 38.91944503100002..."


In [7]:
# Save the merged table as JSON and as HDF (stored under key 'df').
province_map_data.to_json('../data/province_map_data.json')
# Pass `key` by keyword: newer pandas releases deprecate positional arguments
# to to_hdf other than the path, and the keyword form works on old versions too.
# NOTE: PyTables emits a performance warning here because the object-typed
# columns (name_zh, name_en, alpha, xs, ys) are pickled rather than mapped to
# native c-types.
province_map_data.to_hdf('../data/province_map_data.hdf', key='df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['name_zh', 'name_en', 'alpha', 'xs', 'ys']]

