In [1]:
# -*- coding: utf-8 -*-
import ast
import datetime
import json
import urllib
import urllib.request
from calendar import monthrange

import numpy as np
import pandas as pd
from tqdm import tqdm

# Setup: load the city/province code table and build the list of dates to download.
code_data = pd.read_csv("code_data.csv") #data associating cities and regions names to their respective numeric codes

today = datetime.date.today() #read the clock once so month and year always come from the same day
month = today.month #current month
year = today.year #current year
numdays = monthrange(year, month)[1] #number of days in current month
#NOTE(review): the lists below span the whole current month, including days that
#have not happened yet -- confirm whether they should stop at today's date
dates_sep = [str(datetime.date(year, month, day)) for day in range(1, numdays+1)] #dates as hyphen separated strings for naming
dates = [date.replace("-", "") for date in dates_sep] #dates as solid strings (YYYYMMDD) used in the request URL

code_data

Unnamed: 0,City_CH,Prov_CH,Prov_EN,City_Code,City_EN,Prov_Code
0,北京市,北京市,Beijing,110000,Beijing,110000.0
1,天津市,天津市,Tianjin,120000,Tianjin,120000.0
2,石家庄市,河北省,Hebei,130100,Shijiazhuang,130000.0
3,唐山市,河北省,Hebei,130200,Tangshan,130000.0
4,秦皇岛市,河北省,Hebei,130300,Qinhuangdao,130000.0
...,...,...,...,...,...,...
364,可克达拉市,新疆,Xinjiang,659008,Kelada,650000.0
365,昆玉市,新疆,Xinjiang,659009,Kunyu,650000.0
366,台湾省,台湾省,Taiwan,710000,Taiwan,710000.0
367,香港特别行政区,特别行政区,Hongkong,810000,Hongkong,810000.0


In [None]:
### Extract move-in data from Baidu qianxi ###
# For every date in `dates` and every city code in `code_data`, download the
# move-in ranking of that city. The queried city is the DESTINATION of the flow;
# each row of the response is an ORIGIN city with its share ("proportion").
# Produces:
#   moveins    - list with one DataFrame per date that returned data
#   data       - all dates concatenated, also written to a CSV file
#   errors     - DataFrame with one row per (city code, date) that failed
error_code = [] #city codes whose download or parsing failed (missing data)
error_date = [] #date associated to each failed city code
moveins = [] #list for move-in data

#index the code table by Chinese city name once instead of filtering it for every
#row and column; the first occurrence wins, like `.values[0]` on a boolean filter
city_lookup = code_data.drop_duplicates("City_CH").set_index("City_CH")

for date, date_sep in tqdm(list(zip(dates, dates_sep))): #loop over dates
    by_cities = [] #list for data from cities at a given date
    for code in tqdm(code_data.City_Code): #loop over destination cities
        #qianxi website by code and date
        url = "http://huiyan.baidu.com/migration/cityrank.jsonp?dt=province&id="+str(code)+"&type=move_in&date="+date
        try:
            #the context manager closes the connection even if reading fails
            with urllib.request.urlopen(url, timeout=20) as response: #generous timeout to avoid connection errors
                jsonp = response.read().decode("UTF-8")
            #the body is JSONP, i.e. cb({...}); keep the JSON object between the outer
            #parentheses and parse it as JSON (also works for one-record lists and null values)
            payload = json.loads(jsonp[jsonp.index("(")+1:jsonp.rindex(")")])
            records = payload["data"]["list"]
            if not records:
                raise ValueError("no move-in data for city "+str(code)+" on "+date_sep)
            flows = pd.DataFrame(records)
            flows.columns = ["City_CH", "Prov_CH", "proportion"]
            #rows of the response are origin cities; raises KeyError if a name is not in code_data
            origin = city_lookup.loc[flows["City_CH"]]
            #the queried city is the destination and is identical for every row
            destination = code_data[code_data.City_Code==code].iloc[0]
            flows["City_EN_origin"] = origin["City_EN"].values
            flows["Prov_EN_origin"] = origin["Prov_EN"].values
            flows["City_EN_destination"] = destination["City_EN"]
            flows["Prov_EN_destination"] = destination["Prov_EN"]
            flows["City_CH_origin"] = flows["City_CH"].values
            flows["Prov_CH_origin"] = origin["Prov_CH"].values
            #the Chinese destination names come from the queried city, consistent with the
            #English columns (they used to be copied from the origin rows of the response)
            flows["City_CH_destination"] = destination["City_CH"]
            flows["Prov_CH_destination"] = destination["Prov_CH"]
            flows["City_code_origin"] = origin["City_Code"].values
            flows["Prov_code_origin"] = origin["Prov_Code"].values
            flows["City_code_destination"] = destination["City_Code"]
            flows["Prov_code_destination"] = destination["Prov_Code"]
            by_cities.append(flows)
        except Exception: #unlike a bare except, this still lets Ctrl-C / kernel interrupt stop the download
            error_code.append(code)
            error_date.append(date_sep)
    if not by_cities: #no city returned data for this date; pd.concat([]) would raise
        continue
    daily = pd.concat(by_cities) #data by city at a given date
    daily['date'] = date_sep #add dates to dataframe
    moveins.append(daily) #data list

#every failure is listed exactly once (the lists are no longer re-appended per date)
errors = pd.DataFrame({"error_code":error_code, "error_date":error_date})
#optional: errors.to_csv("missing_movein.csv") to keep a record of the missing data

if moveins:
    data = pd.concat(moveins)
    #year and month are integers and must be converted before concatenation;
    #the month is zero-padded so the file name reads YYYYMM like the dates in the URL
    data.to_csv("baidu_qianxi_data_movein"+str(year)+str(month).zfill(2)+".csv", index=False, encoding='utf_8_sig')
else:
    print("No move-in data could be retrieved; no file was written.")


  0%|                                                                                           | 0/28 [00:00<?, ?it/s]
  0%|                                                                                          | 0/369 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/369 [00:01<06:13,  1.02s/it][A
  1%|▍                                                                                 | 2/369 [00:02<06:17,  1.03s/it][A
  1%|▋                                                                                 | 3/369 [00:02<06:01,  1.01it/s][A
  1%|▉                                                                                 | 4/369 [00:03<06:02,  1.01it/s][A
  1%|█                                                                                 | 5/369 [00:05<07:25,  1.22s/it][A
  2%|█▎                                                                                | 6/369 [00:06<06:57,  1.15s/it][A
  2%|█▌            

In [None]:
### Extract move-out data from Baidu qianxi ###
# For every date in `dates` and every city code in `code_data`, download the
# move-out ranking of that city. The queried city is the ORIGIN of the flow;
# each row of the response is a DESTINATION city with its share ("proportion").
# Produces:
#   moveouts   - list with one DataFrame per date that returned data
#   data       - all dates concatenated, also written to a CSV file
#   errors     - DataFrame with one row per (city code, date) that failed
error_code = [] #city codes whose download or parsing failed (missing data)
error_date = [] #date associated to each failed city code
moveouts = [] #list for move-out data

#index the code table by Chinese city name once instead of filtering it for every
#row and column; the first occurrence wins, like `.values[0]` on a boolean filter
city_lookup = code_data.drop_duplicates("City_CH").set_index("City_CH")

for date, date_sep in tqdm(list(zip(dates, dates_sep))): #loop over dates
    by_cities = [] #list for data from cities at a given date
    for code in tqdm(code_data.City_Code): #loop over origin cities
        #qianxi website by code and date
        url = "http://huiyan.baidu.com/migration/cityrank.jsonp?dt=province&id="+str(code)+"&type=move_out&date="+date
        try:
            #the context manager closes the connection even if reading fails
            with urllib.request.urlopen(url, timeout=20) as response: #generous timeout to avoid connection errors
                jsonp = response.read().decode("UTF-8")
            #the body is JSONP, i.e. cb({...}); keep the JSON object between the outer
            #parentheses and parse it as JSON (also works for one-record lists and null values)
            payload = json.loads(jsonp[jsonp.index("(")+1:jsonp.rindex(")")])
            records = payload["data"]["list"]
            if not records:
                raise ValueError("no move-out data for city "+str(code)+" on "+date_sep)
            flows = pd.DataFrame(records)
            flows.columns = ["City_CH", "Prov_CH", "proportion"]
            #rows of the response are destination cities; raises KeyError if a name is not in code_data
            destination = city_lookup.loc[flows["City_CH"]]
            #the queried city is the origin and is identical for every row
            origin = code_data[code_data.City_Code==code].iloc[0]
            flows["City_EN_destination"] = destination["City_EN"].values
            flows["Prov_EN_destination"] = destination["Prov_EN"].values
            flows["City_EN_origin"] = origin["City_EN"]
            flows["Prov_EN_origin"] = origin["Prov_EN"]
            flows["City_CH_destination"] = flows["City_CH"].values
            flows["Prov_CH_destination"] = destination["Prov_CH"].values
            #the Chinese origin names and all numeric codes follow the same roles as the
            #English columns (they used to be carried over unswapped from the move-in cell)
            flows["City_CH_origin"] = origin["City_CH"]
            flows["Prov_CH_origin"] = origin["Prov_CH"]
            flows["City_code_origin"] = origin["City_Code"]
            flows["Prov_code_origin"] = origin["Prov_Code"]
            flows["City_code_destination"] = destination["City_Code"].values
            flows["Prov_code_destination"] = destination["Prov_Code"].values
            by_cities.append(flows)
        except Exception: #unlike a bare except, this still lets Ctrl-C / kernel interrupt stop the download
            error_code.append(code)
            error_date.append(date_sep)
    if not by_cities: #no city returned data for this date; pd.concat([]) would raise
        continue
    daily = pd.concat(by_cities) #data by city at a given date
    daily['date'] = date_sep #add dates to dataframe
    moveouts.append(daily) #data list

#every failure is listed exactly once (the lists are no longer re-appended per date)
errors = pd.DataFrame({"error_code":error_code, "error_date":error_date})
#optional: errors.to_csv("missing_moveout.csv") to keep a record of the missing data

if moveouts:
    data = pd.concat(moveouts)
    #year and month are integers and must be converted before concatenation;
    #the month is zero-padded so the file name reads YYYYMM like the dates in the URL
    data.to_csv("baidu_qianxi_data_moveout"+str(year)+str(month).zfill(2)+".csv", index=False, encoding='utf_8_sig')
else:
    print("No move-out data could be retrieved; no file was written.")

  0%|                                                                                           | 0/28 [00:00<?, ?it/s]
  0%|                                                                                          | 0/369 [00:00<?, ?it/s][A
  0%|▏                                                                                 | 1/369 [00:01<06:23,  1.04s/it][A
  1%|▍                                                                                 | 2/369 [00:02<06:17,  1.03s/it][A
  1%|▋                                                                                 | 3/369 [00:03<06:01,  1.01it/s][A
  1%|▉                                                                                 | 4/369 [00:03<05:55,  1.03it/s][A
  1%|█                                                                                 | 5/369 [00:04<05:52,  1.03it/s][A
  2%|█▎                                                                                | 6/369 [00:05<05:59,  1.01it/s][A
  2%|█▌            