In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("darkgrid")

In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [5]:
rates = [1.548,1.565,1.553,1.517,1.464,1.445,1.417,1.369,1.347,1.346,1.357,1.337,1.289,1.262,1.252,1.198,1.126,1.088,1.070,1.065,1.041,1.032,1.022,1.000,0.974,0.963,0.948,0.954,0.955,0.937,0.895,0.876,0.859,0.82,0.77]
years = list(range(1988, 2023))
rs = pd.Series(rates, index=years)
rs.index.name = 'year'
rs.name = 'rate'

In [6]:
X = rs.index.values
Y = rs.values
X = sm.add_constant(X)

model = sm.OLS(Y, X)
modelres = model.fit()

In [7]:
b, a = modelres.params

In [8]:
def get_inflation_rate(y):
    if y in rs.index:
        return rs.loc[y]
    else:
        return a * y + b

In [9]:
get_inflation_rate(2011)

1.0

In [10]:
forbes_person = pd.read_csv('../../ddf--entities--person.csv')

In [14]:
forbes_wealth = pd.read_csv('../../ddf--datapoints--worth--by--person--year.csv')

In [15]:
forbes_wealth

Unnamed: 0,person,year,worth
0,a_jayson_adair,2021,1000.0
1,a_jerrold_perenchio,2001,3000.0
2,a_jerrold_perenchio,2002,2600.0
3,a_jerrold_perenchio,2003,2300.0
4,a_jerrold_perenchio,2004,2700.0
...,...,...,...
61900,zygmunt_solorz_zak,2017,2500.0
61901,zygmunt_solorz_zak,2018,2800.0
61902,zygmunt_solorz_zak,2019,2700.0
61903,zygmunt_solorz_zak,2020,2400.0


In [16]:
from dataclasses import dataclass

In [17]:
# create a list of all hurun names

In [687]:
hurun_data = dict()
for i in range(2012, 2023):
    df = pd.read_csv(f'../source/hurun_{i}.csv')
    if i <= 2016:
        df.columns = ['rank', 'wealth', 'name', 'sex', 'birth', 'companies', 'industry', 'year']
        df['wealth'] = df['wealth'].map(lambda x: x.split(' ')[1]).astype(int) / 1000.0
    elif i <= 2018:
        df = df[['Ranking', 'NameEn', 'Wealth', 'Birthday', 'CNameEn', 'IndustryEn', 'year']]
        df.columns = ['rank', 'name', 'wealth', 'birth', 'companies', 'industry', 'year']
    else:
        # 2019-2022 list have some duplications. So we need to remove them...
        df = df[['hs_Character_Fullname_En', 'hs_Character_Permanent_En', 'hs_Character_Gender_Lang', 
                 'hs_Character_ID', 'hs_Rank_Global_Wealth_USD', 'hs_Rank_Global_Industry_En', 'hs_Character_Age', 
                 'hs_Rank_Global_ComName_En', 'year']].copy()
        df.columns = ['name', 'geo', 'sex', 'id', 'wealth', 'industry', 'age', 'companies', 'year']
        df = handle_dups(df)
        
    hurun_data[i] = df

In [52]:
df = hurun_data[2019].copy()

In [54]:
np.any(df['id'].duplicated())

False

In [688]:
df

Unnamed: 0,name,geo,sex,id,wealth,industry,age,companies,year
33,Ma Yun,China-Zhejiang-Hangzhou,Male,1,37.0,E-Commerce,58,Alibaba,2022
27,Ma Huateng,China-Guangdong-Shenzhen,Male,2,52.0,Gaming,51,Tencent,2022
414,Xu Jiayin,China-Guangdong-Guangzhou,Male,3,7.6,Developer,64,Evergrande,2022
34,He Xiangjian,China-Guangdong-Foshan,Male,4,36.0,Household Appliances,80,Midea,2022
240,Sun Piaoyang,China-Shanghai,Male,5,11.0,Pharmaceuticals,64,Hengrui、 Hansoh,2022
...,...,...,...,...,...,...,...,...,...
2062,Xu Chengchen,China-Nantong,Male,7958,1.9,Green Energy,34,Haili,2022
2268,Jiang Xueying,China-Hangzhou,Female,7959,1.7,Biotechnology,49,Assuro Tech,2022
3302,Reshma Shetty,USA-Boston,Female,7960,1.1,Biotechnology,未知,Ginkgo Bioworks Holdings,2022
3315,Niejuan,China-Changsha,Female,7961,1.1,Medical Devices,43,Cofoe Medical,2022


In [689]:
res = hurun_data[2012].copy()

for k, v in hurun_data.items():
    if k == 2012:
        continue
    print(k)
    res = pd.concat([res, v], ignore_index=True)

2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [673]:
res

Unnamed: 0,rank,wealth,name,sex,birth,companies,industry,year,geo,id,age
0,1.0,55.0,Carlos Slim Helu family,Male,1940,America Movil,Telecom,2012,,,
1,2.0,50.0,Bill Gates,Male,1956,Microsoft,Investments,2012,,,
2,3.0,48.0,Warren Buffett brothers,Male,1931,Berkshire Hathaway,Investments,2012,,,
3,4.0,42.0,Bernard Arnault,Male,1950,LVMH,Single Brand Retailing,2012,,,
4,5.0,37.0,Larry Ellison,Male,1945,Oracle,Technology,2012,,,
...,...,...,...,...,...,...,...,...,...,...,...
17967,,1.9,Xu Chengchen,Male,,Haili,Green Energy,2022,China-Nantong,7958.0,34
17968,,1.7,Jiang Xueying,Female,,Assuro Tech,Biotechnology,2022,China-Hangzhou,7959.0,49
17969,,1.1,Reshma Shetty,Female,,Ginkgo Bioworks Holdings,Biotechnology,2022,USA-Boston,7960.0,未知
17970,,1.1,Niejuan,Female,,Cofoe Medical,Medical Devices,2022,China-Changsha,7961.0,43


In [690]:
res = res.drop(columns=['rank', 'id'])

In [691]:
res = res[['name', 'sex', 'birth', 'age', 'geo', 'companies', 'industry', 'year', 'wealth']]

In [692]:
res = res.sort_values(by=['name', 'year'])

In [693]:
res.to_csv('hurun_all.csv', index=False)

In [135]:
res

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth
7698,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2019,1.2
10349,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2020,1.2
13367,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2021,1.2
16663,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2022,2.8
4814,Abdulla Ahmed Al Ghurair & Family,,1955,,,Mashreq Bank,Banking,2018,1.3
...,...,...,...,...,...,...,...,...,...
16038,Zygmunt Solorz-Zak,Male,,66,Poland-Warsaw,Cyfrowy Polsat,Media,2022,3.2
17692,li Yanggu,Male,,未知,China -Beijing,Pony Testing,Professional Services,2022,1.2
5106,Ángel Losada Moreno,,1956,,,Grupo Gigante,Retail,2018,1.1
7910,Ángel Losada Moreno,Male,,未知,Mexico-Mexico City,Grupo Gigante,Retail,2019,1.0


In [77]:
from ddf_utils.str import to_concept_id

In [667]:
def split_and_add_lastname(name, sep):
    n = list(map(str.strip, name.split(sep)))
    if ' ' not in n[0]:
        if ' ' not in n[-1]:
            print("unreconized names: ", name)
            return n
        else:
            lastname = n[-1].split(' ')[-1]
            for i in range(0, len(n)-1):
                n[i] = ' '.join([n[i], lastname])
    return ';'.join(map(to_concept_id, n)) 

def name_to_id(name):
    if not isinstance(name, str):
        return name
    n = name.lower().strip()
    if '& family' in n:
        n = to_concept_id(n.replace('& family', ''))
    elif '& famiy' in n:
        n = to_concept_id(n.replace('& famiy', ''))
    elif '&family' in n:
        n = to_concept_id(n.replace('&family', ''))
    elif ' brothers' in n:
        n = to_concept_id(n.replace(' brothers', ''))
#     elif 'family' in n:
#         n = to_concept_id(n.replace('family', ''))
    elif '&' in n:
        n = to_concept_id(n.replace('&', ' and '))
    elif '、' in n:
        n = to_concept_id(n.replace('、', ' and '))
    else:
        n = to_concept_id(n)
        
    return n

In [670]:
name_to_id('Eliodoro、Bernardo、Patricia Matte')

'eliodoro_and_bernardo_and_patricia_matte'

In [671]:
name_to_id('Shi Yonglei&family')

'shi_yonglei'

In [694]:
names = res[['name']].drop_duplicates()

In [695]:
names['name_id'] = names['name'].map(name_to_id)

In [696]:
mapping = names.dropna().set_index('name')['name_id'].to_dict()

In [697]:
res['name_id'] = res['name'].map(lambda x: mapping.get(x, None))

In [698]:
res.to_csv('hurun_all.csv', index=False)

In [102]:
# appending rows for duplicated ones

In [699]:
res = res.dropna(how='all', subset=['name_id'])

In [710]:
res_multiple = res[res['name_id'].str.contains('_and_')].copy()

In [711]:
res_multiple

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
1961,Alain Taravella & Jacques Nicolet,,1950,,,Altarea Cogedim,Real Estate,2017,1.5,alain_taravella_and_jacques_nicolet
4305,Alain Taravella & Jacques Nicolet,,1949,,,Altarea Cogedim,Real Estate,2018,1.8,alain_taravella_and_jacques_nicolet
1873,Alicia & Tannetta Fentener van Vlissingen,,/,,,Shv Holdings,Energy,2017,1.6,alicia_and_tannetta_fentener_van_vlissingen
2289,Anders & Soren Westermann,,1962,,,WIDEX,Medical Equipments,2017,1.2,anders_and_soren_westermann
4827,Anders & Soren Westermann,,1948,,,Widex,Medical Equipments,2018,1.3,anders_and_soren_westermann
...,...,...,...,...,...,...,...,...,...,...
4974,Zhu Min & Xu Yuqing,,1948、1949,,,Cybernaut,Investments,2018,1.3,zhu_min_and_xu_yuqing
2167,Zhu Yicai & Wu Xueqin,,1956,,,Yurun,Food & Beverages,2017,1.4,zhu_yicai_and_wu_xueqin
2739,Zuo Hongbo & Zhe Shuxia,,1965、1963,,,Aurora,Manufacturing,2017,1.0,zuo_hongbo_and_zhe_shuxia
5283,Zuo Hongbo & Zhe Shuxia,,1965、1963,,,Aurora,Photoelectric material,2018,1.1,zuo_hongbo_and_zhe_shuxia


In [712]:
# TODO: search and find recent name for these entries

In [713]:
res_multiple.age.unique()

array([nan, '未知', '65', '66', '67'], dtype=object)

In [589]:
res2 = []

for _, row in res_multiple.iterrows():
    sex = row['sex']
    birth = row['birth']
    # age = ['age']
    for i, nid in enumerate(row['name_id'].split(';')):
        row_new = row.copy()
        row_new['name_id'] = nid
        if sex and isinstance(sex, str):
            if '、' in sex:
                row_new['sex'] = sex.split('、')[i]
        if birth and isinstance(birth, str):
            if '、' in birth:
                row_new['birth'] = birth.split('、')[i]
        res2.append(row_new)

In [590]:
res_split = pd.DataFrame(res2).sort_values(by=['name_id', 'year'])

In [591]:
res_split

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
2255,Salman & Ahmed Rahman,,1964,,,Beximco,Pharmaceutical,2017,1.3,ahmed_rahman
1961,Alain Taravella & Jacques Nicolet,,1950,,,Altarea Cogedim,Real Estate,2017,1.5,alain_taravella
4305,Alain Taravella & Jacques Nicolet,,1949,,,Altarea Cogedim,Real Estate,2018,1.8,alain_taravella
1873,Alicia & Tannetta Fentener van Vlissingen,,/,,,Shv Holdings,Energy,2017,1.6,alicia_vlissingen
2075,Barbara Agnes Ferrari Bengolea & Amalia Amoedo,,1953,,,Loma Negra,Cement,2017,1.4,amalia_amoedo
...,...,...,...,...,...,...,...,...,...,...
2739,Zuo Hongbo & Zhe Shuxia,,1965,,,Aurora,Manufacturing,2017,1.0,zuo_hongbo
5283,Zuo Hongbo & Zhe Shuxia,,1965,,,Aurora,Photoelectric material,2018,1.1,zuo_hongbo
1812,Huang Lianxi & Zuo Xiaoping,,1973,,,L&S,Plastic pipes and fittings,2017,1.7,zuo_xiaoping
4334,Huang Lianxi & Zuo Xiaoping,,1973,,,L&S,Plastic pipes and fittings,2018,1.8,zuo_xiaoping


In [592]:
res_part1 = res[~res['name_id'].str.contains(';')].copy()

In [593]:
res2 = pd.concat([res_part1, res_split], ignore_index=True)

In [594]:
res2 = res2.sort_values(by=['name_id', 'name', 'year'])

In [595]:
res2.to_csv('hurun_all_split.csv', index=False)

In [702]:
# double checking
res2 = res.copy()
res2[~res2['name_id'].str.contains('_')]

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
6437,Andrew,Male,,未知,United States-California-Los Angeles,Panda Express,Hospitality,2019,5.0,andrew
9198,Andrew,Male,,未知,United States-California-Los Angeles,Panda Express,Hospitality,2020,5.9,andrew
12267,Andrew,Male,,未知,United States-California-Los Angeles,Panda Express,Restaurant,2021,6.9,andrew
15622,Andrew,Male,,未知,United States-California-Los Angeles,Panda Express,Restaurants,2022,6.5,andrew
14371,Dongxue,Female,,未知,China-Hefei,Laoxiangji,Restaurant,2021,1.2,dongxue
17560,Dongxue,Female,,未知,China-Hefei,Laoxiangji,Restaurants,2022,1.3,dongxue
17970,Niejuan,Female,,43,China-Changsha,Cofoe Medical,Medical Devices,2022,1.1,niejuan
982,Samwer Brothers,,/,,,Rocket Internet,E-commerce Venture capital,2017,3.8,samwer
7442,Tahir,Male,,67,Indonesia-Jakarta,Bank Mayapada,Banking,2019,1.6,tahir
10128,Tahir,Male,,68,Indonesia-Jakarta,Bank Mayapada,Banking,2020,2.5,tahir


In [703]:
# create synonyms for manually edits
synonyms = res2[['name', 'sex', 'birth', 'geo', 'companies', 'industry', 'name_id']].drop_duplicates(subset=['name', 'name_id', 'companies'])

In [704]:
synonyms.to_csv('person_synonyms_2.csv', index=False)

In [503]:
# check: 
# 1. people with 2 companies 
# 2. people with 2 entry for same year
# (which possibly means same name for different person)

In [705]:
# 1. 
cs = synonyms.groupby(['name_id'])['companies'].count()
synonyms[synonyms.name_id.isin(cs[cs>1].index)]

Unnamed: 0,name,sex,birth,geo,companies,industry,name_id
7698,Abdulla Ahmed Al Ghurair,Male,,United Arab Emirates-Dubai,Mashreq Bank,Banking,abdulla_ahmed_al_ghurair
4814,Abdulla Ahmed Al Ghurair & Family,,1955,,Mashreq Bank,Banking,abdulla_ahmed_al_ghurair
2168,Abdulla Ahmed Al Ghurair & family,,1956,,Mashreq Bank,Banking,abdulla_ahmed_al_ghurair
10771,Abdullah bin Sulaiman Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment,Banking,abdullah_bin_sulaiman_al_rajhi
13775,Abdullah bin Sulaiman Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking & Investment,Banking,abdullah_bin_sulaiman_al_rajhi
...,...,...,...,...,...,...,...
15210,Zuo Xiaoping,Female,,China-Guangdong-Foshan,Lesso,Construction Materials,zuo_xiaoping
5949,Zuo Zongshen,Male,,China-Chongqing,Zongshen,"Motorcycles, Real estate",zuo_zongshen
2595,Zuo Zongshen & family,,1953,,Zongshen,Motorcycles、 Real estate,zuo_zongshen
1518,Zygi Wilf,,1951,,Garden Homes,Real Estate,zygi_wilf


In [706]:
synonyms[synonyms.name_id.isin(cs[cs>1].index)].to_csv('multiple_companies.csv', index=False)

In [707]:
# 2.
ws = res2.groupby(['name_id', 'year'])['wealth'].count()
ws_2 = ws[ws>1]

In [708]:
res2[res2.name_id.isin(ws_2.index.get_level_values(0))].to_csv('multiple_year.csv', index=False)

In [709]:
ws_2.index.get_level_values(0).unique()

Index(['chen_yanni', 'huang_wei', 'huang_zheng', 'james_d_slavik', 'jiang_bin',
       'jiang_long', 'jiang_wei', 'jim_davis', 'leonardo_del_vecchio',
       'li_hua', 'li_li', 'li_min', 'li_ping', 'li_xiaoming', 'robert_miller',
       'tu_jianhua', 'vagit_alekperov', 'wang_jian', 'wang_jun',
       'wang_yanqing', 'willis_j_johnson', 'xu_xin', 'yeung_kin_man',
       'zhang_changhong', 'zhang_jin', 'zhang_liang', 'zhang_xin',
       'zhang_yong', 'zhou_jian'],
      dtype='object', name='name_id')

In [518]:
res2[res2.name_id == 'leonardo_del_vecchio']

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
8361,Leonardo Del Vecchio,,1936.0,,,Luxottica,Luxury Goods,2017,18.0,leonardo_del_vecchio
8362,Leonardo Del Vecchio,,1936.0,,,Luxottica,Luxury Goods,2018,25.0,leonardo_del_vecchio
8363,Leonardo Del Vecchio,Male,,84.0,Italy-Lombardy-Milan,Luxottica,Luxury Goods,2019,23.0,leonardo_del_vecchio
8364,Leonardo Del Vecchio,Male,,85.0,Italy-Lombardy-Milan,Luxottica,Luxury Goods,2020,29.0,leonardo_del_vecchio
8365,Leonardo Del Vecchio,Male,,86.0,Italy-Lombardy-Milan,Luxottica,Luxury Goods,2021,28.0,leonardo_del_vecchio
8366,Leonardo Del Vecchio,Male,,87.0,Italy-Lombardy-Milan,EssilorLuxottica,Luxury Goods,2022,33.0,leonardo_del_vecchio
8367,Leonardo Del Vecchio,Male,1936.0,,,Luxottica,Eye Wear,2012,13.0,leonardo_del_vecchio
8368,Leonardo Del Vecchio,Male,1936.0,,,Luxottica,Eye Wear,2013,16.0,leonardo_del_vecchio
8369,Leonardo Del Vecchio,Male,1936.0,,,Luxottica,Eye Wear,2014,18.0,leonardo_del_vecchio
8370,Leonardo Del Vecchio,Male,1936.0,,,Luxottica,Eye Wear,2015,19.0,leonardo_del_vecchio


In [519]:
# ^ above guy has duplicated 2016 entry
# also below guys

In [525]:
guy = 'vagit_alekperov'
guy = 'willis_j_johnson'
res2[res2.name_id == guy].sort_values(by=['companies', 'year'])

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
17016,Zhang Jin,Male,,48,China-Guangdong-Guangzhou,Cedar,"Investment, real estate",2019,5.6,zhang_jin
17018,Zhang Jin,Male,,49,China-Guangdong-Guangzhou,Cedar,"Logistics, Chemical Industry",2020,5.8,zhang_jin
17020,Zhang Jin,Male,,50,China-Guangdong-Guangzhou,Cedar,Investments,2021,4.6,zhang_jin
17014,Zhang Jin,,1971.0,,,Junhua,Investment、 real estate、 automobile,2017,3.3,zhang_jin
17015,Zhang Jin,,1971.0,,,Junhua,"Investment, real estate, automobile",2018,4.8,zhang_jin
17021,Zhang Jin,Female,,未知,China-Guangdong-Shenzhen,Maoye,Estate Holding,2021,2.9,zhang_jin
18219,Huang Maoru & Zhang Jin,,1965.0,,,Maoye,Real Estate、 Retail,2017,3.0,zhang_jin
18220,Huang Maoru & Zhang Jin,,1965.0,,,Maoye,"Real Estate, Retail",2018,3.3,zhang_jin
17017,Zhang Jin,Female,,未知,China-Guangdong-Shenzhen,Maoye,"Real estate,",2019,2.8,zhang_jin
17019,Zhang Jin,Female,,未知,China-Guangdong-Shenzhen,Maoye,"Real Estate,",2020,2.9,zhang_jin


In [650]:
# complicated cases
# guy = 'zhang_liang'
# guy = 'zhang_yong'
# guy = 'jiang_wei'
# guy = 'zhou_yifeng'
guy = 'paul_demarais_jr'
res2[res2.name_id == guy].sort_values(by=['companies', 'year'])

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
11497,Paul Demarais Jr. & Family,,1955.0,,,Power Corp. of Canada,Conglomerate,2018,2.7,paul_demarais_jr
11496,Paul Demarais Jr,Male,,65.0,Canada-Montreal,Power Corp. of Canada,Conglomerate,2019,2.3,paul_demarais_jr


In [648]:
1953+69

2022

In [665]:
a = res2[(res2.name.str.contains('&'))]
a[a.year == 2021]

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
17735,Jose & Francisco Jose Calderon Rojas,Male,,66,Mexico-Monterrey,FEMSA,Food & Beverages,2021,1.3,jose_rojas
17818,Jose & Francisco Jose Calderon Rojas,Male,,66,Mexico-Monterrey,FEMSA,Food & Beverages,2021,1.3,jose_rojas


In [651]:
# FIXME: 1. don't insert new row for multiple names entry. just use one name.
# 2. double check duplicated time after mapping to person id.

In [714]:
# after manual edits
synonyms_old = pd.read_csv('person_synonyms_1.csv')
# synonym_map = synonyms.set_index(['name', 'companies'])['name_id'].to_dict()

In [717]:
s1 = synonyms.set_index(['name', 'companies'])
s2 = synonyms_old.set_index(['name', 'companies'])

In [720]:
s1 = s1.sort_index()
s2 = s2.sort_index()

In [721]:
name_id_edited = []
for i in s1.index:
    s2_id = s2.loc[i]
    if s2_id.shape[0] > 1:
        name_id_edited.append(np.nan)
    else:
        name_id_edited.append(s2_id['name_id'].iloc[0])

In [722]:
s1['name_id_new'] = name_id_edited

In [724]:
s1[s1['name_id_new'] != s1['name_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,birth,geo,industry,name_id,name_id_new
name,companies,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Acharya Balkrishna,Patanjali Ayurved,Male,,India-Haridwar,Consumer Goods,acharya_balkrishna,acharya_balakrishna
Adam Kwok Kai-Fai & Family,Sun Hung Kai,,1983,,Real Estate,adam_kwok_kai_fai,adam_kwok
Adam Kwok Kai-fai,Sun Hung Kai,Male,,China-Hong Kong,Real Estate,adam_kwok_kai_fai,adam_kwok
Adam Neumann,WeWork,Male,,United States-New York-New York,Real Estate,adam_neumann,adam_neuman
Alain Taravella & Jacques Nicolet,Altarea Cogedim,,1950,,Real Estate,alain_taravella_and_jacques_nicolet,
...,...,...,...,...,...,...,...
Zhu Min & Xu Yuqing,Cybernaut,,1948、1949,,Investments,zhu_min_and_xu_yuqing,
Zhu Yicai & Wu Xueqin,Yurun,,1956,,Food & Beverages,zhu_yicai_and_wu_xueqin,
Zong Qinghou family,Wahaha,Male,1945,,Drinks,zong_qinghou_family,zong_qinghou
Zuo Hongbo & Zhe Shuxia,Aurora,,1965、1963,,Manufacturing,zuo_hongbo_and_zhe_shuxia,


In [726]:
s1['name_id_new'] = s1['name_id_new'].fillna(s1['name_id'])

In [728]:
s1.to_csv('person_synonyms_2.csv')

In [729]:
synonym_map = s1['name_id_new'].to_dict()

In [730]:
synonym_map[('Alain Taravella & Jacques Nicolet', 'Altarea Cogedim   ')]

'alain_taravella_and_jacques_nicolet'

In [731]:
name_ids = []
for i, row in res2.iterrows():
    k = (row['name'], row['companies'])
    name_ids.append(synonym_map[k])

In [732]:
# res2['name_id'] = res2['name'].map(lambda x: synonym_map[x])
res2['name_id'] = name_ids

In [733]:
res2[pd.isnull(res2['name_id'])]

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id


In [734]:
res2[~res2['name_id'].str.contains('_')]

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
14371,Dongxue,Female,,未知,China-Hefei,Laoxiangji,Restaurant,2021,1.2,dongxue
17560,Dongxue,Female,,未知,China-Hefei,Laoxiangji,Restaurants,2022,1.3,dongxue
17970,Niejuan,Female,,43,China-Changsha,Cofoe Medical,Medical Devices,2022,1.1,niejuan
7442,Tahir,Male,,67,Indonesia-Jakarta,Bank Mayapada,Banking,2019,1.6,tahir
10128,Tahir,Male,,68,Indonesia-Jakarta,Bank Mayapada,Banking,2020,2.5,tahir
13162,Tahir,Male,,69,Indonesia-Jakarta,Bank Mayapada,Banking,2021,2.0,tahir
16481,Tahir,Male,,70,Indonesia-Jakarta,Bank Mayapada,Banking,2022,1.3,tahir
2411,Valerie & family,,/,,,Rosemount Estate,Food & Beverages,2017,1.2,valerie


In [736]:
res2

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
7698,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2019,1.2,abdulla_ahmed_al_ghurair
10349,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2020,1.2,abdulla_ahmed_al_ghurair
13367,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2021,1.2,abdulla_ahmed_al_ghurair
16663,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2022,2.8,abdulla_ahmed_al_ghurair
4814,Abdulla Ahmed Al Ghurair & Family,,1955,,,Mashreq Bank,Banking,2018,1.3,abdulla_ahmed_al_ghurair
...,...,...,...,...,...,...,...,...,...,...
12690,Zygmunt Solorz-Zak,Male,,65,Poland-Warsaw,Cyfrowy Polsat,Media,2021,3.2,zygmunt_solorz_zak
16038,Zygmunt Solorz-Zak,Male,,66,Poland-Warsaw,Cyfrowy Polsat,Media,2022,3.2,zygmunt_solorz_zak
17692,li Yanggu,Male,,未知,China -Beijing,Pony Testing,Professional Services,2022,1.2,li_yanggu
5106,Ángel Losada Moreno,,1956,,,Grupo Gigante,Retail,2018,1.1,angel_losada_moreno


In [735]:
res2.to_csv('hurun_all_correct_id.csv', index=False)

In [737]:
geo_syns = pd.read_csv('../../../ddf--open_numbers/ddf--synonyms--geo.csv')
geo_map = geo_syns.set_index('synonym')['geo'].to_dict()

In [738]:
def group_func(ser):
    res = ser.dropna()
    if res.empty:
        return np.nan
    return res.iloc[0]

person_geo = res2.groupby('name_id')['geo'].agg(group_func)

In [739]:
person_geo.hasnans

True

In [740]:
person_geo.loc[pd.isnull(person_geo)].index

Index(['achal_anil_bakeri', 'adani_vinodbhai_shantilal', 'ahsen_ozokur',
       'alain_taravella_and_jacques_nicolet', 'albert_frere', 'alex_c_lo',
       'alexander_lebedev', 'alexander_spanos', 'alexandre_ricard',
       'alexei_ananyev',
       ...
       'zhu_xianglan', 'zhu_yaowen', 'zhu_yicai_and_wu_xueqin', 'zhu_yiwen',
       'zhuang_min', 'zong_liping', 'zu_liechtenstein', 'zuber_issa',
       'zuo_hongbo_and_zhe_shuxia', 'zuo_zongshen_and_yuan_dexiu'],
      dtype='object', name='name_id', length=801)

In [741]:
person_geo.loc[pd.isnull(person_geo)].index.values

array(['achal_anil_bakeri', 'adani_vinodbhai_shantilal', 'ahsen_ozokur',
       'alain_taravella_and_jacques_nicolet', 'albert_frere', 'alex_c_lo',
       'alexander_lebedev', 'alexander_spanos', 'alexandre_ricard',
       'alexei_ananyev', 'ali_wakrim',
       'alicia_and_tannetta_fentener_van_vlissingen', 'allan_gray',
       'alwin_lehner', 'americo_amorim', 'anders_and_soren_westermann',
       'andreas_e_rihs', 'andrew_and_peggy_cherng', 'andrew_buckeridge',
       'anne_cox_chambers', 'anne_gittinger', 'antonio_ermirio_de_moraes',
       'ao_xiaoqiang', 'archbold_d_van_beuren', 'arif_chowdhury',
       'asha_burman', 'ashok_parmanand_hinduja', 'ayman_asfari',
       'azad_moopen', 'bai_baokun', 'balram_garg',
       'barbara_agnes_ferrari_bengolea_and_amalia_amoedo',
       'barry_charles_diller', 'barry_sherman', 'beljinder_boparan',
       'belmiro_de_azevedo', 'benedicta_chamberlain',
       'berthold_jr_albrecht_and_theo_jr_albrecht_brothers',
       'bhumibol_adulyadej', 'bi

In [742]:
person_geo

name_id
abdulla_ahmed_al_ghurair          United Arab Emirates-Dubai
abdulla_al_futtaim                United Arab Emirates-Dubai
abdullah_al_rajhi                        Saudi Arabia-Riyadh
abdullah_bin_sulaiman_al_rajhi           Saudi Arabia-Riyadh
abdulsamad_rabiu                               Nigeria-Lagos
                                             ...            
zuo_xiaoping                          China-Guangdong-Foshan
zuo_zongshen                                 China-Chongqing
zuo_zongshen_and_yuan_dexiu                              NaN
zygi_wilf                             United States-Millburn
zygmunt_solorz_zak                             Poland-Warsaw
Name: geo, Length: 4949, dtype: object

In [743]:
res2.shape

(17971, 10)

In [744]:
res2[res2.name_id.isin(person_geo.loc[pd.isnull(person_geo)].index)]

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id
4699,Achal Anil Bakeri & Family,,1961,,,Symphony,Consumer Durables,2018,1.4,achal_anil_bakeri
2596,Achal Anil Bakeri & family,,1961,,,Symphony,Consumer Durables,2017,1.0,achal_anil_bakeri
3697,Adani Vinodbhai Shantilal,,/,,,Adani Enterprises,Conglomerate,2018,2.9,adani_vinodbhai_shantilal
2434,Ahsen Ozokur,,1951,,,Ulker Biskuvi,Food & Beverages,2017,1.1,ahsen_ozokur
1961,Alain Taravella & Jacques Nicolet,,1950,,,Altarea Cogedim,Real Estate,2017,1.5,alain_taravella_and_jacques_nicolet
...,...,...,...,...,...,...,...,...,...,...
13573,Zuber Issa,Female,,未知,,EG Group,Multi-Brand Retail,2021,3.2,zuber_issa
16849,Zuber Issa,Female,,未知,,EG Group,Multi-Brand Retail,2022,3.7,zuber_issa
2739,Zuo Hongbo & Zhe Shuxia,,1965、1963,,,Aurora,Manufacturing,2017,1.0,zuo_hongbo_and_zhe_shuxia
5283,Zuo Hongbo & Zhe Shuxia,,1965、1963,,,Aurora,Photoelectric material,2018,1.1,zuo_hongbo_and_zhe_shuxia


In [288]:
# FIXME: 1. maybe manually add the geo for above people.
# 2. calculate the birth for all people
# but I am not going to do this right now

In [289]:
# now drop those people without geo info and translate place to geo

In [745]:
person_geo = person_geo.dropna()
person_geo = person_geo.to_frame()

In [746]:
def get_country(x):
    if 'Taipei' in x:
        return "Taiwan"
    if 'Hong Kong' in x:
        return "Hong Kong"
    if x == 'ChinaChangsha':
        return 'China'
    if x == 'United StatesNew Jersey':
        return "United States"
    res = x.split('-')[0].strip()
    if res == 'Dubai':
        return "United Arab Emirates"
    if res == 'Melbourne':
        return 'Australia'
    return res

person_geo['country'] = person_geo['geo'].map(get_country)

In [747]:
person_geo

Unnamed: 0_level_0,geo,country
name_id,Unnamed: 1_level_1,Unnamed: 2_level_1
abdulla_ahmed_al_ghurair,United Arab Emirates-Dubai,United Arab Emirates
abdulla_al_futtaim,United Arab Emirates-Dubai,United Arab Emirates
abdullah_al_rajhi,Saudi Arabia-Riyadh,Saudi Arabia
abdullah_bin_sulaiman_al_rajhi,Saudi Arabia-Riyadh,Saudi Arabia
abdulsamad_rabiu,Nigeria-Lagos,Nigeria
...,...,...
zuo_s,China -Beijing,China
zuo_xiaoping,China-Guangdong-Foshan,China
zuo_zongshen,China-Chongqing,China
zygi_wilf,United States-Millburn,United States


In [748]:
person_geo['country'].unique()

array(['United Arab Emirates', 'Saudi Arabia', 'Nigeria', 'India',
       'United States', 'Brazil', 'USA', 'Hong Kong', 'Denmark', 'Turkey',
       'China', 'UK', 'Russia', 'Thailand', 'Japan', 'Canada', 'France',
       'Australia', 'Switzerland', 'United Kingdom', 'Germany', 'Spain',
       'Mexico', 'Italy', 'Argentina', 'Kazakhstan', 'Indonesia',
       'Norway', 'Netherlands', 'Chile', 'Israel', 'Peru', 'Malaysia',
       'Czech Republic', 'Philippines', 'Sweden', 'Finland', 'Taiwan',
       'Monaco', 'Singapore', 'Morocco', 'Belgium', 'UAE', 'Georgia',
       'Nepal', 'South Korea', 'Bahamas', 'Colombia', 'South Africa',
       'Liechtenstein', 'Ireland', 'Austria', 'Poland', 'Portugal',
       'Qatar', 'Kuwait', 'New Zealand', 'Venezuela', 'Brunei', 'Vietnam',
       'Romania', 'Angola', 'Algeria', 'Slovakia', 'Cayman Islands',
       'Ukraine', 'Cambodia', 'Egypt', 'Oman', 'Tanzania', 'Lebanon',
       'Cyprus', 'Hungary', 'Bangladesh', 'Greece', 'Seychelles'],
      dtype=obj

In [749]:
person_geo['on_country'] = person_geo['country'].map(lambda x: geo_map[x])

In [750]:
person_geo

Unnamed: 0_level_0,geo,country,on_country
name_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abdulla_ahmed_al_ghurair,United Arab Emirates-Dubai,United Arab Emirates,are
abdulla_al_futtaim,United Arab Emirates-Dubai,United Arab Emirates,are
abdullah_al_rajhi,Saudi Arabia-Riyadh,Saudi Arabia,sau
abdullah_bin_sulaiman_al_rajhi,Saudi Arabia-Riyadh,Saudi Arabia,sau
abdulsamad_rabiu,Nigeria-Lagos,Nigeria,nga
...,...,...,...
zuo_s,China -Beijing,China,chn
zuo_xiaoping,China-Guangdong-Foshan,China,chn
zuo_zongshen,China-Chongqing,China,chn
zygi_wilf,United States-Millburn,United States,usa


In [330]:
# next: create datapoints, join with Forbes

In [751]:
geo_mapping = person_geo['on_country'].to_dict()

In [752]:
res3 = res2.copy()

In [753]:
res3['on_country'] = res3['name_id'].map(lambda x: geo_mapping.get(x, None))

In [754]:
res3

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id,on_country
7698,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2019,1.2,abdulla_ahmed_al_ghurair,are
10349,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2020,1.2,abdulla_ahmed_al_ghurair,are
13367,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2021,1.2,abdulla_ahmed_al_ghurair,are
16663,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2022,2.8,abdulla_ahmed_al_ghurair,are
4814,Abdulla Ahmed Al Ghurair & Family,,1955,,,Mashreq Bank,Banking,2018,1.3,abdulla_ahmed_al_ghurair,are
...,...,...,...,...,...,...,...,...,...,...,...
12690,Zygmunt Solorz-Zak,Male,,65,Poland-Warsaw,Cyfrowy Polsat,Media,2021,3.2,zygmunt_solorz_zak,pol
16038,Zygmunt Solorz-Zak,Male,,66,Poland-Warsaw,Cyfrowy Polsat,Media,2022,3.2,zygmunt_solorz_zak,pol
17692,li Yanggu,Male,,未知,China -Beijing,Pony Testing,Professional Services,2022,1.2,li_yanggu,chn
5106,Ángel Losada Moreno,,1956,,,Grupo Gigante,Retail,2018,1.1,angel_losada_moreno,mex


In [755]:
dps = res3[['on_country', 'name_id', 'year', 'wealth']].copy()
dps = dps.dropna(how='any')

In [756]:
dps

Unnamed: 0,on_country,name_id,year,wealth
7698,are,abdulla_ahmed_al_ghurair,2019,1.2
10349,are,abdulla_ahmed_al_ghurair,2020,1.2
13367,are,abdulla_ahmed_al_ghurair,2021,1.2
16663,are,abdulla_ahmed_al_ghurair,2022,2.8
4814,are,abdulla_ahmed_al_ghurair,2018,1.3
...,...,...,...,...
12690,pol,zygmunt_solorz_zak,2021,3.2
16038,pol,zygmunt_solorz_zak,2022,3.2
17692,chn,li_yanggu,2022,1.2
5106,mex,angel_losada_moreno,2018,1.1


In [757]:
dps1 = dps[['name_id', 'year', 'wealth']].copy()
dps1

Unnamed: 0,name_id,year,wealth
7698,abdulla_ahmed_al_ghurair,2019,1.2
10349,abdulla_ahmed_al_ghurair,2020,1.2
13367,abdulla_ahmed_al_ghurair,2021,1.2
16663,abdulla_ahmed_al_ghurair,2022,2.8
4814,abdulla_ahmed_al_ghurair,2018,1.3
...,...,...,...
12690,zygmunt_solorz_zak,2021,3.2
16038,zygmunt_solorz_zak,2022,3.2
17692,li_yanggu,2022,1.2
5106,angel_losada_moreno,2018,1.1


In [758]:
dps1.columns = ['person', 'year', 'wealth']

In [759]:
dps1 = dps1.set_index(['person', 'year'])['wealth']
dps1 = dps1.sort_index()

In [763]:
dps1[dps1.index.duplicated()]

person                year
leonardo_del_vecchio  2016    23.0
vagit_alekperov       2012    12.0
willis_j_johnson      2021     1.3
Name: wealth, dtype: float64

In [764]:
# drop them
dps1 = dps1[~dps1.index.duplicated()]

In [765]:
dps1.to_csv('../../hurun/ddf--datapoionts--wealth--by--person--year.csv')

In [771]:
forbes_income = pd.read_csv('../../forbes/ddf--datapoints--annual_income--by--person--year.csv').set_index(['person', 'year'])

In [766]:
# wealth to income
# assume average annual return on assets is 3%, calculate the income for all data points
r = 0.03

income = dps1 * r * 1e9

In [787]:
dps[dps['name_id'].str.contains('charli')]

Unnamed: 0,on_country,name_id,year,wealth
7389,usa,charlie_munger,2019,1.6
10077,usa,charlie_munger,2020,1.7
13116,usa,charlie_munger,2021,1.8
16438,usa,charlie_munger,2022,2.4


In [768]:
income.loc['jeff_bezos', 2022]

5640000000.0

In [777]:
forbes_income.loc['bill_gates', 2021]

annual_income    3.720000e+09
Name: (bill_gates, 2021), dtype: float64

In [789]:
income.loc['charlie_munger', 2022] / forbes_income.loc['charles_munger', 2021]

annual_income    1.2
Name: (charles_munger, 2021), dtype: float64

In [None]:
# looks good because it's around 1

In [790]:
income.to_csv('../../hurun/ddf--datapoints--annual_income--by--person--year.csv')

In [791]:
income

person                    year
abdulla_ahmed_al_ghurair  2017    39000000.0
                          2018    39000000.0
                          2019    36000000.0
                          2020    36000000.0
                          2021    36000000.0
                                     ...    
zygmunt_solorz_zak        2018    90000000.0
                          2019    78000000.0
                          2020    84000000.0
                          2021    96000000.0
                          2022    96000000.0
Name: wealth, Length: 16702, dtype: float64

In [359]:
# try to merge it into Forbes list!

In [806]:
res3

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id,on_country
7698,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2019,1.2,abdulla_ahmed_al_ghurair,are
10349,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2020,1.2,abdulla_ahmed_al_ghurair,are
13367,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2021,1.2,abdulla_ahmed_al_ghurair,are
16663,Abdulla Ahmed Al Ghurair,Male,,未知,United Arab Emirates-Dubai,Mashreq Bank,Banking,2022,2.8,abdulla_ahmed_al_ghurair,are
4814,Abdulla Ahmed Al Ghurair & Family,,1955,,,Mashreq Bank,Banking,2018,1.3,abdulla_ahmed_al_ghurair,are
...,...,...,...,...,...,...,...,...,...,...,...
12690,Zygmunt Solorz-Zak,Male,,65,Poland-Warsaw,Cyfrowy Polsat,Media,2021,3.2,zygmunt_solorz_zak,pol
16038,Zygmunt Solorz-Zak,Male,,66,Poland-Warsaw,Cyfrowy Polsat,Media,2022,3.2,zygmunt_solorz_zak,pol
17692,li Yanggu,Male,,未知,China -Beijing,Pony Testing,Professional Services,2022,1.2,li_yanggu,chn
5106,Ángel Losada Moreno,,1956,,,Grupo Gigante,Retail,2018,1.1,angel_losada_moreno,mex


In [870]:
hurun_person = res3.copy()

In [871]:
hurun_person['age'] = hurun_person['age'].replace('未知', np.nan)

In [872]:
hurun_person['sex'] = hurun_person['sex'].replace('未知', np.nan)

In [873]:
hurun_person['birth'] = hurun_person['birth'].replace('/', np.nan)

In [874]:
gs = hurun_person.groupby('name_id')
gs.get_group('abdulla_ahmed_al_ghurair')

Unnamed: 0,name,sex,birth,age,geo,companies,industry,year,wealth,name_id,on_country
7698,Abdulla Ahmed Al Ghurair,Male,,,United Arab Emirates-Dubai,Mashreq Bank,Banking,2019,1.2,abdulla_ahmed_al_ghurair,are
10349,Abdulla Ahmed Al Ghurair,Male,,,United Arab Emirates-Dubai,Mashreq Bank,Banking,2020,1.2,abdulla_ahmed_al_ghurair,are
13367,Abdulla Ahmed Al Ghurair,Male,,,United Arab Emirates-Dubai,Mashreq Bank,Banking,2021,1.2,abdulla_ahmed_al_ghurair,are
16663,Abdulla Ahmed Al Ghurair,Male,,,United Arab Emirates-Dubai,Mashreq Bank,Banking,2022,2.8,abdulla_ahmed_al_ghurair,are
4814,Abdulla Ahmed Al Ghurair & Family,,1955.0,,,Mashreq Bank,Banking,2018,1.3,abdulla_ahmed_al_ghurair,are
2168,Abdulla Ahmed Al Ghurair & family,,1956.0,,,Mashreq Bank,Banking,2017,1.3,abdulla_ahmed_al_ghurair,are


In [875]:
def process_values(lst):
    if len(lst) == 1:
        return lst[0]
    else:
        return ';'.join(map(lambda x: str(x), lst))

def group_func(df):
    res = dict()
    cols = ['name', 'sex', 'birth', 'geo', 'companies', 'industry', 'on_country']
    for c in cols:
        dropna = df[c].dropna()
        if dropna.empty:
            res[c] = np.nan
        else:
            res[c] = process_values(dropna.unique())
    res['latest_year'] = df['year'].max()
    try:
        res['age'] = df['age'].dropna().max()
    except:
        res['age'] = process_values(df['age'].dropna().unique())
    return pd.DataFrame.from_records([res])

In [876]:
df = gs.get_group('abdulla_ahmed_al_ghurair')

In [877]:
group_func(df)

Unnamed: 0,name,sex,birth,geo,companies,industry,on_country,latest_year,age
0,Abdulla Ahmed Al Ghurair;Abdulla Ahmed Al Ghur...,Male,1955;1956,United Arab Emirates-Dubai,Mashreq Bank,Banking,are,2022,


In [878]:
hp = gs.apply(group_func)

In [879]:
hp.reset_index(level=1, drop=True)

Unnamed: 0_level_0,name,sex,birth,geo,companies,industry,on_country,latest_year,age
name_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
abdulla_ahmed_al_ghurair,Abdulla Ahmed Al Ghurair;Abdulla Ahmed Al Ghur...,Male,1955;1956,United Arab Emirates-Dubai,Mashreq Bank,Banking,are,2022,
abdulla_al_futtaim,Abdulla Al Futtaim,Male,,United Arab Emirates-Dubai,Al Futtaim,Automobiles & Auto Components;Automobiles & Co...,are,2022,
abdullah_al_rajhi,Abdullah Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment,Banking,sau,2019,
abdullah_bin_sulaiman_al_rajhi,Abdullah bin Sulaiman Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment;Al Rajhi Banki...,Banking,sau,2022,93
abdulsamad_rabiu,Abdulsamad Rabiu,Male,1950;1960,Nigeria-Lagos,Bua ;BUA,Multi Brand Retailing;Conglomerate;Constructio...,nga,2022,62
...,...,...,...,...,...,...,...,...,...
zuo_xiaoping,Zuo Xiaoping,Female,,China-Guangdong-Foshan,L&S;Lesso,Plastic pipes and fittings ;Industrial Product...,chn,2022,49
zuo_zongshen,Zuo Zongshen;Zuo Zongshen & family,Male,1953,China-Chongqing,Zongshen,"Motorcycles, Real estate;Motorcycles, Real Est...",chn,2022,69
zuo_zongshen_and_yuan_dexiu,Zuo Zongshen & Yuan Dexiu,,1953,,Zongshen,"Motorcycles, Real estate",,2018,
zygi_wilf,Zygi Wilf,Male,1951,United States-Millburn,Garden Homes;Minnesota Vikings,Real Estate;Sports,usa,2022,72


In [880]:
hp = hp.reset_index(level=1, drop=True)

In [881]:
hp = hp.dropna(subset=['on_country'])

In [882]:
hp

Unnamed: 0_level_0,name,sex,birth,geo,companies,industry,on_country,latest_year,age
name_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
abdulla_ahmed_al_ghurair,Abdulla Ahmed Al Ghurair;Abdulla Ahmed Al Ghur...,Male,1955;1956,United Arab Emirates-Dubai,Mashreq Bank,Banking,are,2022,
abdulla_al_futtaim,Abdulla Al Futtaim,Male,,United Arab Emirates-Dubai,Al Futtaim,Automobiles & Auto Components;Automobiles & Co...,are,2022,
abdullah_al_rajhi,Abdullah Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment,Banking,sau,2019,
abdullah_bin_sulaiman_al_rajhi,Abdullah bin Sulaiman Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment;Al Rajhi Banki...,Banking,sau,2022,93
abdulsamad_rabiu,Abdulsamad Rabiu,Male,1950;1960,Nigeria-Lagos,Bua ;BUA,Multi Brand Retailing;Conglomerate;Constructio...,nga,2022,62
...,...,...,...,...,...,...,...,...,...
zuo_s,Zuo's,Male,,China -Beijing,KE,Real Estate Services,chn,2022,51
zuo_xiaoping,Zuo Xiaoping,Female,,China-Guangdong-Foshan,L&S;Lesso,Plastic pipes and fittings ;Industrial Product...,chn,2022,49
zuo_zongshen,Zuo Zongshen;Zuo Zongshen & family,Male,1953,China-Chongqing,Zongshen,"Motorcycles, Real estate;Motorcycles, Real Est...",chn,2022,69
zygi_wilf,Zygi Wilf,Male,1951,United States-Millburn,Garden Homes;Minnesota Vikings,Real Estate;Sports,usa,2022,72


In [883]:
hp.to_csv('../../hurun/ddf--entities--person.csv')

In [894]:
geo_map['Eswatini (Swaziland)'] = 'swz'

In [895]:
def forbes_geo(x):
    if ';' in x:
        x = x.split(';')[0]
    return geo_map[x]

forbes_person['on_country'] = forbes_person['country'].map(forbes_geo)

In [826]:
forbes_person

Unnamed: 0,person,name,age,gender,country,source,industry,on_country
0,a_jayson_adair,A. Jayson Adair,52.0,M,United States,damaged cars,Automotive,usa
1,a_jerrold_perenchio,A. Jerrold Perenchio,91.0,M,United States,"Univision; television; television, Univision",Media; Media & Entertainment,usa
2,abdul_aziz_al_ghurair,Abdul Aziz Al Ghurair,67.0,M,United Arab Emirates,banking,Finance,are
3,abdul_majeed_alhokair,Abdul Majeed Alhokair,52.0,M,Saudi Arabia,"retail, real estate",Fashion & Retail,sau
4,abdul_rasyid,Abdul Rasyid,64.0,M,Indonesia,"timber, palm oil",Manufacturing,idn
...,...,...,...,...,...,...,...,...
3939,ziyavudin_magomedov,Ziyavudin Magomedov,53.0,M,Russia,"port, gas",Energy,rus
3940,zong_qinghou,Zong Qinghou; QInghou Zong,76.0,M,China,beverages; Beverages,Food & Beverage; Beverages; Food and Beverage,chn
3941,zugen_ni,Zugen Ni,65.0,M,China,appliances,Manufacturing,chn
3942,zuo_hui,Zuo Hui,51.0,M,China,real estate services,Real Estate,chn


In [914]:
forbes_person['birth'] = 2022 - forbes_person['age']

In [884]:
hp = hp.reset_index()

In [885]:
hp.columns

Index(['name_id', 'name', 'sex', 'birth', 'geo', 'companies', 'industry',
       'on_country', 'latest_year', 'age'],
      dtype='object')

In [901]:
hp.columns = ['person', 'name', 'gender', 'birth', 'country', 'companies', 'industry',
       'on_country', 'latest_year', 'age']

In [922]:
# check duplicates
all_forbes_names = []
for v in forbes_person['name'].values:
    if ';' in v:
        for n in v.split(';'):
            all_forbes_names.append(n.strip())
    else:
        all_forbes_names.append(v.strip())

In [925]:
dups = []
for _, row in hp.iterrows():
    p = row['person']
    if '_1' in p or '_2' in p or '_3' in p:
        continue
    if p in forbes_person['person']:
        dups.append(p)
        continue
    for n in row['name'].split(';'):
        if n.strip() in all_forbes_names:
            dups.append(p)
            break

In [927]:
len(dups)

2388

In [928]:
hp.shape

(4148, 10)

In [929]:
hp[~hp.person.isin(dups)]

Unnamed: 0,person,name,gender,birth,country,companies,industry,on_country,latest_year,age
0,abdulla_ahmed_al_ghurair,Abdulla Ahmed Al Ghurair;Abdulla Ahmed Al Ghur...,Male,1955;1956,United Arab Emirates-Dubai,Mashreq Bank,Banking,are,2022,
3,abdullah_bin_sulaiman_al_rajhi,Abdullah bin Sulaiman Al Rajhi,Male,,Saudi Arabia-Riyadh,Al Rajhi Banking And Investment;Al Rajhi Banki...,Banking,sau,2022,93
7,abhaykumar_firodia,Abhaykumar Firodia,Male,,India-Pune,Force Motors,Automobiles & Components;Automobile,ind,2022,
9,abilio_diniz,Abilio Diniz,Male,1937,Brazil-Sao Paulo,Companhia Brasileira De Distribuicao;Carrefour,Multi Brand Retailing;Multi-Brand Retail;Retail,bra,2022,
12,adam_kwok,Adam Kwok & family;Adam Kwok Kai-Fai & Family;...,Male,1983,China-Hong Kong,Sun Hung Kai,Real Estate;Estate Holding,hkg,2022,
...,...,...,...,...,...,...,...,...,...,...
4141,zou_jieming,Zou Jieming,Male,,China-Guangxi-Guilin,Sanjin Pharmaceutical,Pharmaceuticals,chn,2021,78
4143,zuo_s,Zuo's,Male,,China -Beijing,KE,Real Estate Services,chn,2022,51
4144,zuo_xiaoping,Zuo Xiaoping,Female,,China-Guangdong-Foshan,L&S;Lesso,Plastic pipes and fittings ;Industrial Product...,chn,2022,49
4145,zuo_zongshen,Zuo Zongshen;Zuo Zongshen & family,Male,1953,China-Chongqing,Zongshen,"Motorcycles, Real estate;Motorcycles, Real Est...",chn,2022,69


In [935]:
# manually add duplications
dups_manual = [
    'jack_ma_yun',
    'ma_huateng',
    'charlie_munger',
    'alice_n_schwartz',
    'angela_leong_on_ki',
    'archie_aldis_emerson',
    'austin_russel',
    'banwarilal_bawri',
    'barry_strenlicht',
    'beate_heister',
    'cai_hongbing',
    'chen_jianming_1',
    'chirayu_r_amin',
    'douglas_hsu_hsu_tung',
    'fabricio_bittar_garcia',
    'forrest_li_xiaodong',
    'frederic_b_luddy',
    'gerald_j_ford',
    'girdharilal_bawri',
    'giuseppe_de_longhi',
    'hamilton_e_james',
    'herbert_t_sy',
    'henry_t_sy_jr',
    'harley_tan_sy',
    'hans_tan_sy',
    'jack_cowins',
    'jiang_wei_1',
    'jim_davis_2',
    'joseph_tsai_chung_hsin',
    'lee_yeow_chor',
    'li_li_1',
    'li_ping_2',
    'li_zhengguo',
    'mahendra_c_choksi',
    'michael_ying_lee_yuen',
    'ong_beng_seng',
    'pamela_mars_wright',
    'peter_woo_kwong_ching',
    'pratap_reddy',
    'pv_ramaprasad_reddy',
    'samuel_yin_yan_liang',
    'rubens_ometto',
    's_curtin_johnson',
    'steven_udvar',
    'thomas_lau_luen_hung',
    # 
    'angela_leong_on_ki',
    'bruce_cheng_chung_hua',
    'chen_jianmin',
    'daniel_tsai_ming_chung',
    'huang_zheng_1',
    'daniel_chiu_tat_jung',
    'li_xueling',
    'francis_choi_chee_ming',
    'henry_cheng_kar_shun',
    'jason_chang_cs',
    'kong_jianmin',
    'pansy_ho_chiu_king',
    'peter_woo_kwong_ching',
    'pierre_chen_tai_ming',
    'samuel_yin_yan_liang',
    'scott_lin_yao_ying',
    'terry_guo_tai_ming'
]

In [936]:
dups_all = set(dups)
for i in dups_manual:
    dups_all.add(i)

In [937]:
merged = pd.concat([forbes_person, hp[~hp.person.isin(dups_all)]], ignore_index=True)

In [938]:
merged.columns

Index(['person', 'name', 'age', 'gender', 'country', 'source', 'industry',
       'on_country', 'birth', 'companies', 'latest_year'],
      dtype='object')

In [939]:
merged = merged[['person', 'on_country', 'name', 'birth', 'gender', 'country', 'source', 'industry', 'age', 'companies', 'latest_year']]

In [940]:
merged = merged.drop_duplicates(subset=['person'], keep='first')

In [941]:
merged.sort_values(by=['person']).to_csv('merged-3.csv', index=False)

In [942]:
# now create a ddf version

In [944]:
merged.to_csv('../../ddf--entities--person.csv', index=False)

In [945]:
merged

Unnamed: 0,person,on_country,name,birth,gender,country,source,industry,age,companies,latest_year
0,a_jayson_adair,usa,A. Jayson Adair,1970.0,M,United States,damaged cars,Automotive,52.0,,
1,a_jerrold_perenchio,usa,A. Jerrold Perenchio,1931.0,M,United States,"Univision; television; television, Univision",Media; Media & Entertainment,91.0,,
2,abdul_aziz_al_ghurair,are,Abdul Aziz Al Ghurair,1955.0,M,United Arab Emirates,banking,Finance,67.0,,
3,abdul_majeed_alhokair,sau,Abdul Majeed Alhokair,1970.0,M,Saudi Arabia,"retail, real estate",Fashion & Retail,52.0,,
4,abdul_rasyid,idn,Abdul Rasyid,1958.0,M,Indonesia,"timber, palm oil",Manufacturing,64.0,,
...,...,...,...,...,...,...,...,...,...,...,...
5641,zou_jieming,chn,Zou Jieming,,Male,China-Guangxi-Guilin,,Pharmaceuticals,78,Sanjin Pharmaceutical,2021.0
5642,zuo_s,chn,Zuo's,,Male,China -Beijing,,Real Estate Services,51,KE,2022.0
5643,zuo_xiaoping,chn,Zuo Xiaoping,,Female,China-Guangdong-Foshan,,Plastic pipes and fittings ;Industrial Product...,49,L&S;Lesso,2022.0
5644,zuo_zongshen,chn,Zuo Zongshen;Zuo Zongshen & family,1953,Male,China-Chongqing,,"Motorcycles, Real estate;Motorcycles, Real Est...",69,Zongshen,2022.0


In [948]:
hurun_wealth = dps1.reset_index()

In [950]:
hurun_wealth.columns = ['person', 'year', 'worth']
hurun_wealth = hurun_wealth[~hurun_wealth.person.isin(dups_all)]

In [952]:
hurun_wealth = hurun_wealth.drop_duplicates(subset=['person', 'year'])

In [953]:
hurun_wealth['worth'] = hurun_wealth['worth'] * 1000

In [954]:
hurun_wealth

Unnamed: 0,person,year,worth
0,abdulla_ahmed_al_ghurair,2017,1300.0
1,abdulla_ahmed_al_ghurair,2018,1300.0
2,abdulla_ahmed_al_ghurair,2019,1200.0
3,abdulla_ahmed_al_ghurair,2020,1200.0
4,abdulla_ahmed_al_ghurair,2021,1200.0
...,...,...,...
16691,zygi_wilf,2018,2400.0
16692,zygi_wilf,2019,2400.0
16693,zygi_wilf,2020,2700.0
16694,zygi_wilf,2021,3000.0


In [947]:
forbes_wealth

Unnamed: 0,person,year,worth
0,a_jayson_adair,2021,1000.0
1,a_jerrold_perenchio,2001,3000.0
2,a_jerrold_perenchio,2002,2600.0
3,a_jerrold_perenchio,2003,2300.0
4,a_jerrold_perenchio,2004,2700.0
...,...,...,...
61900,zygmunt_solorz_zak,2017,2500.0
61901,zygmunt_solorz_zak,2018,2800.0
61902,zygmunt_solorz_zak,2019,2700.0
61903,zygmunt_solorz_zak,2020,2400.0


In [960]:
merged_wealth = pd.concat([forbes_wealth, hurun_wealth], ignore_index=True)

In [957]:
merged_wealth.to_csv('../../ddf--datapoints--worth--by--person--year.csv', index=False)

In [958]:
merged_wealth

Unnamed: 0,person,year,worth
0,a_jayson_adair,2021,1000.0
1,a_jerrold_perenchio,2001,3000.0
2,a_jerrold_perenchio,2002,2600.0
3,a_jerrold_perenchio,2003,2300.0
4,a_jerrold_perenchio,2004,2700.0
...,...,...,...
67209,zygi_wilf,2018,2400.0
67210,zygi_wilf,2019,2400.0
67211,zygi_wilf,2020,2700.0
67212,zygi_wilf,2021,3000.0


In [961]:
merged_annual_income = merged_wealth.copy()
merged_annual_income.columns = ['person', 'year', 'annual_income']

merged_annual_income['annual_income'] = merged_annual_income['annual_income'] * 0.03 * 1e6

In [968]:
merged_annual_income.set_index(['person', 'year']).loc['jeff_bezos', 2021]

  merged_annual_income.set_index(['person', 'year']).loc['jeff_bezos', 2021]


Unnamed: 0_level_0,Unnamed: 1_level_0,annual_income
person,year,Unnamed: 2_level_1
jeff_bezos,2021,5310000000.0


In [970]:
forbes_income.loc['jeff_bezos', 2021]

annual_income    5.310000e+09
Name: (jeff_bezos, 2021), dtype: float64

In [972]:
merged_annual_income.to_csv('../../ddf--datapoints--annual_income--by--person--year.csv', index=False)

In [293]:
forbes_person['country'].unique()

array(['United States', 'United Arab Emirates', 'Saudi Arabia',
       'Indonesia', 'Nigeria', 'India', 'Brazil', 'Hong Kong', 'Israel',
       'Denmark', 'Turkey', 'Russia', 'Thailand', 'Japan', 'Canada',
       'France', 'United Kingdom', 'Australia', 'Germany', 'Belgium',
       'Spain', 'Mexico', 'Peru', 'Italy', 'Argentina', 'Switzerland',
       'United States; Colombia', 'Kazakhstan; Israel', 'Norway',
       'Portugal', 'Sweden', 'Kazakhstan', 'South Africa',
       'India; Thailand', 'Chile', 'China', 'Malaysia', 'Morocco',
       'Taiwan', 'Germany; United States', 'Czechia; Czech Republic',
       'Philippines', 'Ukraine', 'Finland', 'Greece', 'Netherlands',
       'Singapore', 'Lebanon; Saudi Arabia', 'Lebanon; Switzerland',
       'South Korea', 'Kuwait', 'Colombia', 'Georgia; Russia', 'Nepal',
       'Hong Kong; China', 'Liechtenstein', 'New Zealand', 'Cyprus',
       'Romania', 'Poland', 'Macau', 'Iceland', 'Monaco', 'Ireland',
       'Austria', 'Lebanon', 'Czechia', 'Br

In [36]:
df.columns

Index(['hs_Character_Gender', 'hs_Character_Birthday', 'hs_Character_Age',
       'hs_Character_MTime', 'hs_Character_Gender_Lang', 'hs_Character_ID',
       'hs_Character_Fullname_Cn', 'hs_Character_Fullname_En',
       'hs_Character_Surname_Cn', 'hs_Character_Name_Cn',
       'hs_Character_Surname_En', 'hs_Character_Name_En', 'hs_Character_Photo',
       'hs_Character_Nationality', 'hs_Character_NativePlace_Cn',
       'hs_Character_NativePlace_En', 'hs_Character_BirthPlace_Cn',
       'hs_Character_BirthPlace_En', 'hs_Character_Permanent_Cn',
       'hs_Character_Permanent_En', 'hs_Character_Education_Cn',
       'hs_Character_Education_En', 'hs_Character_School_Cn',
       'hs_Character_School_En', 'hs_Character_Major_Cn',
       'hs_Character_Major_En', 'hs_Rank_Global_Nationality',
       'hs_Rank_Global_NativePlace_Cn', 'hs_Rank_Global_NativePlace_En',
       'hs_Rank_Global_BirthPlace_Cn', 'hs_Rank_Global_BirthPlace_En',
       'hs_Rank_Global_Permanent_Cn', 'hs_Rank_Global_Per

In [43]:
df = df[['hs_Character_Fullname_En', 'hs_Character_Permanent_En', 'hs_Character_Gender_Lang', 
         'hs_Character_ID', 'hs_Rank_Global_Wealth_USD', 'hs_Rank_Global_Industry_En', 'hs_Character_Age', 
        'year']].copy()

In [44]:
df.columns

Index(['hs_Character_Fullname_En', 'hs_Character_Permanent_En',
       'hs_Character_Gender_Lang', 'hs_Character_ID',
       'hs_Rank_Global_Wealth_USD', 'hs_Rank_Global_Industry_En',
       'hs_Character_Age', 'year'],
      dtype='object')

In [45]:
df.columns = ['name', 'geo', 'sex', 'id', 'wealth', 'industry', 'age', 'year']

In [50]:
# handle duplicates
# just use first value available
def handle_dups(df):
    gs = df.groupby(['id'], group_keys=False)

    def func(df):
        if df.shape[0] == 1:
            return df
        res = []
        for c in df.columns:
            try:
                v = df[c].dropna().iloc[0]
            except:
                v = np.nan
            res.append(v)
        # print(res)
        # print(df.columns)
        # print(df.index.values[0])
        return pd.DataFrame([res], columns=df.columns, index=[df.index.values[0]])

    person = gs.apply(func)
    return person

In [49]:
person.loc[0]

name                                    Jeff Bezos
geo         United States-Washington State-Seattle
sex                                           Male
id                                            2174
wealth                                       147.0
industry                             Online Retail
age                                             55
year                                          2019
Name: 0, dtype: object

In [48]:
df

Unnamed: 0,name,geo,sex,id,wealth,industry,age,year
0,Jeff Bezos,United States-Washington State-Seattle,Male,2174,147.0,Online Retail,55,2019
1,Bill Gates,United States-Washington State-Medina,Male,2175,96.0,Investments,64,2019
2,Warren Buffett,United States-Nebraska-Omaha,Male,2176,88.0,Investments,89,2019
3,Bernard Arnault,France-Paris Province-Paris,Male,2177,86.0,Single Brand Retailing,70,2019
4,Mark Zuckerberg,United States-California-Palo Alto,Male,2178,80.0,Technology,35,2019
...,...,...,...,...,...,...,...,...
2599,Zeng Lingshan,China-Beijing,Male,617,1.0,"Pharmaceuticals, investments",未知,2019
2600,Zhang Litian,China-Guangdong-Chaozhou,Male,618,1.0,Milk powder,55,2019
2601,Zhang Simin,China-Guangdong-Shenzhen,Male,568,1.0,"Pharmaceuticals, real estate",57,2019
2602,Wang Jinsong,China-Guangdong-Shenzhen,Female,1934,1.0,"Pharmaceuticals, real estate",未知,2019
