In [1]:
import pandas as pd
import requests
import re

In [3]:
def country_or_nationality(df: "country vs nationality dataframe",
                           mode: "c for country, n for nationality, b for both"):
    """
    :param df: country vs nationality dataframe
    :param mode: c for country, n for nationality, b for both
    :return: returns a list containing countries or nationalities
    """
    cn = df.values.tolist()
    coun, nati = zip(*cn)
    if mode == "c":
        return coun
    elif mode == "n":
        return nati
    elif mode == "b":
        return cn
    else:
        return None


def null_values(df, mode: "g, c or a", tolist=False):
    """
    :param df: Clean dataframe
    :param mode: g for gender, c for country and a for age
    :param tolist: creates a list of names from the dataframe
    :return: 1. returns a pandas dataframe with empty gender or country registries,
            depending on mode selection. If tolist is set to True, it will return a list of names
    """
    if mode == "g":
        # Creating dataframe with empty values
        nullgender = df[df["gender"].isnull()].iloc[:, :2]  # 550

        # Creating search name column. This column will be used to access each's millionaire wikipedia page.
        nullgender["SearchName"] = nullgender["name"].apply(lambda full_name: "_".join(full_name.strip().split()))

        if tolist:
            return nullgender["SearchName"].values.tolist()
        else:
            return nullgender

    elif mode == "c":
        # Creating dataframe with empty values
        nullcountry = df[df["country"].isnull()].iloc[:, :2]  # 1391

        # Creating search name column. This column will be used to access each's millionaire wikipedia page.
        nullcountry["SearchName"] = nullcountry["name"].apply(lambda full_name: "_".join(full_name.strip().split()))
        if tolist:
            return nullcountry["SearchName"].values.tolist()
        else:
            return nullcountry

    elif mode == "a":
        # Creating dataframe with empty values
        nullage = df[df["age"].isnull()].iloc[:, :2]  # 1391

        # Creating search name column. This column will be used to access each's millionaire wikipedia page.
        nullage["SearchName"] = nullage["name"].apply(lambda full_name: "_".join(full_name.strip().split()))
        if tolist:
            return nullage["SearchName"].values.tolist()
        else:
            return nullage
    else:
        return None


def search_list(df: "dataframe to parse"):
    return df["SearchName"].values.tolist()


def get_age(nums: "List of 19xx years"):
    if len(nums) == 1:
        return 2019 - int(nums[0])
    else:
        return None


def country_search(words: "list of words in Born tag", countries: "list of countries in countries tuple"):
    for word in words:
        for country in countries:
            if word == country:
                return country
    else:
        return None


def nationality_country(Natio: "Nationality to search", CvsN: "Country vs. Nationality relationship table"):
    for element in CvsN:
        if element[1] == Natio:
            return element[0]
    else:
        return None

In [4]:
def data_enhancement(millionaires, enh_mode, con):
    """
    :param millionaires: List of millionaires to parse
    :param enh_mode: enhancement mode, c for country, a for age
    :param con: Country vs Nationality dataframe
    :return:
    """

    counter = 0
    mill_lst = []
    for millionaire in millionaires:
        url = f"https://en.wikipedia.org/wiki/{millionaire}"

        # Checking whether the page exists.
        if str(requests.get(url)) == "<Response [404]>":
            mill_lst.append((millionaire, None))
            counter += 1
            print(counter, millionaire, "<Response [404]>")
            continue

        # Checking if there are tables in the page.
        try:
            pd.read_html(url)
        except ValueError as err:
            mill_lst.append((millionaire, None))
            counter += 1
            print(counter, millionaire, err)
            continue

        # Accessing each's millionaire wikipedia page and turning first table into a list.
        wiki = pd.read_html(url)
        wikilist = wiki[0].values.tolist()

        # Data enhancing mode: COUNTRY.
        if enh_mode == "c":

            c = country_or_nationality(con, "c")
            cn = country_or_nationality(con, "b")

            # Looping each "tag" in table/list.
            for x in wikilist:

                # Getting country of origin in "Born" tag.
                if x[0] == "Born":
                    pattern_country = r"\bSouth\s[A-Za-z]+|New\s[A-Za-z]+|[A-Za-z]\.[A-Za-z]\.|[A-Za-z]+\b"
                    born_country = re.findall(pattern_country, x[1])
                    country_found = country_search(born_country, c)
                    if country_found is not None:
                        mill_lst.append((millionaire, country_found))
                        break

                # Getting country of origin in "Nationality" tag if it was not found in previous tag.
                elif x[0] == "Nationality":
                    nationality_found = nationality_country(x[1], cn)
                    if nationality_found is not None:
                        mill_lst.append((millionaire, nationality_found))
                        break

            # Setting country to None if it was not found.
            else:
                mill_lst.append((millionaire, None))

            counter += 1
            print(counter, millionaire, millionaires.index(millionaire), "    ",
                  mill_lst.index(mill_lst[len(mill_lst) - 1]), mill_lst[len(mill_lst) - 1])

        # Data enhancing mode: AGE
        elif enh_mode == "a":

            # Looping each "tag" in table/list.
            for x in wikilist:

                # Getting year of birth in "Born" tag.
                if x[0] == "Born":
                    # Looking for "age xx" string
                    pattern_age = r"age\s\d\d"
                    born_age = re.findall(pattern_age, x[1])
                    if len(born_age) == 1:
                        age_found = int(born_age[0][len(born_age[0]) - 2:len(born_age[0])])
                        mill_lst.append((millionaire, age_found))
                        break

                    # Looking for the year and computing current age
                    pattern_age = r"19\d\d"
                    born_age = re.findall(pattern_age, x[1])
                    age_found = get_age(born_age)
                    if age_found is not None:
                        mill_lst.append((millionaire, age_found))
                        break

            # Setting age to None if it was not found.
            else:
                mill_lst.append((millionaire, None))

            counter += 1
            print(counter, millionaire, millionaires.index(millionaire), "    ",
                  mill_lst.index(mill_lst[len(mill_lst) - 1]), mill_lst[len(mill_lst) - 1])

    if enh_mode == "c":
        enhanced_df = pd.DataFrame(mill_lst, columns=["SearchName", "Nationality"])
        file_name = "enhanced_nationality_df.csv"
        enhanced_df.to_csv(f"../data/processed/{file_name}", index=False)
        print(f"{file_name} was successfully saved!")
        return enhanced_df
    elif enh_mode == "a":
        enhanced_df = pd.DataFrame(mill_lst, columns=["SearchName", "Age"])
        file_name = "enhanced_age_df.csv"
        enhanced_df.to_csv(f"../data/processed/{file_name}", index=False)
        print(f"{file_name} was successfully saved!")
        return enhanced_df
    


In [5]:
paises = pd.read_csv("../data/processed/CountryNationality.csv")
clean_df = pd.read_csv("../data/processed/dataclean_info.csv")
test = ['Bernard_Arnault', 'Amancio_Ortega', 'Alice_Walton', 'Jack_Ma', 'Steve_Ballmer', "Elon_Musk"]
mike = ["Michael_Bloomberg"]
bill = ["Bill_Gates"]
chen = ["Charles_S._Cohen"]

In [44]:
all_names = null_values(clean_df, "g", tolist=True)
#result = data_enhancement(test, "c", paises)

In [45]:
all_names

['Bernard_Arnault',
 'Amancio_Ortega',
 'Alice_Walton',
 'Jack_Ma',
 'Steve_Ballmer',
 'Wang_Jianlin',
 'Beate_Heister_&_Karl_Albrecht_Jr.',
 'David_Thomson',
 'John_Mars',
 'Giovanni_Ferrero',
 'Dietrich_Mateschitz',
 'Leonardo_Del_Vecchio',
 'Len_Blavatnik',
 'He_Xiangjian',
 'Tadashi_Yanai',
 'Laurene_Powell_Jobs',
 'Lee_Kun-Hee',
 'Ray_Dalio',
 'Gennady_Timchenko',
 'Vladimir_Potanin',
 'Lukas_Walton',
 'Marcel_Herrmann_Telles',
 'Jim_Kennedy',
 'Seo_Jung-Jin',
 'Stefano_Pessina',
 'Wang_Wenyin',
 'John_Menard,_Jr.',
 'Liu_Qiangdong',
 'Pan_Zhengmin',
 'Zong_Qinghou',
 'Gianluigi_&_Rafaela_Aponte',
 'Yao_Zhenhua',
 'James_Irving',
 'Silvio_Berlusconi',
 'Hui_Wing_Mau',
 'David_Duffield',
 'George_Kaiser',
 'Patrick_Soon-Shiong',
 'Zhou_Qunfei',
 'Pauline_Macmillan_Keinath',
 'Iskander_Makhmudov',
 'Shahid_Khan',
 'Wang_Wenxue',
 'Willi_&_Isolde_Liebherr',
 'Chan_Laiwa',
 'Tsai_Eng-Meng',
 'Oleg_Deripaska',
 'Brian_Acton',
 'Wee_Cho_Yaw',
 'Leon_Black',
 'Wei_Jianjun',
 'Francis_Cho

In [33]:
clean_df.head()

Unnamed: 0,id,name,age,gender,country,image,Sector,Company,Worth(BUSD),SearchName
0,7468,Jeff Bezos,54.0,Male,United States,https://specials-images.forbesimg.com/imageser...,Technology,Amazon,112.0,Jeff_Bezos
1,4605,Bill Gates,62.0,Male,,https://specials-images.forbesimg.com/imageser...,Technology,Microsoft,90.0,Bill_Gates
2,7368,Warren Buffett,87.0,Male,,https://specials-images.forbesimg.com/imageser...,Finance and Investments,Berkshire Hathaway,84.0,Warren_Buffett
3,3808,Bernard Arnault,69.0,,,https://specials-images.forbesimg.com/imageser...,Fashion & Retail,LVMH,72.0,Bernard_Arnault
4,3213,Mark Zuckerberg,33.0,Male,,https://specials-images.forbesimg.com/imageser...,Technology,Facebook,71.0,Mark_Zuckerberg


In [34]:
def merge_enhanced_data(original_df, toadd_df, column_to_improve, column_to_drop):
    original_df["SearchName"] = original_df["name"].apply(lambda full_name: "_".join(full_name.strip().split()))
    enhanced_df = pd.merge(original_df, toadd_df, how="left", on="SearchName")
    enhanced_df.drop(columns=["SearchName", column_to_improve], axis=1, inplace=True)
    enhanced_df.rename(columns={column_to_drop : column_to_improve}, inplace=True)
    enhanced_df = enhanced_df[['id', 'name', 'age', 'gender', 'country', 'image', 'Sector', 'Company', 'Worth(BUSD)']]
    return enhanced_df

In [35]:
merge_enhanced_data(clean_df, result, "age", "Age")

Unnamed: 0,id,name,age,gender,country,image,Sector,Company,Worth(BUSD)
0,7468,Jeff Bezos,,Male,United States,https://specials-images.forbesimg.com/imageser...,Technology,Amazon,112.0
1,4605,Bill Gates,,Male,,https://specials-images.forbesimg.com/imageser...,Technology,Microsoft,90.0
2,7368,Warren Buffett,,Male,,https://specials-images.forbesimg.com/imageser...,Finance and Investments,Berkshire Hathaway,84.0
3,3808,Bernard Arnault,70.0,,,https://specials-images.forbesimg.com/imageser...,Fashion & Retail,LVMH,72.0
4,3213,Mark Zuckerberg,,Male,,https://specials-images.forbesimg.com/imageser...,Technology,Facebook,71.0
...,...,...,...,...,...,...,...,...,...
2203,7578,Zhao Xiaoqiang,,Male,China,https://specials-images.forbesimg.com/imageser...,Fashion & Retail,"fashion, entertainment",1.0
2204,9196,Zhou Liangzhang,,Male,,https://specials-images.forbesimg.com/imageser...,Manufacturing,electrical equipment,1.0
2205,1175,Zhu Xingming,,,China,https://specials-images.forbesimg.com/imageser...,Manufacturing,electrical equipment,1.0
2206,9466,Zhuo Jun,,Female,,https://specials-images.forbesimg.com/imageser...,Manufacturing,printed circuit boards,1.0
