In [1]:
import pandas as pd
import numpy as np
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px

In [2]:
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
df = pd.read_csv('forbes_billionaires_geo.csv')
df['Self_made'].replace([True, False], ['Self-made', 'Not self-made'], inplace=True)
df.columns = [col_n.lower() for col_n in df.columns]

## What Does the Data Look Like?

In [4]:
n_duplicates = df.duplicated().sum()
df_described = df.describe().round(3)
null_cnts = df.isnull().sum()
null_pcts = (df.isnull().sum() / len(df)).round(3)
df_null = pd.DataFrame({'n_null': null_cnts, 
              'pct_null': null_pcts}).sort_values('n_null', ascending=False)

print(f"Dataframe Shape: {df.shape}")
print(f"Duplicate Rows: {n_duplicates}\n")
print(f"Numerical Column Description:")
display(df_described)
print(f"All Column Null Summary:")
display(df_null)

Dataframe Shape: (2755, 13)
Duplicate Rows: 0

Numerical Column Description:


Unnamed: 0,networth,rank,age,children
count,2755.0,2755.0,2630.0,1552.0
mean,4.749,1345.664,63.267,2.978
std,9.615,772.67,13.479,1.619
min,1.0,1.0,18.0,1.0
25%,1.5,680.0,54.0,2.0
50%,2.3,1362.0,63.0,3.0
75%,4.2,2035.0,73.0,4.0
max,177.0,2674.0,99.0,23.0


All Column Null Summary:


Unnamed: 0,n_null,pct_null
education,1346,0.489
children,1203,0.437
status,665,0.241
age,125,0.045
residence,40,0.015
self_made,18,0.007
citizenship,16,0.006
name,0,0.0
networth,0,0.0
country,0,0.0


### Conclusions
Over 20% of Education, Children, and Status are null. The other columns have few null values and none of the numerical columns have null values.

## What is the Distribution of Net Worth?

In [5]:
networth_hist_01 = df[['networth']].figure(kind="histogram", 
                        bins=(0, 200, 5), 
                        title="Histogram of Net Worth (All Individuals)", 
                        xTitle="Net Worth (Billions $USD)", 
                        yTitle="Frequency",
                        theme="pearl",
                        color="blue",
                        bargap=0.1,
                        orientation="v",
                        text="networth")

networth_hist_01.update_yaxes(nticks=20)
networth_hist_01.update_xaxes(nticks=20)

networth_hist_02 = df[['networth']].figure(kind="histogram", 
                        bins=(0, 10, 1), 
                        title="Histogram of Net Worth (0-$10bn individuals)", 
                        xTitle="Net Worth (Billions $USD)", 
                        yTitle="Frequency",
                        theme="pearl",
                        color="blue",
                        bargap=0.1,
                        orientation="v",
                        text="networth")

display(networth_hist_01, networth_hist_02)

In [6]:
top_20_worth = df.sort_values('networth', ascending=False).iloc[:20] 
top_20_worth_fig = top_20_worth.figure(kind="bar", 
                   x="name", 
                   y="networth", 
                   title="Net Worth of Top 20 Wealthiest Billionaires", 
                   xTitle="name", 
                   yTitle="Net Worth (Billions $USD)",
                   color="blue")
top_20_worth_fig.update_yaxes(nticks=10)
display(top_20_worth_fig)

In [7]:
df_selfmade = df[~df['self_made'].isnull()]
fig = px.histogram(df_selfmade, 
                   x="networth", 
                   facet_row="self_made", 
                   range_x=(0, 50), 
                   range_y=(0, 1600),
                   facet_col_spacing=0.05,
                   nbins=40, 
                   title="Histograms of Net Worth by Self-made Status")

fig.update_layout(bargap=0.1)
fig.update_yaxes(range=[0, 1800])
fig.update_xaxes(nticks=20)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

display(fig)

In [8]:
fig = df_selfmade.groupby(
    "Self_made").sum()[["NetWorth"]].sort_values("NetWorth", ascending=False).round(3).figure(
    kind="bar", 
    xTitle="Self-made Status", 
    yTitle="Total Net Worth (Billions $USD)", 
    title="Total Net Worth by Self-made Status",
    text="NetWorth", 
    color="blue")
fig.update_traces(textposition='outside')
display(fig)

KeyError: 'Self_made'

In [9]:
fig = df_selfmade.groupby(
    "Country").sum()[["NetWorth"]].sort_values("NetWorth", ascending=False).round(3).iloc[:10].figure(
    kind="bar", 
    xTitle="Country", 
    yTitle="Total Net Worth (Billions $USD)", 
    title="Total Net Worth by Country",
    text="NetWorth", 
    color="blue")
fig.update_traces(textposition='outside')
display(fig)

KeyError: 'Country'

In [10]:
col = "source"
fig = df_selfmade.groupby(
    col).sum()[["networth"]].sort_values("networth", ascending=False).round(3).iloc[:10].figure(
    kind="bar", 
    xTitle=col, 
    yTitle="Total Net Worth (Billions $USD)", 
    title=f"Total Net Worth by {col}",
    text="networth", 
    color="blue")
fig.update_traces(textposition='outside')
display(fig)

In [21]:
hist_data

Unnamed: 0_level_0,networth
source,Unnamed: 1_level_1
real estate,686.1
diversified,378.2
investments,339.2
pharmaceuticals,335.3
software,274.5
hedge funds,238.3
Amazon,230.0
e-commerce,225.5
Walmart,220.2
Google,216.4


In [48]:
top_10_sources = df_selfmade.groupby('source').sum().sort_values('networth', ascending=False)[:10].round(3)
source_bar = px.bar(top_10_sources, x=top_10_sources.index, y="networth", text="networth")
source_bar.update_traces(textposition='outside')
display(source_bar)

hist_data = df_selfmade[df_selfmade['source'].isin(list(top_10_sources.index))]
fig = px.bar(hist_data, x="source", y="networth", color="name", barmode="stack")
fig.update_layout(showlegend=False)
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [40]:
# help(df.iplot)
# help(df.figure)
# help(px.bar)