# Billionaire Dataset - Exploratory Data Analysis

This notebook explores a dataset of billionaires, examining demographics, wealth distribution, geographical patterns, industry distribution, and relationships with economic factors.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

## 1. Setup and Configuration
Setting up visualization styles and display parameters.

In [5]:
# Plot appearance: apply the 'ggplot' style sheet, then select seaborn's
# "Set2" palette (kept in this order so the palette is the last thing set).
plt.style.use('ggplot')
sns.set_palette("Set2")

# pandas display options: no limit on displayed columns, at most 20 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

## 2. Data Loading
Load the data and examine its basic structure.

In [6]:
# Load the dataset CSV (path is relative to the notebook's working directory)
print("Loading dataset...")
df = pd.read_csv('../data/raw/df_ready.csv')

# Overview: dimensions plus a preview of the first rows
print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
display(df.head())

# Column names, non-null counts and dtypes.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() -- doing so renders a stray "None" under the report.
print("\nColumn info:")
df.info()

Loading dataset...

Dataset shape: (2591, 30)

First few rows:


Unnamed: 0,position,wealth,industry,full_name,age,country_of_residence,city_of_residence,source,citizenship,gender,birth_date,last_name,first_name,residence_state,residence_region,birth_year,birth_month,birth_day,cpi_country,cpi_change_country,gdp_country,g_tertiary_ed_enroll,g_primary_ed_enroll,life_expectancy,tax_revenue,tax_rate,country_pop,country_lat,country_long,continent
0,1,211000,Fashion & Retail,Bernard Arnault & family,74,France,Paris,LVMH,France,M,1949-03-05,Arnault,Bernard,No subdivisions info,No subdivisions info,1949,3,5,110.05,1.1,2715518274227,65.6,102.5,82.5,24.2,60.7,67059887,46.227638,2.213749,Europe
1,2,180000,Automotive,Elon Musk,51,United States,Austin,"Tesla, SpaceX",United States,M,1971-06-28,Musk,Elon,Texas,South,1971,6,28,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,North America
2,3,114000,Technology,Jeff Bezos,59,United States,Medina,Amazon,United States,M,1964-01-12,Bezos,Jeff,Washington,West,1964,1,12,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,North America
3,4,107000,Technology,Larry Ellison,78,United States,Lanai,Oracle,United States,M,1944-08-17,Ellison,Larry,Hawaii,West,1944,8,17,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,North America
4,5,106000,Finance & Investments,Warren Buffett,92,United States,Omaha,Berkshire Hathaway,United States,M,1930-08-30,Buffett,Warren,Nebraska,Midwest,1930,8,30,117.24,7.5,21427700000000,88.2,101.8,78.5,9.6,36.6,328239523,37.09024,-95.712891,North America



Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2591 entries, 0 to 2590
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   position              2591 non-null   int64  
 1   wealth                2591 non-null   int64  
 2   industry              2591 non-null   object 
 3   full_name             2591 non-null   object 
 4   age                   2591 non-null   int64  
 5   country_of_residence  2591 non-null   object 
 6   city_of_residence     2591 non-null   object 
 7   source                2591 non-null   object 
 8   citizenship           2591 non-null   object 
 9   gender                2591 non-null   object 
 10  birth_date            2591 non-null   object 
 11  last_name             2591 non-null   object 
 12  first_name            2591 non-null   object 
 13  residence_state       2591 non-null   object 
 14  residence_region      2591 non-null   object 
 15  birth_y

None

## 3. Data Preprocessing
Check for missing values and duplicates, and explore summary statistics.

In [7]:
# Missing values: count nulls per column and show only columns that have any
print("\nChecking for missing values...")
missing_values = df.isna().sum()
print("Missing values per column:")
display(missing_values.loc[missing_values.gt(0)])

# Duplicates: count rows flagged by DataFrame.duplicated()
print("\nChecking for duplicates...")
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Descriptive statistics as produced by DataFrame.describe()
print("\nSummary statistics for numerical columns:")
summary_stats = df.describe()
display(summary_stats)


Checking for missing values...
Missing values per column:


Series([], dtype: int64)


Checking for duplicates...
Number of duplicate rows: 0

Summary statistics for numerical columns:


Unnamed: 0,position,wealth,age,birth_year,birth_month,birth_day,gdp_country,g_tertiary_ed_enroll,g_primary_ed_enroll,life_expectancy,tax_revenue,tax_rate,country_pop,country_lat,country_long
count,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0,2591.0
mean,1282.913547,4666.19066,65.252798,1957.133925,5.646083,11.882671,11029660000000.0,64.125743,100.23242,78.409101,12.422926,44.548012,483773300.0,34.394755,16.383842
std,740.191092,9917.104258,13.055442,13.074254,3.732279,9.941884,9641891000000.0,24.527813,11.966391,3.910509,5.275224,12.212804,549756700.0,16.833864,87.389321
min,1.0,1000.0,18.0,1921.0,1.0,1.0,1044779000.0,0.0,0.0,54.3,0.0,0.0,31122.0,-40.900557,-106.346771
25%,636.0,1500.0,56.0,1948.0,2.0,1.0,1394116000000.0,50.6,100.2,77.0,9.5,36.6,60297400.0,35.86166,-95.712891
50%,1272.0,2400.0,65.0,1957.0,5.0,10.0,5081770000000.0,63.4,101.8,78.5,9.6,46.1,328239500.0,37.09024,12.56738
75%,1905.0,4300.0,75.0,1966.0,9.0,21.0,21427700000000.0,88.2,102.6,81.3,12.8,59.2,1366418000.0,38.963745,104.195397
max,2540.0,211000.0,101.0,2004.0,12.0,31.0,21427700000000.0,136.6,142.1,85.94,37.2,106.3,1397715000.0,61.92411,174.885971
