# Billionaires By Country
Goal:

## 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Load the Dataset 

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
csv_path = "data/billionaires-by-country-2024.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

## 3. Understanding the Data

In [None]:
# Structural overview of the raw frame.
# Only the last expression of a cell is rendered automatically, so the column
# names are printed explicitly; a bare `df.columns` in the middle of the cell
# would be evaluated and silently thrown away.
df.info()
print(df.columns.tolist())
df.shape

## 4. Cleaning/Preparing Data

#### 4.1 Dropping Columns

In [None]:
# Keep only the columns listed below; every other column is discarded.
# NOTE(review): the earlier comment on this cell said population_growthRate and
# population_density_km are dropped, yet 'Population_growthRate' is part of the
# keep-list - confirm which of the two is intended.
columns_to_keep = [
    'population_2024',
    'Country_land_area',
    'country',
    'region',
    'unMember',
    'Population_growthRate',
    'population_densityMi',
    'BillionairesRichestBillionaire2023',
    'BillionairesRichestNetWorth2023',
    'BillionairesPerMillionPeople2023',
    'BillionairesTotalNetWorth2023',
]
df = df.loc[:, columns_to_keep]

df.shape


#### 4.2 Renaming Columns

In [None]:
# Bring every column name into one naming convention.
# The standard year is 2023, so only columns that refer to a different year
# keep a year in their name.
column_renames = {
    'country': 'Country',
    'region': 'Region',
    'unMember': 'UnMember',
    'population_2024': 'Population_2024',
    'Population_growthRate': 'Population_GrowthRate',
    'population_densityMi': 'Population_Density',
    'Country_land_area': 'Land_Area',
    'BillionairesRichestBillionaire2023': 'Name',
    'BillionairesRichestNetWorth2023': 'NetWorth',
    'BillionairesPerMillionPeople2023': 'PerMillionPeople',
    'BillionairesTotalNetWorth2023': 'TotalWorth',
}
df = df.rename(columns=column_renames)

#### 4.3 Checking for Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row
repeated_rows = df.duplicated()
repeated_rows.sum()

#### 4.4 Checking for Null Values

In [None]:
# Missing values per column, expressed as a percentage of the total row count
null_counts = df.isna().sum()
null_counts / len(df) * 100

Because fewer than 10% of the values in each column are null, I will not drop any of the columns.

#### 4.5 Sorting the Data

In [None]:
# Order the rows from the largest NetWorth down to the smallest
df = df.sort_values(by="NetWorth", ascending=False)

## 5. Univariate Analysis

#### 5.1 Numerical Data

In [None]:
# Boxplot of the NetWorth column (x-axis limits are hard-coded)
sns.boxplot(data=df, x="NetWorth")
plt.xlim(0, 215)

# Rows holding the highest and the lowest NetWorth, shown together.
# Two separate bare lookups would only display the second one, because a cell
# renders nothing but its final expression - the max lookup was being lost.
net_worth_extremes = [df["NetWorth"].max(), df["NetWorth"].min()]
df.loc[df["NetWorth"].isin(net_worth_extremes)]

##### TODO: Interpret the NetWorth boxplot and its highest/lowest rows

In [None]:
# Boxplot of the TotalWorth column (x-axis limits are hard-coded)
sns.boxplot(data=df, x="TotalWorth")
plt.xlim(-100, 1680)

# Rows holding the highest and the lowest TotalWorth, shown together.
# Two separate bare lookups would only display the second one, because a cell
# renders nothing but its final expression - the max lookup was being lost.
# Several rows can share the minimum, so more than two rows may be listed.
total_worth_extremes = [df["TotalWorth"].max(), df["TotalWorth"].min()]
df.loc[df["TotalWorth"].isin(total_worth_extremes)]

##### TODO: Interpret the TotalWorth boxplot and its highest/lowest rows

Because multiple people have the same minimum worth, I will check to see how many people have the same TotalWorth values.

In [None]:
# Frequency of each TotalWorth value, most common value first
total_worth = df['TotalWorth']
total_worth.value_counts(ascending=False)


#### 5.2 Categorical Data

In [None]:
# How many rows each country has - a quick check of whether the column is useful
country_column = df['Country']
country_column.value_counts()

In [None]:
# How many rows each region has - a quick check of whether the column is useful
region_column = df['Region']
region_column.value_counts()

In [None]:
# Number of rows per region.
# Region is a categorical axis, so every category gets its own bar; a numeric
# bin width (previously binwidth=20) has no meaning here and is left out.
# NOTE(review): the bars count rows of df - confirm that one row really stands
# for one billionaire before reading this as "billionaires per region".
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="Region", discrete=True, ax=ax)
ax.set_title("Amount of Billionaires by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Count")
plt.show()

This graph shows the number of billionaires that live in each region. It shows that most of the billionaires live in Europe and Asia, and the fewest live in Oceania. However, it should be noted that there are most likely fewer people living in Oceania than in Europe.

In [None]:
# Frequency of each UnMember value
un_member_column = df['UnMember']
un_member_column.value_counts()

In [None]:
# Pie chart of the share of rows that are vs. are not UN members.
# The pie-chart recipe is adapted from Geeks for Geeks
# (https://www.geeksforgeeks.org/how-to-create-a-pie-chart-in-seaborn/).
# UnMember is categorical (True/False) data.

# Derive the slice sizes from the column itself instead of typing in numbers
# copied from an earlier value_counts() output, so the chart cannot go stale
# when the data changes. Comparing the lower-cased string form accepts both
# boolean values and "True"/"False" text (the stored dtype is not visible in
# this cell); missing values are left out, as value_counts() does by default.
un_flags = df['UnMember'].dropna().astype(str).str.strip().str.lower()
member_count = int((un_flags == 'true').sum())
data = [member_count, len(un_flags) - member_count]
keys = ['UnMember', 'Not a UnMember']
explode = [0, 0.1]  # pull the second slice slightly out of the pie
palette_color = sns.color_palette('bright')

# A square figure keeps the pie round and readable (the earlier 50-inch-wide
# canvas was far larger than the chart needs).
plt.figure(figsize=(8, 8))
plt.title("Percent of UnMembers")
plt.pie(data, labels=keys, colors=palette_color, explode=explode, autopct='%.0f%%')
plt.show()

This pie chart shows that the large majority of people are members of the UN. This share seems very high, but it should be kept in mind that I do not have much data.

#### 5.3 Multiple Plots

##### TODO: Confirm that the multi-plot grid below works as intended

In [None]:
# Distribution (histogram + KDE) of every numeric column, laid out two per row.

# Set Seaborn style
sns.set_style("darkgrid")

# Identify numerical columns
numerical_columns = df.select_dtypes("number").columns

# Two plots per row, so only half as many rows as there are columns are needed
# (rounded up). Asking for len(numerical_columns) rows x 2 columns left the
# bottom half of the figure empty.
n_grid_cols = 2
n_grid_rows = (len(numerical_columns) + n_grid_cols - 1) // n_grid_cols

# Skip plotting entirely when there is no numeric column: a zero-height figure
# is not valid.
if n_grid_rows:
    plt.figure(figsize=(14, n_grid_rows * 3))
    for idx, feature in enumerate(numerical_columns, 1):
        plt.subplot(n_grid_rows, n_grid_cols, idx)

        # Histogram with a kernel density overlay; skewness goes in the title
        sns.histplot(df[feature], kde=True)
        plt.title(f"{feature} | Skewness: {round(df[feature].skew(), 2)}")

    # Adjust layout and show plots
    plt.tight_layout()
    plt.show()

## 6. Multivariate Analysis

#### 6.1 Two Variables

In [None]:
# Two-dimensional histogram: Region on the x-axis, Population_Density on the y-axis
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title("Amount of Billionaires by Population Density")
sns.histplot(data=df, x="Region", y="Population_Density", binwidth=800, ax=ax)

# Interpreting Data

In [None]:
# Number of rows per region (same view as in the univariate section).
# Region is a categorical axis, so every category gets its own bar; a numeric
# bin width (previously binwidth=20) has no meaning here and is left out.
# NOTE(review): the bars count rows of df - confirm that one row really stands
# for one billionaire before reading this as "billionaires per region".
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="Region", discrete=True, ax=ax)
ax.set_title("Amount of Billionaires by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Count")
plt.show()

I can't get a true interpretation of this data without figuring out the average population per region.

In [None]:
# NOTE(review): disabled experiment. `graph` is not defined in any visible cell,
# so this line would fail if re-enabled as is - TODO point it at a defined frame or remove the cell.
# sns.scatterplot(data=graph, x="Region", y="Population_Density")

In [None]:
# TODO: divide the number of billionaires by population density to get a
# per-region figure (the original note said "millionaires" - confirm).
# That division is not implemented yet; this cell only lists the row count per region.
rows_per_region = df['Region'].value_counts()
rows_per_region

In [None]:
# Histogram of Population_Density using bins that are 800 units wide
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title("Amount of Billionaires by Population Density")
sns.histplot(data=df, x="Population_Density", binwidth=800, ax=ax)

Most of the billionaires live in places with large population densities. Population density is the number of people per square mile.

In [None]:
# Boxplot of NetWorth, drawn with seaborn (x-axis limits are hard-coded)
sns.boxplot(data=df, x="NetWorth")
plt.xlim(0, 215)

# Row(s) with the largest NetWorth - the same column the boxplot shows.
# The lookup used to reference a "Height_ft" column, which is not among the
# columns kept and renamed earlier in this notebook and therefore raised a KeyError.
df.loc[df["NetWorth"] == df["NetWorth"].max()]

### TODO: Interpret the NetWorth boxplot

In [None]:
# Boxplot of TotalWorth (x-axis limits are hard-coded).
# The column was renamed to "TotalWorth" (no underscore) in the renaming step;
# "Total_Worth" does not exist in df and made this cell fail.
sns.boxplot(data=df, x="TotalWorth")
plt.xlim(-100, 1680)

### TODO: Interpret the TotalWorth boxplot

In [None]:
# How often each NetWorth value occurs
net_worth = df['NetWorth']
net_worth.value_counts()

In [None]:
# Mean population density for every region
# (population density = number of people per square mile)
region_groups = df.groupby('Region')
region_groups[['Population_Density']].mean()


In [None]:
# Each region's share of the summed regional mean population densities.
# With the means that used to be typed in by hand, Asia made up about 64% of
# that sum.

# Compute the regional means here instead of pasting numbers from an earlier
# cell's output, so the chart always matches the data. groupby() sorts the
# region names, which keeps labels and values aligned; regions whose mean is
# missing are dropped because a pie slice cannot be NaN.
density_by_region = df.groupby('Region')['Population_Density'].mean().dropna()
data = density_by_region.tolist()
keys = density_by_region.index.tolist()

# A square figure keeps the pie round. The leftover "Height vs Speed" title was
# removed: it was immediately overwritten by the title below.
plt.figure(figsize=(8, 8))
plt.title("Population Density by Region")
plt.pie(data, labels=keys, autopct='%.0f%%')
plt.show()

In [None]:
# Kernel density estimate of NetWorth.
# The y="Name" argument was removed: 'Name' is the renamed
# BillionairesRichestBillionaire2023 column (presumably text - confirm), and a
# density estimate needs numeric axes, so a two-dimensional KDE against names
# cannot be computed.
sns.kdeplot(data=df, x="NetWorth")
