In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Load the dataset; the relative path means 'pip_dataset.csv' must be in the working directory
data = pd.read_csv('pip_dataset.csv')


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Preview the first rows of the DataFrame (head() returns 5 rows by default)
data.head()

In [None]:
# Check the dimensions of the DataFrame as a (rows, columns) tuple
data.shape

In [None]:
# Get a statistical summary (count, mean, std, min, quartiles, max) of the numerical columns
data.describe()

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and data types
data.info()

In [None]:
# Keep only observations from 1990 onwards that come from the 2017 version
from_1990 = data['year'] >= 1990
ppp_2017 = data['ppp_version'] == 2017
data_1990_onwards = data[from_1990 & ppp_2017]

## Split the filtered rows into the aggregate 'World' rows and everything else
country_col = data_1990_onwards['country']
data_countries = data_1990_onwards[country_col != 'World']
data_world = data_1990_onwards[country_col == 'World']

# Yearly averages: Gini from the non-World rows, poverty headcount ratio from the World rows
gini_per_year = data_countries.groupby('year')['gini']
mean_gini_by_year = gini_per_year.mean().reset_index()
poverty_per_year = data_world.groupby('year')['headcount_ratio_international_povline']
mean_poverty_by_year = poverty_per_year.mean().reset_index()

# One row, two columns of subplots for the side-by-side charts
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Helper: draw a series together with a fitted linear trendline
def plot_trendline(x, y, ax, title):
    """Plot ``y`` against ``x`` on ``ax`` and overlay a linear-regression trendline.

    Parameters
    ----------
    x : object exposing ``.values`` (the calls in this notebook pass a pandas Series)
        Horizontal-axis values; also the single regression feature.
    y : array-like
        Vertical-axis values; the regression target.
    ax : matplotlib Axes
        Axes to draw on.
    title : str
        Legend label for the data line.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    # scikit-learn expects a 2-D feature matrix, so turn x into a single column
    features = x.values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(features, y)
    trend = model.predict(features)

    # Data line first, then the dashed red trendline on top of it
    ax.plot(x, y, label=title)
    ax.plot(x, trend, color='red', linestyle='--', label='Trendline')

# Describe each panel once: (axes, x values, y values, legend/y-axis label, panel title).
# The first subplot shows the Mean Gini Coefficient, the second the world poverty level.
panels = [
    (
        axes[0],
        mean_gini_by_year['year'],
        mean_gini_by_year['gini'],
        'Mean Gini Coefficient',
        'Global Income Inequality (Mean Gini Coefficient) with Trendline',
    ),
    (
        axes[1],
        mean_poverty_by_year['year'],
        mean_poverty_by_year['headcount_ratio_international_povline'],
        'Mean Poverty Level',
        'Global Poverty Level with Trendline',
    ),
]

# Draw every panel with the same sequence of calls
for panel_ax, years, values, series_label, panel_title in panels:
    plot_trendline(years, values, panel_ax, series_label)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel(series_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True)

# Display the charts
plt.show()


In [None]:
# Filter the country data for the first and the last year.
# Gini coefficients are only available on the country level, so this uses data_countries
# (the aggregate 'World' rows excluded) -- the same frame the trend chart averages -- and
# takes the first/last year from that frame rather than from the combined data.
country_years = data_countries['year']
data_1990_extremes = data_countries[country_years.isin([country_years.min(), country_years.max()])]

# Calculate the mean Gini coefficient for both years
gini_by_year = data_1990_extremes.groupby('year')['gini'].mean().reset_index()

# Filter the world data for the first and the last year
world_years = data_world['year']
data_1990_world_extremes = data_world[world_years.isin([world_years.min(), world_years.max()])]

# Calculate the global mean poverty level (headcount_ratio_international_povline).
# NOTE: this rebinds mean_poverty_by_year, which the plotting cell also assigns (there it holds
# one row per year); after this cell runs it holds only the first/last-year rows.
mean_poverty_by_year = data_1990_world_extremes.groupby('year')['headcount_ratio_international_povline'].mean().reset_index()

# Label the rows of both summaries. The labels assume exactly two distinct years, so fail with
# a clear message instead of pandas' generic length-mismatch error when that does not hold
# (for example when a frame is empty or covers a single year).
year_labels = ['First Year', 'Last Year']
for summary_name, summary in (('gini_by_year', gini_by_year), ('mean_poverty_by_year', mean_poverty_by_year)):
    if len(summary) != len(year_labels):
        raise ValueError(
            f"{summary_name} has {len(summary)} row(s); expected {len(year_labels)} "
            "(one for the first year and one for the last year)"
        )
    summary.index = pd.Index(year_labels, name='Year')

# Display the dataframes with labels
print("Mean Gini Coefficient:")
print(gini_by_year)
print("\nMean Poverty Level:")
print(mean_poverty_by_year)
