## **Ecological Footprint Analysis**

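> Exploratory analysis of the Global Ecological Footprint dataset: data cleaning, comparisons across countries, regions and income groups, and simple regression models relating ecological footprint to development indicators.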



### **Data Cleaning**

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below is readable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime;
# outside Colab this cell would need to be skipped and the file path adjusted.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the dataset CSV inside the mounted Google Drive.
# Adjacent string literals are joined by Python into one path string.
file_path = (
    '/content/drive/MyDrive/Colab Notebooks/'
    'Ecological Footprint Analysis/'
    'GlobalEcologicalFootprint.csv'
)

In [None]:
# Read the CSV into the notebook-wide `data` frame; the file is decoded as latin-1.
data=pd.read_csv(file_path, encoding='latin-1')
# Last expression of the cell: renders a preview of the first rows.
data.head()

In [None]:
# info() prints column names, dtypes and non-null counts directly.
data.info()
# Last expression of the cell: summary statistics are rendered as the cell output.
data.describe()

In [None]:
# Normalise two column names: drop the trailing space in 'Total biocapacity '
# and correct the 'Life Exectancy' spelling. Renaming is a no-op on re-run.
data = data.rename(columns={
    'Total biocapacity ': 'Total biocapacity',
    'Life Exectancy': 'Life Expectancy',
})

# Strip '$' and ',' so 'Per Capita GDP' can be parsed as a number.
# regex=False is required: pandas versions before 2.0 treat the pattern as a
# regular expression by default, where '$' is the end-of-string anchor and
# therefore removes nothing.
# astype(str) keeps the cell re-runnable once the column is already numeric.
gdp_text = data['Per Capita GDP'].astype(str)
for symbol in ('$', ','):
    gdp_text = gdp_text.str.replace(symbol, '', regex=False)

# Perform the numeric conversion here so that the missing-value handling below
# sees a numeric column (mean imputation) instead of strings (mode imputation).
# Entries that cannot be parsed become NaN.
data['Per Capita GDP'] = pd.to_numeric(gdp_text, errors='coerce')
data['Per Capita GDP']

In [None]:
# Number of null entries in each column of the dataset
missing_values = data.isnull().sum()

# Keep only the columns that contain at least one null, then print their counts
missing_columns = missing_values.loc[missing_values.gt(0)]
print(missing_columns)


### **Handling Missing Values**

In [None]:
# Missing-data handling: drop rows that are mostly empty, then impute the rest
# (mean for numeric columns, mode for text columns).

# Fraction of missing cells in each row
row_missing_percentage = data.isnull().mean(axis=1)

# Rows with more than this fraction of missing cells are removed
threshold = 0.5

# .copy() yields an independent frame, so the assignments below modify `data`
# itself and do not trigger SettingWithCopyWarning on a filtered view.
data = data.loc[row_missing_percentage <= threshold].copy()

# Numeric columns: fill gaps with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on
# data[col]; that chained form acts on a temporary object and does not update
# `data` when pandas Copy-on-Write is active.
numerical_cols = data.select_dtypes(include=np.number).columns
for col in numerical_cols:
    data[col] = data[col].fillna(data[col].mean())

# Text columns: fill gaps with the most frequent value.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = data[col].mode()
    if not modes.empty:  # a column with no non-null values has no mode to use
        data[col] = data[col].fillna(modes.iloc[0])

# Per-column null counts after handling, for verification
missing_values_after_handling = data.isnull().sum()
print(missing_values_after_handling)

In [None]:
# Columns that should hold numbers but may have been read as text
columns_to_convert = [
    'SDGi',
    'Life Expectancy',
    'HDI',
    'Population (millions)',
    'Per Capita GDP',
]

# Parse all listed columns in one pass. errors='coerce' turns any entry that
# cannot be parsed into NaN, so NaNs can reappear here after the imputation
# cell above has already run.
data[columns_to_convert] = data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [None]:
# Re-inspect the frame after cleaning. info() prints its own report.
data.info()

# A notebook cell renders only its final expression, so the intermediate
# results are passed to display() (provided by the IPython kernel); as bare
# expressions they would be computed and silently discarded.
display(data.describe())
display(data['Income Group'].value_counts())

# Final expression: preview of the cleaned rows
data.head()

### **Visualize the pairwise relationships between Ecological Footprint factors**

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the individual footprint components.
footprint_components = [
    'Cropland Footprint',
    'Grazing Footprint',
    'Forest Product Footprint',
    'Carbon Footprint',
    'Fish Footprint',
]

# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: that call only produced an extra empty figure.
# plt and sns come from the import cell at the top of the notebook.
sns.pairplot(data[footprint_components])
plt.show()

# **Ecological Footprint Analysis:**
**Objective:** Compare ecological footprints across countries and analyze the factors contributing to high or low footprints.


In [None]:

# Horizontal bar chart of the 20 countries with the largest total footprint.
footprint_col = 'Total Ecological Footprint (Consumption)'
top_20 = data.sort_values(footprint_col, ascending=False).head(20)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_20, x=footprint_col, y='Country', ax=ax)
ax.set_title('Top 20 Countries by Total Ecological Footprint')
ax.set_xlabel('Ecological Footprint (Consumption)')
plt.show()

In [None]:
# Distribution of the total footprint within each income group
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Income Group', y='Total Ecological Footprint (Consumption)', ax=ax)
ax.set(
    title='Ecological Footprint by Income Group',
    xlabel='Income Group',
    ylabel='Total Ecological Footprint (Consumption)',
)
plt.show()


In [None]:
# Distribution of the total footprint within each region
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data, x='Region', y='Total Ecological Footprint (Consumption)', ax=ax)
ax.set_title('Ecological Footprint by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Total Ecological Footprint (Consumption)')
# Tilt the region names by 45 degrees
plt.xticks(rotation=45)
plt.show()


In [None]:
# Pairwise correlations between the footprint and three economic/development indicators
economic_cols = [
    'Total Ecological Footprint (Consumption)',
    'Per Capita GDP',
    'Population (millions)',
    'HDI',
]
corr_matrix = data[economic_cols].corr()

# Annotated heatmap of the matrix, coefficients shown with two decimals
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Between Ecological Footprint and Economic Indicators')
plt.show()


In [None]:
# Columns entering the wider correlation analysis
numeric_columns = [
    'SDGi',
    'Life Expectancy',
    'HDI',
    'Per Capita GDP',
    'Population (millions)',
    'Total Ecological Footprint (Consumption)',
    'Ecological (Deficit) or Reserve',
]

# Pairwise correlation matrix over those columns
correlation_matrix = data.loc[:, numeric_columns].corr()

# Annotated heatmap; sns and plt come from the import cell at the top of the notebook
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix for Ecological Footprint and Other Factors')
plt.show()


In [None]:
# One scatter plot per indicator, each against the total footprint.
# Every entry is (x column, figure title, x-axis label).
footprint_col = 'Total Ecological Footprint (Consumption)'
scatter_specs = [
    ('Per Capita GDP', 'Ecological Footprint vs. Per Capita GDP', 'Per Capita GDP'),
    ('HDI', 'Ecological Footprint vs. HDI', 'Human Development Index (HDI)'),
    ('Population (millions)', 'Ecological Footprint vs. Population', 'Population (millions)'),
]

for x_col, title, x_label in scatter_specs:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=data, x=x_col, y=footprint_col, ax=ax)
    ax.set(title=title, xlabel=x_label, ylabel='Total Ecological Footprint')
    plt.show()


In [None]:
# Flag footprint outliers with the 1.5 * IQR rule and print the affected countries.
footprint = data['Total Ecological Footprint (Consumption)']
Q1, Q3 = footprint.quantile(0.25), footprint.quantile(0.75)
IQR = Q3 - Q1

# Values strictly outside these fences count as outliers
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = data[footprint.lt(lower_fence) | footprint.gt(upper_fence)]

report_cols = [
    'Country',
    'Region',
    'Total Ecological Footprint (Consumption)',
    'Per Capita GDP',
    'HDI',
    'Population (millions)',
]
print(outliers[report_cols])


# **Interpretation:**

Based on the visualizations and analysis:

* Which countries or regions have the highest and lowest ecological footprints?

* How do population size and GDP relate to ecological footprint?




In [None]:
# Correlation coefficient between the SDG index and the total footprint
sdgi_values = data['SDGi']
footprint_values = data['Total Ecological Footprint (Consumption)']
correlation_sdgi_footprint = sdgi_values.corr(footprint_values)
print(f"Correlation between SDGi and Total Ecological Footprint: {correlation_sdgi_footprint}")


In [None]:
# Scatter plot of the SDG index against the total footprint.
# sns and plt come from the import cell at the top of the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='SDGi', y='Total Ecological Footprint (Consumption)', ax=ax)
ax.set(
    title='SDGi vs. Total Ecological Footprint',
    xlabel='SDGi (Sustainable Development Goals Index)',
    ylabel='Total Ecological Footprint',
)
plt.show()


In [None]:
# Simple linear regression: total footprint explained by the SDG index.
# train_test_split and LinearRegression come from the notebook's import cell.
from sklearn.impute import SimpleImputer  # not part of the top import cell

# Feature matrix (single column) and target
X = data[['SDGi']]
y = data['Total Ecological Footprint (Consumption)']

# Split first, with a fixed seed for a reproducible 80/20 partition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the mean imputer on the training rows only and reuse it for the test
# rows. Fitting it on all rows before the split would let test-set values
# influence the fill value; the later regression cells also fit on the
# training split.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# R-squared on the held-out rows
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared}")

# Fitted parameters
print(f"Intercept: {model.intercept_}")
print(f"Coefficient for SDGi: {model.coef_[0]}")

This section explores the trade-off between economic growth (represented by Per Capita GDP) and environmental sustainability (represented by the Ecological Footprint and Biocapacity).

In [None]:
# Pairwise correlations between GDP, total footprint and biocapacity
gdp_env_cols = [
    'Per Capita GDP',
    'Total Ecological Footprint (Consumption)',
    'Total biocapacity',
]
correlation_gdp_env = data.loc[:, gdp_env_cols].corr()

# Print the matrix as plain text
print(correlation_gdp_env)


In [None]:
# GDP against total footprint, points coloured by region
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Per Capita GDP',
    y='Total Ecological Footprint (Consumption)',
    hue='Region',
    ax=ax,
)
ax.set_title('Per Capita GDP vs Total Ecological Footprint')
ax.set_xlabel('Per Capita GDP')
ax.set_ylabel('Total Ecological Footprint (Consumption)')
plt.show()


In [None]:
# GDP against total biocapacity, points coloured by region
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Per Capita GDP', y='Total biocapacity', hue='Region', ax=ax)
ax.set_title('Per Capita GDP vs Total Biocapacity')
ax.set_xlabel('Per Capita GDP')
ax.set_ylabel('Total Biocapacity')
plt.show()


In [None]:
# Total footprint against total biocapacity, points coloured by region
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='Total Ecological Footprint (Consumption)',
    y='Total biocapacity',
    hue='Region',
    ax=ax,
)
ax.set_title('Ecological Footprint vs Biocapacity')
ax.set_xlabel('Total Ecological Footprint (Consumption)')
ax.set_ylabel('Total Biocapacity')
plt.show()


In [None]:
def _min_max(series):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)


# Add min-max scaled copies of the three measures so they share a 0-1 scale
data['Normalized GDP'] = _min_max(data['Per Capita GDP'])
data['Normalized Footprint'] = _min_max(data['Total Ecological Footprint (Consumption)'])
data['Normalized Biocapacity'] = _min_max(data['Total biocapacity'])

# One line per measure across all countries: (column, legend label, marker)
line_specs = (
    ('Normalized GDP', 'Per Capita GDP', 'o'),
    ('Normalized Footprint', 'Ecological Footprint', 'x'),
    ('Normalized Biocapacity', 'Biocapacity', 's'),
)

fig, ax = plt.subplots(figsize=(28, 12))
for column, label, marker in line_specs:
    ax.plot(data['Country'], data[column], label=label, marker=marker)

# Country names are drawn vertically
plt.xticks(rotation=90)
ax.set_title('Comparison of Per Capita GDP, Ecological Footprint, and Biocapacity Across Countries')
ax.set_xlabel('Country')
ax.set_ylabel('Normalized Values')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# GDP vs. footprint scatter plots in small multiples: first one panel per
# region (3 per row), then one panel per income group (2 per row).
for facet_col, panels_per_row in (("Region", 3), ("Income Group", 2)):
    g = sns.FacetGrid(data, col=facet_col, col_wrap=panels_per_row, height=4)
    g.map(sns.scatterplot, "Per Capita GDP", "Total Ecological Footprint (Consumption)")
    g.add_legend()
    plt.show()


Sustainability Performance: Evaluate countries in terms of their ecological deficit or reserve and correlate it with development indicators like HDI, GDP, and life expectancy.

In [None]:
# Pairwise correlations between the ecological deficit/reserve and three development indicators
sustainability_cols = [
    'Ecological (Deficit) or Reserve',
    'HDI',
    'Per Capita GDP',
    'Life Expectancy',
]
correlation_sustainability = data.loc[:, sustainability_cols].corr()

# Print the matrix as plain text
print(correlation_sustainability)

In [None]:
# Ecological deficit/reserve (x) against HDI (y)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Ecological (Deficit) or Reserve', y='HDI', ax=ax)
ax.set(
    title='Ecological Deficit/Reserve vs HDI',
    xlabel='Ecological Deficit/Reserve',
    ylabel='HDI',
)
plt.show()


In [None]:
# Ecological deficit/reserve (x) against per-capita GDP (y)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Ecological (Deficit) or Reserve', y='Per Capita GDP', ax=ax)
ax.set(
    title='Ecological Deficit/Reserve vs Per Capita GDP',
    xlabel='Ecological Deficit/Reserve',
    ylabel='Per Capita GDP',
)
plt.show()


In [None]:
# Ecological deficit/reserve (x) against life expectancy (y)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Ecological (Deficit) or Reserve', y='Life Expectancy', ax=ax)
ax.set(
    title='Ecological Deficit/Reserve vs Life Expectancy',
    xlabel='Ecological Deficit/Reserve',
    ylabel='Life Expectancy',
)
plt.show()


In [None]:
# Small-multiple scatter plots of the ecological deficit/reserve.
# Each entry is (facet column, panels per row, y variable):
# deficit vs HDI by region, then deficit vs GDP by income group.
facet_specs = (
    ("Region", 3, "HDI"),
    ("Income Group", 2, "Per Capita GDP"),
)
for facet_col, panels_per_row, y_col in facet_specs:
    g = sns.FacetGrid(data, col=facet_col, col_wrap=panels_per_row, height=4)
    g.map(sns.scatterplot, "Ecological (Deficit) or Reserve", y_col)
    g.add_legend()
    plt.show()


In [None]:
# Horizontal bar chart of every country, ordered from largest reserve to largest deficit
deficit_col = 'Ecological (Deficit) or Reserve'
data_sorted = data.sort_values(by=deficit_col, ascending=False)

# Tall figure so that one bar per country stays legible
fig, ax = plt.subplots(figsize=(10, 25))
sns.barplot(data=data_sorted, x=deficit_col, y='Country', palette='coolwarm', ax=ax)
ax.set(
    title='Ranking of Countries by Ecological Deficit or Reserve',
    xlabel='Ecological Deficit or Reserve',
    ylabel='Country',
)
fig.tight_layout()
plt.show()


In [None]:
# Baseline linear regression of the ecological deficit/reserve on three
# development indicators, fitted on complete cases.
# LinearRegression and train_test_split come from the notebook's import cell.
feature_cols = ['HDI', 'Per Capita GDP', 'Life Expectancy']
target_col = 'Ecological (Deficit) or Reserve'

# LinearRegression.fit raises ValueError when the inputs contain NaN, and the
# numeric coercion earlier in the notebook can leave NaNs in these columns.
# Rows with a missing feature or target are therefore excluded here; when no
# values are missing this selects every row.
complete_cases = data.dropna(subset=feature_cols + [target_col])
X = complete_cases[feature_cols]
y = complete_cases[target_col]

# Reproducible 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# R-squared on the held-out rows
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared}")

# Fitted parameters
coefficients = model.coef_
intercept = model.intercept_
print(f"Intercept: {intercept}")
print(f"Coefficients: {coefficients}")


In [None]:
# Linear regression of the ecological deficit/reserve on three development
# indicators, with missing feature values replaced by training-set means.
# LinearRegression and train_test_split come from the notebook's import cell.
from sklearn.impute import SimpleImputer

feature_cols = ['HDI', 'Per Capita GDP', 'Life Expectancy']
X = data[feature_cols]
y = data['Ecological (Deficit) or Reserve']

# Reproducible 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The imputer learns column means from the training rows and applies the same
# means to the test rows
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Fit, then score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared}")

# Fitted parameters
coefficients, intercept = model.coef_, model.intercept_
print(f"Intercept: {intercept}")
print(f"Coefficients: {coefficients}")

In [None]:
# Degree-2 polynomial regression on the X / y left by the preceding
# regression cell, fitted on complete rows.
from sklearn.preprocessing import PolynomialFeatures

# PolynomialFeatures and LinearRegression both raise ValueError on NaN input,
# so rows with a missing feature or target are masked out first. When nothing
# is missing the mask keeps every row. X and y themselves are left untouched.
X_values = np.asarray(X, dtype=float)
y_values = np.asarray(y, dtype=float)
complete_rows = ~(np.isnan(X_values).any(axis=1) | np.isnan(y_values))
y_complete = y_values[complete_rows]

# Polynomial Features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_values[complete_rows])

# Train Polynomial Regression model
poly_model = LinearRegression()
poly_model.fit(X_poly, y_complete)

# Predictions for the rows the model was fitted on
y_poly_pred = poly_model.predict(X_poly)

# R-squared on those same rows (in-sample; there is no held-out split here)
r_squared_poly = poly_model.score(X_poly, y_complete)
print(f"R-squared (Polynomial Regression): {r_squared_poly}")


In [None]:
# Degree-2 polynomial regression on mean-imputed features.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

# Replace NaNs in the feature matrix X (left by an earlier cell) with column
# means. X is overwritten with the imputed array; the imputer is fitted on
# all rows of X, as no train/test split is made in this cell.
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Expand the features to all polynomial terms up to degree 2
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Fit on the expanded features and predict for the same rows
poly_model = LinearRegression().fit(X_poly, y)
y_poly_pred = poly_model.predict(X_poly)

# R-squared on the rows used for fitting (in-sample)
r_squared_poly = poly_model.score(X_poly, y)
print(f"R-squared (Polynomial Regression): {r_squared_poly}")

In [None]:
# Pairwise correlations between life expectancy, GDP and the total footprint
life_income_cols = [
    'Life Expectancy',
    'Per Capita GDP',
    'Total Ecological Footprint (Consumption)',
]
correlation_life_income_footprint = data.loc[:, life_income_cols].corr()

# Print the matrix as plain text
print(correlation_life_income_footprint)


In [None]:
# Three scatter plots relating life expectancy, GDP and the total footprint.
# Each entry is (x column, y column, figure title); the axis labels repeat the
# column names.
footprint_col = 'Total Ecological Footprint (Consumption)'
pair_specs = [
    ('Life Expectancy', footprint_col, 'Life Expectancy vs Ecological Footprint'),
    ('Per Capita GDP', footprint_col, 'Per Capita GDP vs Ecological Footprint'),
    ('Life Expectancy', 'Per Capita GDP', 'Life Expectancy vs Per Capita GDP'),
]

for x_col, y_col, title in pair_specs:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=data, x=x_col, y=y_col, ax=ax)
    ax.set(title=title, xlabel=x_col, ylabel=y_col)
    plt.show()


In [None]:
# Small-multiple scatter plots against the total footprint.
# Each entry is (facet column, panels per row, x variable):
# life expectancy by region, then GDP by income group.
facet_specs = (
    ("Region", 3, "Life Expectancy"),
    ("Income Group", 2, "Per Capita GDP"),
)
for facet_col, panels_per_row, x_col in facet_specs:
    g = sns.FacetGrid(data, col=facet_col, col_wrap=panels_per_row, height=4)
    g.map(sns.scatterplot, x_col, "Total Ecological Footprint (Consumption)")
    g.add_legend()
    plt.show()

# Final expression: renders the data frame as the cell output
data

In [None]:
# Baseline linear regression of the total footprint on life expectancy and
# per-capita GDP, fitted on complete cases.
# LinearRegression and train_test_split come from the notebook's import cell.
feature_cols = ['Life Expectancy', 'Per Capita GDP']
target_col = 'Total Ecological Footprint (Consumption)'

# LinearRegression.fit raises ValueError when the inputs contain NaN, and the
# numeric coercion earlier in the notebook can leave NaNs in these columns.
# Rows with a missing feature or target are therefore excluded here; when no
# values are missing this selects every row.
complete_cases = data.dropna(subset=feature_cols + [target_col])
X = complete_cases[feature_cols]
y = complete_cases[target_col]

# Reproducible 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# R-squared on the held-out rows
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared}")

# Fitted parameters
coefficients = model.coef_
intercept = model.intercept_
print(f"Intercept: {intercept}")
print(f"Coefficients (Life Expectancy, Per Capita GDP): {coefficients}")


In [None]:
# Linear regression of the total footprint on life expectancy and per-capita
# GDP, with missing feature values replaced by training-set means.
# LinearRegression and train_test_split come from the notebook's import cell.
from sklearn.impute import SimpleImputer

feature_cols = ['Life Expectancy', 'Per Capita GDP']
X = data[feature_cols]
y = data['Total Ecological Footprint (Consumption)']

# Reproducible 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The imputer learns column means from the training rows and applies the same
# means to the test rows
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Fit, then score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
r_squared = model.score(X_test, y_test)
print(f"R-squared value: {r_squared}")

# Fitted parameters
coefficients, intercept = model.coef_, model.intercept_
print(f"Intercept: {intercept}")
print(f"Coefficients (Life Expectancy, Per Capita GDP): {coefficients}")

In [None]:
# Filled kernel-density curves for three variables on one shared axis.
# Each entry is (column, legend label).
density_specs = (
    ('Life Expectancy', 'Life Expectancy'),
    ('Per Capita GDP', 'Per Capita GDP'),
    ('Total Ecological Footprint (Consumption)', 'Ecological Footprint'),
)

fig, ax = plt.subplots(figsize=(8, 6))
for column, label in density_specs:
    sns.kdeplot(data[column], label=label, fill=True, ax=ax)

ax.set_title('Density Plot for Life Expectancy, Per Capita GDP, and Ecological Footprint')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.legend()
plt.show()


In [None]:
# One histogram with a KDE overlay per float64/int64 column; nulls are dropped before plotting
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.histplot(data[col].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()