# 1. Data Preparation:

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the GEF project records from the CSV file (path is relative to the
# notebook's working directory).
data_regression = pd.read_csv('GEF_projects (1).csv')


In [3]:
# Clean the raw columns. Every result is assigned back to its column instead of
# calling fillna(..., inplace=True) on the selected column: that in-place form
# acts on an intermediate object, is deprecated in pandas 2.x, and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.

# Rows with no focal area get an explicit 'Unknown' category.
data_regression['Focal Areas'] = data_regression['Focal Areas'].fillna('Unknown')

# 'GEF Grant' holds text with comma separators; strip the commas literally
# (regex=False) and convert to float. Missing grants remain NaN after this step.
data_regression['GEF Grant'] = (
    data_regression['GEF Grant'].str.replace(',', '', regex=False).astype(float)
)

# Fill missing approval years with the column median.
data_regression['Approval FY'] = data_regression['Approval FY'].fillna(
    data_regression['Approval FY'].median()
)


In [4]:
# Replace every object-dtype column with integer label codes.
# Values are cast to str first, and a fresh encoder is fitted for each column.
categorical_columns = data_regression.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    column_as_text = data_regression[column_name].astype(str)
    data_regression[column_name] = LabelEncoder().fit_transform(column_as_text)


In [5]:
# The grant amount is the regression target; every other column is a feature.
target_column = 'GEF Grant'
y_regression = data_regression[target_column]
X_regression = data_regression.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(
    X_train_regression,
    X_test_regression,
    y_train_regression,
    y_test_regression,
) = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)


# 2. Model Training:

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


In [7]:
# Fit a linear regression baseline on the training split.
# NOTE: as the recorded output below shows, this call fails with
# "ValueError: Input y contains NaN." because missing 'GEF Grant' values
# have not been handled at this point in the notebook.
lr = LinearRegression()
lr.fit(X_train_regression, y_train_regression)


ValueError: Input y contains NaN.

In [8]:
# 1. Data Imputation:
# Fill missing 'GEF Grant' values with the median. The result is assigned back to
# the column instead of calling fillna(..., inplace=True) on the selected column,
# which is deprecated in pandas 2.x and has no effect on the DataFrame under
# Copy-on-Write.
# NOTE(review): this imputes the regression target itself, and the median is
# computed over all rows before the train/test split. Dropping rows with a
# missing target may be the safer choice - confirm before relying on the scores.
median_gef_grant = data_regression['GEF Grant'].median()
data_regression['GEF Grant'] = data_regression['GEF Grant'].fillna(median_gef_grant)

# Split the data into features and target
y_regression = data_regression['GEF Grant']
X_regression = data_regression.drop(columns=['GEF Grant'])

# Split the data into training and testing sets
# (train_test_split and LinearRegression are already imported by earlier cells,
# so they are not re-imported here.)
X_train_regression, X_test_regression, y_train_regression, y_test_regression = train_test_split(
    X_regression, y_regression, test_size=0.2, random_state=42)

# 2. Model Training:
# Train the Linear Regression model
lr = LinearRegression()
lr.fit(X_train_regression, y_train_regression)

# 3. Model Evaluation:
# R^2 on the training split and on the held-out test split
training_score = lr.score(X_train_regression, y_train_regression)
validation_score = lr.score(X_test_regression, y_test_regression)

print(f"Training R^2 Score: {training_score}")
print(f"Validation R^2 Score: {validation_score}")


Training R^2 Score: 0.0669199042868196
Validation R^2 Score: 0.06574545356887929


# 3. Model Evaluation:

In [9]:
# Score the linear model on the held-out test split: mean absolute error and R^2.
y_pred_lr = lr.predict(X_test_regression)
mae_lr, r2_lr = (
    mean_absolute_error(y_test_regression, y_pred_lr),
    r2_score(y_test_regression, y_pred_lr),
)
print(f"Linear Regression MAE: {mae_lr}, R^2 Score: {r2_lr}")


Linear Regression MAE: 3324605.427477188, R^2 Score: 0.06574545356887929


In [11]:
# Fit a 100-tree random forest on the training split; the fixed seed makes the
# fitted forest repeatable.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X=X_train_regression, y=y_train_regression)


In [12]:
# Score the random forest on the held-out test split: mean absolute error and R^2.
y_pred_rf = rf.predict(X_test_regression)
mae_rf = mean_absolute_error(y_true=y_test_regression, y_pred=y_pred_rf)
r2_rf = r2_score(y_true=y_test_regression, y_pred=y_pred_rf)
print(f"Random Forest MAE: {mae_rf}, R^2 Score: {r2_rf}")


Random Forest MAE: 2100991.737364341, R^2 Score: 0.19015212451769548


# 4. Random Forest:

In [13]:
# Re-loading and preprocessing the data
data_regression = pd.read_csv('GEF_projects (1).csv')

# Handling missing values.
# Each result is assigned back to its column instead of calling
# fillna(..., inplace=True) on the selected column: that form is deprecated in
# pandas 2.x and leaves the DataFrame unchanged under Copy-on-Write.
data_regression['Focal Areas'] = data_regression['Focal Areas'].fillna('Unknown')
# Strip the comma separators literally (regex=False) before converting to float.
data_regression['GEF Grant'] = (
    data_regression['GEF Grant'].str.replace(',', '', regex=False).astype(float)
)
# NOTE(review): this fills the regression target itself with its median.
data_regression['GEF Grant'] = data_regression['GEF Grant'].fillna(
    data_regression['GEF Grant'].median()
)
data_regression['Approval FY'] = data_regression['Approval FY'].fillna(
    data_regression['Approval FY'].median()
)

# Encoding categorical variables: each object-dtype column is cast to str and
# replaced by the label codes of an encoder fitted on that column.
for col in data_regression.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_regression[col] = le.fit_transform(data_regression[col].astype(str))

# Splitting the data into features and target, and then into training and testing sets
y_regression = data_regression['GEF Grant']
X_regression = data_regression.drop(columns=['GEF Grant'])
X_train_regression, X_test_regression, y_train_regression, y_test_regression = train_test_split(
    X_regression, y_regression, test_size=0.2, random_state=42)

# Training the Random Forest model again
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_regression, y_train_regression)

# Ensuring the model has been trained correctly.
# This is the R^2 on the training split only; it does not measure performance
# on the held-out test split.
training_score_rf = rf.score(X_train_regression, y_train_regression)
training_score_rf


0.8910447352866171

We successfully trained the random forest model. It achieved an R² score of approximately 0.891 (89.1%) on the training data.

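This means that the model accounts for 89.1% of the variance in the GEF Grant amounts of the training data. Note that this is a training-set score only: the same random forest configuration scored an R² of only about 0.19 on the held-out test set above, so the high training score largely reflects overfitting rather than predictive accuracy.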

Next, I will use the trained model to predict the GEF Grant amount for a sample project drawn from the test set.

In [14]:
# Draw one row from the test features (fixed seed, so the same row every run).
sample_features = X_test_regression.sample(n=1, random_state=42)

# Ask the trained random forest for that row's GEF Grant amount.
predicted_gef_grant = rf.predict(sample_features)

# Show the sampled features with the prediction appended as a final column.
sample_data = sample_features.assign(**{'Predicted GEF Grant': predicted_gef_grant})
sample_data


Unnamed: 0,Title,ID,Countries,Focal Areas,Type,Agencies,Cofinancing,Status,Approval FY,Funding Source (indexed field),Non-Grant Instrument (indexed field),Capacity-building Initiative for Transparency,GEF Period,Predicted GEF Grant
3936,381,2754,143,0,1,55,3526,1,2007.0,1,0,0,2,11974590.3


In [15]:
# Summary statistics of the grant column, displayed next to the single prediction.
# Note: at this point the 'GEF Grant' column includes the median-filled values
# from the preprocessing cell above.
gef_grant_column = data_regression['GEF Grant']
gef_grant_distribution = gef_grant_column.describe()

(gef_grant_distribution, predicted_gef_grant[0])


(count    5.801000e+03
 mean     3.985297e+06
 std      7.794360e+06
 min      0.000000e+00
 25%      6.658180e+05
 50%      2.000000e+06
 75%      5.154741e+06
 max      3.064390e+08
 Name: GEF Grant, dtype: float64,
 11974590.3)

The distribution of GEF Grant amounts in the dataset is as follows:

- Mean: approximately $3,985,297 (about $3.99 million)
- Median (50%): approximately $2,000,000 (about $2 million)
- Third quartile (75%): approximately $5,154,741 (about $5.15 million)
- Maximum: approximately $306,439,000 (about $306.4 million)

The predicted GEF Grant amount for the sampled project is $11,974,590.30 (about $11.97 million).

The predicted amount is higher than the mean, the median, and the third quartile of the dataset, which places it within the top 25% of grant amounts. However, it is far lower than the maximum amount.

In conclusion, relative to the distribution of GEF Grant amounts in the dataset, the predicted amount falls in the top 25% range.

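Through this machine learning process, we obtained a prediction that an amount in the top 25% of grants would be needed to 'complete' the sampled project. I think this can be used as supporting evidence for the importance of increasing funding investment in climate change projects, although it is a single prediction from a model whose test-set R² was only about 0.19, so it should be treated as illustrative rather than conclusive.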