<a href="https://colab.research.google.com/github/mohannashahrad/Borealis_AI_Plant_Tree_Project/blob/main/Training/Final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing Libraries

In [74]:
# Import all the required libraries
import pandas as pd
import requests
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import numpy as np
from sklearn.linear_model import LinearRegression  
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer    
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import pyplot
import xgboost as xgb
import sys
!{sys.executable} -m pip install fbprophet
import fbprophet



#Defining Functions


In [75]:
# Functions
def load_DF(url):
  data = StringIO(requests.get(url).text)
  return pd.read_csv(data)

def standardize(df,col_names):
  print(type(df))
  df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]
  df.reset_index(inplace=True, drop=True)
  features = df[col_names]
  scaler = StandardScaler().fit(features.values)
  features = scaler.transform(features.values)
  df[col_names] = features
  return df

def oneHotEncode(df, discrete_columns):
  for var in discrete_columns:
    enc = OneHotEncoder(handle_unknown='ignore')
    enc_df = pd.DataFrame(enc.fit_transform(df[[var]]).toarray())
    enc_df.columns = enc.get_feature_names([var])
    df = df.join(enc_df)
    df = df.drop([var], axis=1)
  return df

#Loading the Dataset

In [76]:
# Loading DataSet
df = load_DF('https://raw.githubusercontent.com/mohannashahrad/Borealis_AI_Plant_Tree_Project/main/Final_Data/final_data2.csv')
df = df.iloc[: , 1:]
display(df.head())

Unnamed: 0,Time,Country Name,Land Area (m2),Agriculture Land (m2),Forest Land (m2),Population,Pop Growth (%),Urban Pop (%),GDP (US$),GDP Growth (%),Forest Rents (% GDP),Coal Rents (% GDP),Oil Rents (% GDP),CO2 Emission (kt),GHG Emision (CO2 eqv),Tree Loss (ha)
0,2001,Afghanistan,652860.0,377530.0,12084.4,21606992.0,3.902805,22.169,,,,,,810.0,13490.0,88.092712
1,2002,Afghanistan,652860.0,377530.0,12084.4,22600774.0,4.496719,22.261,4055180000.0,,0.958004,0.004341,0.029101,1100.0,16090.0,178.640364
2,2003,Afghanistan,652860.0,379100.0,12084.4,23680871.0,4.668344,22.353,4515559000.0,8.832278,0.664331,0.007422,0.026686,1350.0,16780.0,244.336255
3,2004,Afghanistan,652860.0,379110.0,12084.4,24726689.0,4.32156,22.5,5226779000.0,1.414118,0.387787,0.016455,0.025399,1130.0,16520.0,201.444959
4,2005,Afghanistan,652860.0,379100.0,12084.4,25654274.0,3.6827,22.703,6209138000.0,11.229715,0.332205,0.010904,0.025973,1640.0,17400.0,235.956834


#Preprocessing 

This section consists of data normalization and one-hot encoding for discrete features.

In [77]:
df['Tree Loss (ha)'] /= df['Forest Land (m2)']
df['GDP (US$)'] /= df['Population']
df['CO2 Emission (kt)'] /= df['Population']
df['GHG Emision (CO2 eqv)'] /= df['Population']
df['Population'] /= df['Land Area (m2)']
df['Forest Land (m2)'] /= df['Land Area (m2)']
df['Agriculture Land (m2)'] /= df['Land Area (m2)']

# df.drop('Land Area (m2)', axis=1, inplace=True)
df.head(30)

Unnamed: 0,Time,Country Name,Land Area (m2),Agriculture Land (m2),Forest Land (m2),Population,Pop Growth (%),Urban Pop (%),GDP (US$),GDP Growth (%),Forest Rents (% GDP),Coal Rents (% GDP),Oil Rents (% GDP),CO2 Emission (kt),GHG Emision (CO2 eqv),Tree Loss (ha)
0,2001,Afghanistan,652860.0,0.578271,0.01851,33.095904,3.902805,22.169,,,,,,3.7e-05,0.000624,0.00729
1,2002,Afghanistan,652860.0,0.578271,0.01851,34.618102,4.496719,22.261,179.426579,,0.958004,0.004341,0.029101,4.9e-05,0.000712,0.014783
2,2003,Afghanistan,652860.0,0.580676,0.01851,36.27251,4.668344,22.353,190.683814,8.832278,0.664331,0.007422,0.026686,5.7e-05,0.000709,0.020219
3,2004,Afghanistan,652860.0,0.580691,0.01851,37.874413,4.32156,22.5,211.382074,1.414118,0.387787,0.016455,0.025399,4.6e-05,0.000668,0.01667
4,2005,Afghanistan,652860.0,0.580676,0.01851,39.295215,3.6827,22.703,242.031313,11.229715,0.332205,0.010904,0.025973,6.4e-05,0.000678,0.019526
5,2006,Afghanistan,652860.0,0.580676,0.01851,40.488095,2.990524,22.907,263.733602,5.357403,0.454077,0.011152,0.022144,7.3e-05,0.000796,0.012585
6,2007,Afghanistan,652860.0,0.580676,0.01851,41.510495,2.49383,23.113,359.693158,13.82632,0.342749,0.076415,0.014696,8.7e-05,0.000918,0.020923
7,2008,Afghanistan,652860.0,0.580676,0.01851,42.462827,2.268273,23.32,364.660679,3.924984,0.353698,0.224465,0.016138,0.000158,0.001138,0.008888
8,2009,Afghanistan,652860.0,0.580676,0.01851,43.492948,2.396978,23.528,438.076142,21.390528,0.274778,0.126422,0.005458,0.000211,0.001302,0.005826
9,2010,Afghanistan,652860.0,0.580691,0.01851,44.704088,2.746615,23.737,543.302967,14.362441,0.358436,0.212188,0.004964,0.000297,0.001539,0.00726


In [78]:
col_names = ['Agriculture Land (m2)', 'Forest Land (m2)', 'Population', 'Pop Growth (%)', 'Urban Pop (%)',
                'GDP (US$)', 'GDP Growth (%)', 'Forest Rents (% GDP)', 'Coal Rents (% GDP)', 'Oil Rents (% GDP)', 'CO2 Emission (kt)',
                'GHG Emision (CO2 eqv)', 'Tree Loss (ha)']

df = standardize(df,col_names)
discrete_columns = ["Country Name"]
df = oneHotEncode(df,discrete_columns)
df.head(30)

<class 'pandas.core.frame.DataFrame'>




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Unnamed: 0,Time,Land Area (m2),Agriculture Land (m2),Forest Land (m2),Population,Pop Growth (%),Urban Pop (%),GDP (US$),GDP Growth (%),Forest Rents (% GDP),Coal Rents (% GDP),Oil Rents (% GDP),CO2 Emission (kt),GHG Emision (CO2 eqv),Tree Loss (ha),Country Name_Afghanistan,Country Name_Albania,Country Name_Algeria,Country Name_Angola,Country Name_Antigua and Barbuda,Country Name_Argentina,Country Name_Armenia,Country Name_Australia,Country Name_Austria,Country Name_Azerbaijan,"Country Name_Bahamas, The",Country Name_Bangladesh,Country Name_Barbados,Country Name_Belarus,Country Name_Belgium,Country Name_Belize,Country Name_Benin,Country Name_Bhutan,Country Name_Bolivia,Country Name_Bosnia and Herzegovina,Country Name_Botswana,Country Name_Brazil,Country Name_Brunei Darussalam,Country Name_Bulgaria,Country Name_Burkina Faso,...,Country Name_Senegal,Country Name_Serbia,Country Name_Seychelles,Country Name_Sierra Leone,Country Name_Singapore,Country Name_Slovak Republic,Country Name_Slovenia,Country Name_Solomon Islands,Country Name_South Africa,Country Name_South Sudan,Country Name_Spain,Country Name_Sri Lanka,Country Name_St. Kitts and Nevis,Country Name_St. Lucia,Country Name_St. Vincent and the Grenadines,Country Name_Sudan,Country Name_Suriname,Country Name_Sweden,Country Name_Switzerland,Country Name_Syrian Arab Republic,Country Name_Tajikistan,Country Name_Tanzania,Country Name_Thailand,Country Name_Timor-Leste,Country Name_Togo,Country Name_Trinidad and Tobago,Country Name_Tunisia,Country Name_Turkey,Country Name_Turkmenistan,Country Name_Uganda,Country Name_Ukraine,Country Name_United Kingdom,Country Name_United States,Country Name_Uruguay,Country Name_Uzbekistan,Country Name_Vanuatu,"Country Name_Venezuela, RB",Country Name_Vietnam,Country Name_Zambia,Country Name_Zimbabwe
0,2003,652860.0,0.871334,-1.444585,-0.238569,2.833507,-1.488437,-0.650035,0.9662,-0.325614,-0.195424,-0.367831,-0.928651,-0.929756,-0.404835,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2004,652860.0,0.871406,-1.444585,-0.235796,2.540205,-1.481744,-0.648822,-0.476629,-0.391587,-0.185539,-0.36798,-0.931341,-0.936252,-0.407862,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2005,652860.0,0.871334,-1.444585,-0.233336,1.999872,-1.472501,-0.647025,1.4325,-0.404847,-0.191614,-0.367914,-0.927006,-0.934624,-0.405426,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2006,652860.0,0.871334,-1.444585,-0.231271,1.414446,-1.463213,-0.645753,0.290338,-0.375773,-0.191342,-0.368358,-0.924755,-0.915736,-0.411344,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2007,652860.0,0.871334,-1.444585,-0.229501,0.994354,-1.453834,-0.640128,1.937539,-0.402332,-0.119929,-0.369223,-0.921499,-0.896147,-0.404235,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2008,652860.0,0.871334,-1.444585,-0.227852,0.803583,-1.44441,-0.639837,0.011733,-0.39972,0.042073,-0.369056,-0.904549,-0.860905,-0.414497,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2009,652860.0,0.871334,-1.444585,-0.226068,0.912439,-1.43494,-0.635534,3.408774,-0.418547,-0.065209,-0.370296,-0.891957,-0.834602,-0.417107,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2010,652860.0,0.871406,-1.444585,-0.223972,1.208153,-1.425424,-0.629366,2.041814,-0.398589,0.028639,-0.370353,-0.871562,-0.796556,-0.415885,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2011,652860.0,0.871334,-1.444585,-0.2215,1.543496,-1.415817,-0.626561,-0.668749,-0.410307,0.338492,-0.370161,-0.845401,-0.730997,-0.414105,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2012,652860.0,0.871334,-1.444585,-0.218732,1.767199,-1.406165,-0.623588,1.72864,-0.421154,0.092211,-0.358186,-0.862458,-0.699759,-0.419302,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Splitting the dataset into test and train sets

In [79]:
# Splitting the dataset into training and test parts
y = df['Tree Loss (ha)']
x = df.loc[:, df.columns != 'Tree Loss (ha)']
x = x[~x.isin([np.nan, np.inf, -np.inf]).any(1)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)

(2296, 184)
(574, 184)


# First Model: Linear Regression

In [80]:
model = LinearRegression()  
model.fit(X_train, y_train) 
y_pred = model.predict(X_test)
compare_df = pd.DataFrame({'Actual': y_test.to_numpy().flatten(), 'Predicted': y_pred.flatten()})
display(compare_df)

Unnamed: 0,Actual,Predicted
0,-0.204019,-0.161298
1,-0.204874,-0.431760
2,-0.312071,0.001086
3,0.011223,-0.102587
4,-0.236457,-0.218748
...,...,...
569,0.747812,0.418346
570,-0.112435,-0.162124
571,-0.072781,-0.385881
572,-0.363977,0.240037


In [83]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2 score:', np.sqrt(metrics.r2_score(y_pred,y_test)))

Mean Absolute Error: 0.22625231867019516
Mean Squared Error: 0.2093940249896431
Root Mean Squared Error: 0.45759591889531
R2 score: 0.5765734920442971


#Second Model: XGBoost

In [82]:
dtrain = xgb.DMatrix(data=X_train,label=y_train)
params = {
    'gamma':0,                 
    'learning_rate':0.07,
    'max_depth':5,
    'min_child_weight':1.5,
    'n_estimators':1000,                                                                    
    'reg_alpha':0.75,
    'reg_lambda':0.45,
    'subsample':0.6,
    'seed':42
}
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=999,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

KeyboardInterrupt: ignored

#Hyper parameter Tuning for XGBoost

In [None]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4,10)
    for min_child_weight in range(1,8)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

In [None]:
data_dmatrix = xgb.DMatrix(data=x,label=y)
xg_reg = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=8,
                 min_child_weight=3,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
xg_reg.fit(X_train,y_train)

y_pred = xg_reg.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
compare_df = pd.DataFrame({'Actual': y_test.to_numpy().flatten(), 'Predicted': y_pred.flatten()})
display(compare_df)

#Feature Importance using XGBoost

In [None]:
pyplot.bar(range(14), xg_reg.feature_importances_[:14])
LABELS = x.columns[:14]
pyplot.xticks(range(len(xg_reg.feature_importances_[:14])), LABELS, rotation='vertical')
pyplot.show()

#Third Model: LightGBM

In [None]:
import lightgbm as lgb

def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.power(np.log1p(y_true + 1) - np.log1p(y_pred + 1), 2)))
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmsle',
    'max_depth': 6, 
    'learning_rate': 0.1,
    'verbose': 0}
n_estimators = 100

n_iters = 5
preds_buf = []
err_buf = []
X_lgb = X_train
y_lgb = y_train
for i in range(n_iters): 
    x_train, x_valid, y_train, y_valid = train_test_split(X_lgb, y_lgb, test_size=0.10, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_valid, label=y_valid)
    watchlist = [d_valid]

    model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=1)

    preds = np.exp(model.predict(x_valid)) - 1
    err = rmsle(np.exp(y_valid) - 1, preds)
    err_buf.append(err)
    print('RMSLE = ' + str(err))
    
    preds = np.exp(model.predict(X_test)) - 1
    preds_buf.append(preds)

print('Mean RMSLE = ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))
preds = np.mean(preds_buf, axis=0)
compare_df = pd.DataFrame({'Actual': y_test.to_numpy().flatten(), 'Predicted': preds.flatten()})
display(compare_df)

#Feature Importance using LightGBM


In [None]:
lgb.plot_importance(model)