In [None]:
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.datasets import make_blobs
import regex as re

In [None]:
# Read the raw "countries of the world" CSV from the Kaggle input directory.
df = pd.read_csv(
    '../input/countries-of-the-world/countries of the world.csv'
)

In [None]:
# Replace the CSV headers with short identifier-style names.
# The assignment is positional: 20 names, one per column, in file order.
df.columns = [
    'Country',
    'Region',
    'Population',
    'Area',
    'Pop_Density',
    'Coastline',
    'Net_migration',
    'Infant_mortality',
    'GDP',
    'Literacy',
    'Phones',
    'Arable',
    'Crops',
    'Other',
    'Climate',
    'Birthrate',
    'Deathrate',
    'Agriculture',
    'Industry',
    'Service',
]

In [None]:
# These columns are treated as strings that use ',' as the decimal separator:
# swap it for '.' and cast the result to float64.
comma_decimal_columns = [
    'Deathrate', 'Pop_Density', 'Coastline', 'Net_migration',
    'Infant_mortality', 'Literacy', 'Phones', 'Arable', 'Crops', 'Other',
    'Climate', 'Birthrate', 'Agriculture', 'Industry', 'Service',
]
for column in comma_decimal_columns:
    # Skip columns that are already numeric so re-running this cell does not
    # fail on the `.str` accessor.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    # regex=False: ',' is a literal character here, not a pattern.
    df[column] = df[column].str.replace(',', '.', regex=False).astype('float64')

# Country and Region hold labels rather than measurements; store them as
# categoricals.
for label_column in ('Country', 'Region'):
    df[label_column] = df[label_column].astype('category')

In [None]:
# Show the first five rows to check the renamed columns and converted values.
df.head(n=5)


# Filling the missing values

In [None]:
# Visualise where values are missing: each cell of the heatmap is one
# (row, column) entry of the boolean df.isna() mask.
plt.figure(figsize=(15, 10))
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='magma')

In [None]:
# Summary statistics of the frame, shown as the cell output.
df.describe()

In [None]:
# Mean literacy per region.
# Select the column *before* aggregating: calling .mean() on the whole grouped
# frame also tries to average the categorical Country column, which raises on
# pandas >= 2.0 and does needless work on older versions.
df.groupby('Region')['Literacy'].mean()

In [None]:
# Columns whose gaps are filled with the mean of the country's own region.
region_mean_columns = [
    'Literacy', 'Phones', 'Birthrate', 'Deathrate',
    'Net_migration', 'Infant_mortality', 'GDP', 'Industry',
]
for column in region_mean_columns:
    # transform('mean') returns a Series aligned with df's index, holding each
    # row's regional mean for this column.
    region_mean = df.groupby('Region')[column].transform('mean')
    # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
    # in-place fillna on a selected column is chained assignment, which warns
    # on recent pandas and does not modify `df` under Copy-on-Write.
    df[column] = df[column].fillna(region_mean)

# Columns whose gaps are filled with a fixed value.
# NOTE(review): 0.17 and 0.8 are hard-coded by the author; their origin is not
# visible in this notebook - confirm they are the intended defaults.
constant_fills = {
    'Agriculture': 0.17,
    'Service': 0.8,
    'Arable': 0,
    'Crops': 0,
    'Other': 0,
    'Climate': 0,
}
for column, fill_value in constant_fills.items():
    df[column] = df[column].fillna(fill_value)

In [None]:
# Redraw the missing-value heatmap after the fill step to see what is left.
plt.figure(figsize=(15, 10))
remaining_missing = df.isna()
sns.heatmap(remaining_missing, cmap='magma')

In [None]:

# Bar chart of mean GDP per region.
# Aggregate only the GDP column: averaging the whole grouped frame would also
# hit the categorical Country column, which raises on pandas >= 2.0.
region_gdp = df.groupby('Region')['GDP'].mean().reset_index()

plt.figure(figsize=(10, 10))
sns.barplot(x='Region', y='GDP', data=region_gdp)
# Region names are long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)

In [None]:
# Hex-binned joint distribution of literacy against GDP.
# Author's observation: GDP tends to rise with a country's literacy rate.
sns.jointplot(data=df, x='Literacy', y='GDP', kind='hex')

In [None]:

# Hex-binned joint distribution of infant mortality against GDP.
# Author's observation: GDP tends to fall as infant mortality rises.
sns.jointplot(data=df, x='Infant_mortality', y='GDP', kind='hex')

In [None]:

# Hex-binned joint distribution of the agriculture share against GDP.
# Author's observation: GDP no longer depends on agriculture.
sns.jointplot(data=df, x='Agriculture', y='GDP', kind='hex')


In [None]:
# Joint distribution of phones against GDP (default jointplot kind).
# Author's observation: a clear upward trend.
sns.jointplot(data=df, x='Phones', y='GDP')

In [None]:
# Hex-binned joint distribution of birth rate against GDP.
# Author's observation: a downward trend between birth rate and GDP.
sns.jointplot(data=df, x='Birthrate', y='GDP', kind='hex')

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# Restrict to numeric dtypes first: Country and Region are categorical, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
numeric_features = df.select_dtypes(include='number')

plt.figure(figsize=(20, 10))
sns.heatmap(numeric_features.corr(), annot=True, cmap='BrBG')

## **Feature Selection, Standardising and Training the model**

In [None]:
 # One-hot encode Region: append one 'Region_<value>' indicator column per
 # region, then drop the original Region column.
 region_dummies = pd.get_dummies(df['Region'], prefix='Region')
 df = pd.concat([df, region_dummies], axis=1).drop(columns=['Region'])

In [None]:
# Show the first five rows to check the new Region_* indicator columns.
df.head(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaler instance shared by the train/test preparation cell below.
sc = StandardScaler()

In [None]:
# Target: GDP. Features: every remaining column except the Country label.
y = df['GDP']
x = df.drop(columns=['Country', 'GDP'])


In [None]:
# Split into train and test sets with a fixed seed.
# NOTE(review): train_size=0.4 trains on only 40% of the rows and tests on the
# remaining 60% - confirm this is intended rather than test_size=0.4.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.4, random_state=101
)

# Fit the scaler on the training features only, then apply that same fitted
# transformation to the test features. Re-fitting on the test set would scale
# it with different statistics and leak test information into preprocessing.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 200 trees and a fixed random_state.
rfc = RandomForestRegressor(n_estimators=200, random_state=101)

In [None]:
# Fit on the training split, then predict GDP for the test split.
# fit() returns the estimator itself, so the two calls can be chained.
pred = rfc.fit(x_train, y_train).predict(x_test)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Report test-set error: mean absolute error, root mean squared error and R^2.
for label, score in (
    ('MAE:', metrics.mean_absolute_error(y_test, pred)),
    ('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred))),
    ('R2_Score: ', metrics.r2_score(y_test, pred)),
):
    print(label, score)



In [None]:
# Scatter of true test-set GDP (x axis) against the model's predictions (y axis).
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, pred, color='coral', linewidths=2, edgecolors='k')
ax.set_xlabel('True GDP per Capita')
ax.set_ylabel('Predictions')
ax.set_title('Random Forest Performance')
ax.grid()