In [1]:
# Importing all the necessary packages/libraries

import warnings
warnings.simplefilter("ignore")
import pickle
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from math import sqrt
from pandas_profiling import ProfileReport
import scipy.stars as stats
from scipy.stats import zscore

from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV



ModuleNotFoundError: No module named 'joblib'

In [None]:
# Load the happiness-score dataset directly from its public GitHub location.
DATA_URL = "https://raw.githubusercontent.com/dsrscientist/DSData/master/happiness_score_dataset.csv"
WHR_df = pd.read_csv(DATA_URL)

In [None]:
# Bare last expression: the notebook renders the loaded frame as this cell's output.
WHR_df

In [None]:
# Profile the raw frame and render the report with the widget front-end,
# giving a visual overview of the entire data set.
profile1 = ProfileReport(
    WHR_df,
    title="Pandas Profiling Report",
    explorative=True,
)
profile1.to_widgets()

In [None]:
# (number of rows, number of columns) of the raw frame.
WHR_df.shape

In [None]:
# Count of missing values in each column of the raw frame.
WHR_df.isna().sum()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes).
WHR_df.info()

In [None]:
# Descriptive statistics for the columns of the raw frame.
WHR_df.describe()

In [None]:
# Column labels of the raw frame (used to pick the columns dropped in the next step).
WHR_df.columns

In [None]:
# Working copy without the columns the analysis treats as not useful for modelling.
WHR_df_new = WHR_df.drop(columns=['Country', 'Region', 'Happiness Rank'])

In [None]:
# Correlation heatmap of the remaining columns, showing only the cells above the diagonal.
corr_matrix = WHR_df_new.corr()  # computed once and reused for both the mask and the plot

# Cells where the mask is True are hidden. Build it from a matrix of ones so the
# lower triangle and the diagonal are always hidden; deriving it from the
# correlation values themselves would leave any exactly-zero correlation visible.
upper_triangle = np.tril(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, annot_kws={'size':10}, cmap="gist_earth", mask=upper_triangle)
plt.show()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for every column.
sns.pairplot(data=WHR_df_new)
plt.show()

In [None]:
# One density-normalised histogram per column.
# The figure size is set through rcParams, so it also applies to every later figure.
plt.rcParams.update({'figure.figsize': (20, 20)})
hist_options = dict(bins=20, color='maroon', density=True, label='value', histtype='bar')
WHR_df_new.hist(**hist_options)
plt.tight_layout()
plt.show()

In [None]:
# Per-column skewness; this analysis treats values between -0.5 and 0.5 as acceptable.
WHR_df_new.skew()

In [None]:
# Profile the reduced frame and embed the report in the notebook as an iframe.
profile2 = ProfileReport(
    WHR_df_new,
    title="Pandas Profiling Report",
    explorative=True,
)
profile2.to_notebook_iframe()

In [None]:
# Z score method
# Keep only the rows in which every column lies within `threshold` standard
# deviations of that column's mean.

z = np.abs(zscore(WHR_df_new))
threshold = 3  # single source of truth for the cut-off (previously defined but never used)

WHR_df_new1 = WHR_df_new[(z < threshold).all(axis=1)]
WHR_df_new1  # Frame after removing the outlier rows

In [None]:
# Percentage of Data Loss
# Derived from the actual frames instead of hard-coded row counts, so the figure
# stays correct if the source data or the outlier threshold changes.

rows_before = len(WHR_df_new)   # rows before outlier removal
rows_after = len(WHR_df_new1)   # rows remaining after outlier removal
data_loss = (rows_before - rows_after) / rows_before * 100
data_loss

In [None]:
# Split the cleaned frame into the label (Y) and the feature matrix (X).
Y = WHR_df_new1['Happiness Score']  # label column
X = WHR_df_new1.drop(columns=['Happiness Score'])  # every remaining column is a feature

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the label series. The variable is the upper-case `Y` defined with the
# features; lower-case `y` was never assigned and raised NameError.
Y

In [None]:
# Apply a Yeo-Johnson power transform to the features.
scaler = PowerTransformer(method = 'yeo-johnson')

# Keep X's row labels on the transformed frame. Without `index=X.index` the result
# gets a fresh 0..n-1 index, while Y still carries the labels left after outlier
# removal, so any label-aligned operation between the two would pair the wrong rows.
# NOTE(review): the transformer is fitted on all rows before the train/test split,
# so test rows influence the fitted transform — confirm this is acceptable.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Display the feature values after applying the power transform.
X_scaled

In [None]:
# Shuffled 75 % / 25 % train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    train_size=0.75,
    random_state=42,
    shuffle=True,
)

In [None]:
# Model Selection
# Candidate regressors that are later trained and compared on the same split.

linear_model = LinearRegression()
# The regularisation keyword is upper-case `C`; lower-case `c` raised TypeError.
svr_model = SVR(C=1.0, epsilon=0.2, kernel='poly', gamma='auto')
dtr_model = DecisionTreeRegressor(criterion="poisson", random_state=111)
# Seeded like the decision tree so the forest's scores are reproducible across runs.
rfr_model = RandomForestRegressor(max_depth=2, max_features="sqrt", random_state=111)
knr_model = KNeighborsRegressor(n_neighbors=2, algorithm='kd_tree')

In [None]:
# Training the models
# Every candidate is fitted on the same training features and labels.

linear_model.fit(X_train, Y_train)
svr_model.fit(X_train, Y_train)  # fixed: label variable is `Y_train` (`y_train` was undefined)
dtr_model.fit(X_train, Y_train)
rfr_model.fit(X_train, Y_train)
knr_model.fit(X_train, Y_train)

In [None]:
# Predictions
# Each fitted model predicts on the held-out features, in the same order as before.

pred_linear, pred_svr, pred_dtr, pred_rfr, pred_knr = (
    fitted.predict(X_test)
    for fitted in (linear_model, svr_model, dtr_model, rfr_model, knr_model)
)

In [None]:
# RMSE - lower is better
# Mean squared error per model first, then its square root.

mse_linear, mse_svr, mse_dtr, mse_rfr, mse_knr = (
    mean_squared_error(Y_test, predicted)
    for predicted in (pred_linear, pred_svr, pred_dtr, pred_rfr, pred_knr)
)

rmse_linear, rmse_svr, rmse_dtr, rmse_rfr, rmse_knr = (
    sqrt(squared_error)
    for squared_error in (mse_linear, mse_svr, mse_dtr, mse_rfr, mse_knr)
)

print("RMSE of Linear Regression is: ", rmse_linear)
print("RMSE of Support Vector Regression is: ", rmse_svr)
print("RMSE of Decision Tree Regressor is: ", rmse_dtr)
print("RMSE of Random Forest Regressor is: ", rmse_rfr)
print("RMSE of K Neighbors Regressor is: ", rmse_knr)

In [None]:
# R2 Score for the models
# Reported as a percentage for each model's test-set predictions.

print(
    "R2 Score for Linear Regression is: ",
    r2_score(Y_test, pred_linear, multioutput='variance_weighted')*100,
)
print(
    "R2 Score for Support Vector Regression is: ",
    r2_score(Y_test, pred_svr, multioutput='variance_weighted')*100,
)
print(
    "R2 Score for Decision Tree Regressor is: ",
    r2_score(Y_test, pred_dtr, multioutput='variance_weighted')*100,
)
print(
    "R2 Score for Random Forest Regressor is: ",
    r2_score(Y_test, pred_rfr, multioutput='variance_weighted')*100,
)
print(
    "R2 Score for K Neighbors Regressor is: ",
    r2_score(Y_test, pred_knr, multioutput='variance_weighted')*100,
)

In [None]:
# Linear Regression
# Search space for the randomized hyper-parameter search.
# - Fixed: a stray quote before the closing brace made this cell a SyntaxError.
# - 'normalize' is omitted: newer scikit-learn releases no longer accept it on
#   LinearRegression, and rescaling features does not change ordinary
#   least-squares predictions.

lt_param = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'positive': [True, False],
}

In [None]:
# Randomized search over `lt_param` with 5-fold cross-validation.
# A fixed seed makes the sampled parameter combinations, and therefore
# `best_params_`, reproducible across runs.
RSCV = RandomizedSearchCV(LinearRegression(), lt_param, cv=5, random_state=42)

In [None]:
# Run the search on the training data. `fit` is a method of the search object;
# the bare name `fit` was undefined and raised NameError.
RSCV.fit(X_train, Y_train)

In [None]:
# Parameter combination selected by the randomized search.
RSCV.best_params_

In [None]:
# Final model: linear regression with hand-picked settings, fitted on the training
# split and scored on the held-out split.
# `normalize=True` was dropped: newer scikit-learn releases reject that keyword,
# and rescaling features does not change ordinary least-squares predictions.
Final_Model = LinearRegression(positive=True, n_jobs=10, fit_intercept=True, copy_X=True)
Classifier = Final_Model.fit(X_train, Y_train)  # `fit` returns the estimator itself, so this is the same regressor object
lr_pred = Final_Model.predict(X_test)
lr_r2_score = r2_score(Y_test, lr_pred, multioutput='variance_weighted')
print(f"R2 score for the Final Model is:", lr_r2_score*100)

In [None]:
# Persist the fitted final model to disk.
filename = 'FinalModel_2.pk1'
# The context manager closes the file even if pickling fails; the previous
# `open(...)` handle was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(Final_Model, model_file)

In [None]:
# Reload the saved model and report its score on the test split as a percentage.
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
with open(filename, 'rb') as model_file:  # context manager closes the handle (previously leaked)
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, Y_test)*100
print(result)