<a href="https://colab.research.google.com/github/neeyora/OLADIPO-Olaniyi-the-Analyst.github.io/blob/main/Employee_Retention_project_End_to_End_using_H2O_PyCaret_Feb_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code snippet 1
# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

In [None]:
# Code snippet 2
# Load the employee dataset; this CSV contains both the feature columns and the label column.
url = 'https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/employee_data.csv'
df = pd.read_csv(url)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,,0.932868,4,,low,0.829896,Left,5.0
1,232,support,,,3,,low,0.834544,Employed,2.0
2,184,sales,,0.788830,3,,medium,0.834988,Employed,3.0
3,206,sales,,0.575688,4,,low,0.424764,Employed,2.0
4,249,sales,,0.845217,3,,low,0.779043,Employed,3.0
...,...,...,...,...,...,...,...,...,...,...
14244,178,IT,,0.735865,5,,low,0.263282,Employed,5.0
14245,257,sales,,0.638604,3,,low,0.868209,Employed,2.0
14246,232,finance,1.0,0.847623,5,,medium,0.898917,Left,5.0
14247,130,IT,,0.757184,4,,medium,0.641304,Employed,3.0


In [None]:
df.to_csv("Employee datafile.csv") # Save a CSV copy of the data to the current working directory (this does not trigger a browser download)

In [None]:
# Code snippet 3
# Inspect the dtype pandas assigned to each column (numeric vs. object/string).
df.dtypes

avg_monthly_hrs        int64
department            object
filed_complaint      float64
last_evaluation      float64
n_projects             int64
recently_promoted    float64
salary                object
satisfaction         float64
status                object
tenure               float64
dtype: object

In [None]:
# Code snippet 4
# Count the missing (NaN) values in each column.
df.isnull().sum()

avg_monthly_hrs          0
department             709
filed_complaint      12191
last_evaluation       1532
n_projects               0
recently_promoted    13949
salary                   0
satisfaction           181
status                   0
tenure                 181
dtype: int64

In [None]:
# Code snippet 9
# Zero-fill the two flag columns: a missing value is taken to mean the event did not occur.
for flag_col in ('filed_complaint', 'recently_promoted'):
    df[flag_col] = df[flag_col].fillna(0)

In [None]:
# Code snippet 10
# Re-count the missing values per column to confirm the zero-fill above took effect.
df.isnull().sum()

avg_monthly_hrs         0
department            709
filed_complaint         0
last_evaluation      1532
n_projects              0
recently_promoted       0
salary                  0
satisfaction          181
status                  0
tenure                181
dtype: int64

In [None]:
# Code snippet 11
# Exploratory Data Analysis using latest AI based AutoEDA Package - sweetviz
!pip install sweetviz

Collecting sweetviz
  Downloading sweetviz-2.1.3-py3-none-any.whl (15.1 MB)
[K     |████████████████████████████████| 15.1 MB 955 kB/s 
Installing collected packages: sweetviz
Successfully installed sweetviz-2.1.3


In [None]:
# Code snippet 12
# Importing the package for use
import sweetviz as sv

# Single source of truth for the report file name, shared by the writer and the
# download step below (it is also sweetviz's default output name).
eda_report_path = "SWEETVIZ_REPORT.html"

# Generate the EDA report
Employee_EDA_report = sv.analyze(df)
# Write the report to an HTML file
Employee_EDA_report.show_html(filepath=eda_report_path)

# Downloading the EDA report to the local machine.
# google.colab only exists on Colab; elsewhere the report is already on disk,
# so report its location instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print(f"Not running on Colab; report saved locally as {eda_report_path}")
else:
    files.download(eda_report_path)

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Code snippet 13
# Method 1 - Doing Auto ML with PyCaret
!pip install pycaret --upgrade



In [None]:
# Code snippet 13a (to run after the restart of the run time)
# The restart clears every variable, so the imports, the data load and the
# cleaning step all have to be repeated here.
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/employee_data.csv'
df = pd.read_csv(url)
# Re-apply the zero-fill from snippet 9; without it the freshly loaded frame
# would go into modelling with the flag columns still holding NaN.
for flag_col in ('filed_complaint', 'recently_promoted'):
    df[flag_col] = df[flag_col].fillna(0)
df

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,,0.932868,4,,low,0.829896,Left,5.0
1,232,support,,,3,,low,0.834544,Employed,2.0
2,184,sales,,0.788830,3,,medium,0.834988,Employed,3.0
3,206,sales,,0.575688,4,,low,0.424764,Employed,2.0
4,249,sales,,0.845217,3,,low,0.779043,Employed,3.0
...,...,...,...,...,...,...,...,...,...,...
14244,178,IT,,0.735865,5,,low,0.263282,Employed,5.0
14245,257,sales,,0.638604,3,,low,0.868209,Employed,2.0
14246,232,finance,1.0,0.847623,5,,medium,0.898917,Left,5.0
14247,130,IT,,0.757184,4,,medium,0.641304,Employed,3.0


In [None]:
# Code snippet 4.1 - Setting numerical, categorical features
# Columns to be handled as numeric features.
num_cols = [
    'avg_monthly_hrs',
    'last_evaluation',
    'n_projects',
    'satisfaction',
    'tenure',
]
# Columns to be handled as categorical features.
cat_cols = [
    'department',
    'salary',
]

In [None]:
# Code snippet 14
# Import the PyCaret classification functions this notebook uses.
# Explicit names (instead of `import *`) keep the namespace clean and show
# where each function comes from.
from pycaret.classification import (
    setup,
    compare_models,
    create_model,
    evaluate_model,
    predict_model,
)

  defaults = yaml.load(f)


In [None]:
# Code snippet 15
# Setting up the classification ML experiment.
# `status` is the label; the feature type lists come from snippet 4.1.
classification_model_setup = setup(
    df,
    target='status',
    categorical_features=cat_cols,
    numeric_features=num_cols,
    # Fixed seed: without it PyCaret picks a random one on every run, so the
    # split and the model scores would differ between runs.
    session_id=42,
)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
avg_monthly_hrs,Numeric
department,Categorical
filed_complaint,Categorical
last_evaluation,Numeric
n_projects,Numeric
recently_promoted,Categorical
salary,Categorical
satisfaction,Numeric
tenure,Numeric
status,Label


In [None]:
# Code snippet 16
# Train and cross-validate the candidate classifiers and display the comparison table.
# NOTE(review): per the PyCaret docs `budget_time` is expressed in minutes, so this
# caps the comparison at about one minute — confirm for the installed version.
compare_models(budget_time=1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9752,0.9865,0.9211,0.9738,0.9466,0.9305,0.9312,1.084
lightgbm,Light Gradient Boosting Machine,0.9752,0.9894,0.9248,0.9702,0.9468,0.9307,0.9313,0.199
et,Extra Trees Classifier,0.9739,0.9846,0.9299,0.9598,0.9445,0.9275,0.9278,0.909
gbc,Gradient Boosting Classifier,0.97,0.9819,0.9156,0.957,0.9357,0.9162,0.9167,1.162
dt,Decision Tree Classifier,0.9614,0.9487,0.9244,0.9152,0.9197,0.8943,0.8944,0.057
knn,K Neighbors Classifier,0.9344,0.9673,0.9198,0.826,0.8702,0.8265,0.8288,0.17
ada,Ada Boost Classifier,0.9282,0.965,0.8359,0.86,0.8475,0.8006,0.8009,0.392
lr,Logistic Regression,0.7852,0.8177,0.3275,0.5917,0.4212,0.3026,0.3227,1.029
ridge,Ridge Classifier,0.7784,0.0,0.2569,0.5841,0.3565,0.2462,0.2769,0.026
lda,Linear Discriminant Analysis,0.7769,0.8141,0.3136,0.5603,0.4015,0.2777,0.2957,0.056


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=6009, verbose=0,
                       warm_start=False)

In [None]:
# Code snippet 17
# Create the Random Forest model ('rf'), the top row of the comparison table above.
# The estimator ID is hard-coded, so revisit it if a re-run ranks another model first.
model = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9649,0.9828,0.9034,0.9471,0.9247,0.9019,0.9023
1,0.979,0.9735,0.9286,0.9822,0.9546,0.941,0.9416
2,0.9749,0.9907,0.9247,0.9693,0.9465,0.9301,0.9306
3,0.984,0.9928,0.9498,0.9827,0.966,0.9555,0.9557
4,0.9689,0.982,0.895,0.9726,0.9322,0.912,0.9134
5,0.9779,0.991,0.9286,0.9779,0.9526,0.9382,0.9388
6,0.985,0.99,0.9538,0.9827,0.968,0.9582,0.9584
7,0.9699,0.9936,0.8908,0.9815,0.9339,0.9145,0.9163
8,0.9799,0.9923,0.9454,0.9698,0.9574,0.9443,0.9445
9,0.9679,0.9767,0.8908,0.9725,0.9298,0.9091,0.9105


In [None]:
# Code snippet 17.1
# Open the interactive evaluation widget for the model; plots such as the
# confusion matrix are selected with its plot-type buttons.
evaluate_model(model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [None]:
# Code snippet 18
# Load the client dataset for which predictions are to be made.
url1 = 'https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/Employee%20Attrition%20Data%20-%20Client%20request.csv'
pred_data = pd.read_csv(url1)

In [None]:
# Code snippet 18.1 - checking the size of the client data set as (rows, columns)
pred_data.shape

(499, 9)

In [None]:
# Code snippet 19
# Score the client data with the Random Forest model created in snippet 17.
predictions = predict_model(model, data = pred_data)

In [None]:
# Code snippet 20
# Write the predictions dataframe to a CSV file in the current working directory.
predictions.to_csv("Employee Status Predictions.csv")

In [None]:
# Code snippet 21
# Method 2 - Installing the H2O AI Package for Advanced ML and Deep Learning packages
!pip install h2o

Collecting h2o
  Downloading h2o-3.36.0.3.tar.gz (176.2 MB)
[K     |████████████████████████████████| 176.2 MB 52 kB/s 
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.36.0.3-py2.py3-none-any.whl size=176244303 sha256=effc0635837e990b5994e1ced6dfd8ebb397e98d22b6d9f7fc745005f5b9210e
  Stored in directory: /root/.cache/pip/wheels/b5/71/a5/02087a05e5644158183e1c58eeae3f9356a4d1e80659fb2dfb
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.36.0.3


In [None]:
# Code snippet 22
# Importing into current Working Session
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Code snippet 23
# Connect to a running local H2O server, or start one if none is found.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.14" 2022-01-18; OpenJDK Runtime Environment (build 11.0.14+9-Ubuntu-0ubuntu2.18.04); OpenJDK 64-Bit Server VM (build 11.0.14+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpss_cdyeb
  JVM stdout: /tmp/tmpss_cdyeb/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpss_cdyeb/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 1 day
H2O_cluster_name:,H2O_from_python_unknownUser_l7u2i8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Code snippet 24
# Converting Pandas dataframe to H2O dataframe
# Redundant if pandas was already imported earlier in the session.
import pandas as pd
# The raw CSV is read again here, so the zero-filling from snippet 9 is NOT
# applied to the frame handed to H2O.
df1 = pd.read_csv(url)
hf = h2o.H2OFrame(df1)
# Bare last expression so the notebook displays the H2OFrame.
hf

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
221,engineering,,0.932868,4,,low,0.829896,Left,5
232,support,,,3,,low,0.834544,Employed,2
184,sales,,0.78883,3,,medium,0.834988,Employed,3
206,sales,,0.575688,4,,low,0.424764,Employed,2
249,sales,,0.845217,3,,low,0.779043,Employed,3
140,sales,,0.589097,4,,medium,0.66002,Employed,4
121,sales,1.0,0.625399,3,,low,0.835571,Employed,3
150,engineering,,0.644586,4,,low,0.796683,Employed,3
215,engineering,1.0,0.524114,3,,medium,0.715005,Employed,7
269,support,,0.909364,5,,medium,0.994037,Employed,2




In [None]:
# Code snippet 25
# Splitting the data into train (~80%) and test (~20%) frames.
# The split is random, so a fixed seed is passed to get the same rows on every run.
splits = hf.split_frame(ratios=[0.8], seed=42)
# First split is the training frame, second split is the test frame.
train, test = splits

In [None]:
# Code snippet 26
# Invoking the automated ML and deep learning algorithms, restricting the run time to 30 seconds.
# The seed pins the randomized steps.
# NOTE(review): with a time budget the set of models that finish can still vary between
# runs, so results are presumably only approximately repeatable — confirm in the H2O docs.
aml = H2OAutoML(max_runtime_secs=30, seed=42)
# Train on the training split only; the held-out test split is scored separately in snippet 28.
aml.train(y="status", training_frame=train)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_1_AutoML_1_20220318_31531

No model summary for this model

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.023531207422825464
RMSE: 0.15339885078717333
LogLoss: 0.09187371949594311
Null degrees of freedom: 10012
Residual degrees of freedom: 10010
Null deviance: 10996.605478639856
Residual deviance: 1839.8631066257567
AIC: 1845.8631066257567
AUC: 0.9893599150501081
AUCPR: 0.9777919947536181
Gini: 0.9787198301002162

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5723460619677646: 


Unnamed: 0,Unnamed: 1,Employed,Left,Error,Rate
0,Employed,7564.0,63.0,0.0083,(63.0/7627.0)
1,Left,216.0,2170.0,0.0905,(216.0/2386.0)
2,Total,7780.0,2233.0,0.0279,(279.0/10013.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.572346,0.939597,145.0
1,max f2,0.139746,0.932,250.0
2,max f0point5,0.780016,0.964524,113.0
3,max accuracy,0.702612,0.972136,126.0
4,max precision,0.998304,1.0,0.0
5,max recall,0.004228,1.0,385.0
6,max specificity,0.998304,1.0,0.0
7,max absolute_mcc,0.702612,0.922415,126.0
8,max min_per_class_accuracy,0.109026,0.94939,264.0
9,max mean_per_class_accuracy,0.295232,0.954726,198.0



Gains/Lift Table: Avg response rate: 23.83 %, avg score: 23.64 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010087,0.997638,4.196563,4.196563,1.0,0.998066,1.0,0.998066,0.04233,0.04233,319.656329,319.656329,0.04233
1,2,0.020074,0.996789,4.196563,4.196563,1.0,0.997237,1.0,0.997654,0.041911,0.084241,319.656329,319.656329,0.084241
2,3,0.030061,0.995948,4.196563,4.196563,1.0,0.996397,1.0,0.997236,0.041911,0.126153,319.656329,319.656329,0.126153
3,4,0.040048,0.995127,4.196563,4.196563,1.0,0.995579,1.0,0.996823,0.041911,0.168064,319.656329,319.656329,0.168064
4,5,0.050035,0.994473,4.196563,4.196563,1.0,0.994765,1.0,0.996412,0.041911,0.209975,319.656329,319.656329,0.209975
5,6,0.10007,0.9901,4.188187,4.192375,0.998004,0.992494,0.999002,0.994453,0.209556,0.419531,318.818691,319.23751,0.419399
6,7,0.150005,0.980464,4.162991,4.182593,0.992,0.986051,0.996671,0.991656,0.207879,0.62741,316.299078,318.259337,0.626754
7,8,0.20004,0.931816,4.054165,4.15047,0.966068,0.96408,0.989016,0.984758,0.20285,0.83026,305.416493,315.047023,0.827375
8,9,0.30001,0.055186,1.366713,3.22286,0.325674,0.318717,0.767976,0.762819,0.13663,0.96689,36.671292,222.286002,0.875504
9,10,0.39998,0.022156,0.213811,2.470786,0.050949,0.035742,0.588764,0.581095,0.021375,0.988265,-78.618908,147.078558,0.772322




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.029100329000276724
RMSE: 0.1705881854064833
LogLoss: 0.11811223953663172
Null degrees of freedom: 11343
Residual degrees of freedom: 11341
Null deviance: 12522.601136456295
Residual deviance: 2679.7304906071004
AIC: 2685.7304906071004
AUC: 0.9796594435670358
AUCPR: 0.9620123741528896
Gini: 0.9593188871340717

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6140316873669391: 


Unnamed: 0,Unnamed: 1,Employed,Left,Error,Rate
0,Employed,8523.0,91.0,0.0106,(91.0/8614.0)
1,Left,279.0,2451.0,0.1022,(279.0/2730.0)
2,Total,8802.0,2542.0,0.0326,(370.0/11344.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.614032,0.929818,145.0
1,max f2,0.266981,0.917314,215.0
2,max f0point5,0.797961,0.954597,112.0
3,max accuracy,0.614032,0.967384,145.0
4,max precision,0.998787,1.0,0.0
5,max recall,0.001749,1.0,395.0
6,max specificity,0.998787,1.0,0.0
7,max absolute_mcc,0.614032,0.909589,145.0
8,max min_per_class_accuracy,0.114853,0.935165,273.0
9,max mean_per_class_accuracy,0.466471,0.945876,175.0



Gains/Lift Table: Avg response rate: 24.07 %, avg score: 24.07 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010049,0.997409,4.155311,4.155311,1.0,0.998302,1.0,0.998302,0.041758,0.041758,315.531136,315.531136,0.041758
1,2,0.020011,0.996098,4.155311,4.155311,1.0,0.996752,1.0,0.99753,0.041392,0.08315,315.531136,315.531136,0.08315
2,3,0.03006,0.994541,4.118861,4.143126,0.991228,0.995325,0.997067,0.996793,0.041392,0.124542,311.886126,314.312569,0.124426
3,4,0.040021,0.992973,4.081766,4.127853,0.982301,0.993689,0.993392,0.996021,0.040659,0.165201,308.176602,312.785335,0.164853
4,5,0.050071,0.990952,4.045961,4.111417,0.973684,0.992047,0.989437,0.995223,0.040659,0.205861,304.596106,311.141722,0.205164
5,6,0.100053,0.980986,4.11134,4.111379,0.989418,0.985793,0.989427,0.990512,0.205495,0.411355,311.133981,311.137855,0.409962
6,7,0.150035,0.963375,4.133326,4.11869,0.994709,0.974168,0.991187,0.985067,0.206593,0.617949,313.332558,311.868993,0.616207
7,8,0.200018,0.894392,4.023397,4.094877,0.968254,0.940608,0.985456,0.973957,0.201099,0.819048,302.339671,309.487712,0.815217
8,9,0.299982,0.076668,1.286168,3.158916,0.309524,0.343936,0.760212,0.764012,0.128571,0.947619,28.61678,215.89158,0.85289
9,10,0.400035,0.03252,0.248953,2.431104,0.059912,0.049735,0.585059,0.585364,0.024908,0.972527,-75.104743,143.110437,0.75393







In [None]:
# Code snippet 27
# Display the AutoML leaderboard (up to the first 20 models) with each model's scores.
aml.leaderboard.head(20)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_1_AutoML_1_20220318_31531,0.979659,0.118112,0.962012,0.056381,0.170588,0.0291003
XGBoost_1_AutoML_1_20220318_31531,0.979622,0.121313,0.962124,0.0555633,0.171263,0.029331
GBM_1_AutoML_1_20220318_31531,0.972604,0.217265,0.948345,0.0611948,0.230169,0.052978
GLM_1_AutoML_1_20220318_31531,0.818116,0.434148,0.521804,0.257235,0.376488,0.141743




In [None]:
# Code snippet 28
# Score the leading AutoML model on the held-out test frame.
perf = aml.leader.model_performance(test)
# Bare last expression so the notebook displays the performance metrics.
perf


ModelMetricsBinomialGLM: stackedensemble
** Reported on test data. **

MSE: 0.02707717586642454
RMSE: 0.16455143836024205
LogLoss: 0.10796234852543289
Null degrees of freedom: 2904
Residual degrees of freedom: 2902
Null deviance: 3120.885487071191
Residual deviance: 627.2612449327652
AIC: 633.2612449327652
AUC: 0.9832540444726999
AUCPR: 0.9679158770945968
Gini: 0.9665080889453999

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6007277407695892: 


Unnamed: 0,Unnamed: 1,Employed,Left,Error,Rate
0,Employed,2222.0,21.0,0.0094,(21.0/2243.0)
1,Left,69.0,593.0,0.1042,(69.0/662.0)
2,Total,2291.0,614.0,0.031,(90.0/2905.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.600728,0.929467,146.0
1,max f2,0.218849,0.920458,196.0
2,max f0point5,0.777454,0.956607,128.0
3,max accuracy,0.683446,0.969019,138.0
4,max precision,0.998478,1.0,0.0
5,max recall,0.002187,1.0,394.0
6,max specificity,0.998478,1.0,0.0
7,max absolute_mcc,0.600728,0.910711,146.0
8,max min_per_class_accuracy,0.099307,0.941088,239.0
9,max mean_per_class_accuracy,0.218849,0.948105,196.0



Gains/Lift Table: Avg response rate: 22.79 %, avg score: 22.43 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010327,0.997605,4.388218,4.388218,1.0,0.997982,1.0,0.997982,0.045317,0.045317,338.821752,338.821752,0.045317
1,2,0.02031,0.996494,4.388218,4.388218,1.0,0.997092,1.0,0.997545,0.043807,0.089124,338.821752,338.821752,0.089124
2,3,0.030293,0.995474,4.388218,4.388218,1.0,0.995994,1.0,0.997034,0.043807,0.132931,338.821752,338.821752,0.132931
3,4,0.040275,0.994549,4.388218,4.388218,1.0,0.994959,1.0,0.996519,0.043807,0.176737,338.821752,338.821752,0.176737
4,5,0.050258,0.993604,4.388218,4.388218,1.0,0.994059,1.0,0.996031,0.043807,0.220544,338.821752,338.821752,0.220544
5,6,0.100172,0.987056,4.388218,4.388218,1.0,0.990824,1.0,0.993437,0.219033,0.439577,338.821752,338.821752,0.439577
6,7,0.150086,0.973014,4.357954,4.378153,0.993103,0.981203,0.997706,0.989368,0.217523,0.6571,335.795395,337.81528,0.656654
7,8,0.2,0.836063,4.055318,4.297583,0.924138,0.935235,0.979346,0.975858,0.202417,0.859517,305.531826,329.758308,0.854167
8,9,0.300172,0.050182,0.965106,3.185484,0.219931,0.218637,0.725917,0.723162,0.096677,0.956193,-3.489374,218.548359,0.84964
9,10,0.4,0.020269,0.211845,2.443353,0.048276,0.032821,0.556799,0.550874,0.021148,0.977341,-78.815502,144.335347,0.747738







In [None]:
# Code snippet 29
# Load the client data again (same URL as snippet 18) and convert it to an H2OFrame for prediction.
client_data = pd.read_csv(url1)
hf_client = h2o.H2OFrame(client_data)
# Bare last expression so the notebook displays the H2OFrame.
hf_client 

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
221,engineering,,0.932868,4,,low,0.829896,5
232,support,,,3,,low,0.834544,2
184,sales,,0.78883,3,,medium,0.834988,3
206,sales,,0.575688,4,,low,0.424764,2
249,sales,,0.845217,3,,low,0.779043,3
140,sales,,0.589097,4,,medium,0.66002,4
121,sales,1.0,0.625399,3,,low,0.835571,3
150,engineering,,0.644586,4,,low,0.796683,3
215,engineering,1.0,0.524114,3,,medium,0.715005,7
269,support,,0.909364,5,,medium,0.994037,2




In [None]:
# Code snippet 30
# Predict the status of each client row with the AutoML run's model.
pred = aml.predict(hf_client)
# Displaying sample prediction results
pred.head()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,Employed,Left
Left,0.0190452,0.980955
Employed,0.99509,0.0049095
Employed,0.995611,0.0043891
Employed,0.970432,0.0295676
Employed,0.991991,0.00800908
Employed,0.99308,0.00691997
Employed,0.996417,0.0035825
Employed,0.993959,0.00604121
Employed,0.996003,0.00399685
Employed,0.995848,0.00415216




In [None]:
# Code snippet 31
# Convert the H2O predictions frame to a pandas dataframe.
pred_df = pred.as_data_frame()

In [None]:
# Code snippet 32
# Add the predicted label as a new 'Prediction' column on the client data.
# The assignment aligns on index, so it relies on pred_df keeping the same row
# order and default index as client_data — presumably true here; verify if either frame is filtered.
client_data['Prediction'] = pred_df['predict']

In [None]:
# Code snippet 33
# Write the client data, including the 'Prediction' column, to a CSV file.
client_data.to_csv('Employee Data Predictions.csv')

In [None]:
# Code snippet 34
# Downloading the predictions data file to the local machine.
# google.colab only exists on Colab; elsewhere the CSV is already on disk,
# so report that instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print("Not running on Colab; predictions are saved locally as Employee Data Predictions.csv")
else:
    files.download("Employee Data Predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>