In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install dataprep

In [None]:
# Playground Series S3E3 competition files: training rows, test rows and the submission template.
train = pd.read_csv('../input/playground-series-s3e3/train.csv')
test = pd.read_csv('../input/playground-series-s3e3/test.csv')
submission = pd.read_csv('../input/playground-series-s3e3/sample_submission.csv')
# IBM HR Analytics attrition dataset, appended to the training rows later on.
original = pd.read_csv("/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Encode the target as 0/1 ('Yes' -> 1, everything else -> 0).
# Matching on both 'Yes' and 1 keeps the cell idempotent: re-running it on an
# already-encoded column preserves the labels instead of resetting every row to 0.
original['Attrition'] = original['Attrition'].isin(['Yes', 1]).astype('int64')

In [None]:
# Remove columns that are not used as model features (none of them appear in `features`).
# Reassigning with errors='ignore' lets the cell be re-run without a KeyError once the
# columns are already gone.
original = original.drop(
    columns=['EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours'],
    errors='ignore',
)

In [None]:
# List the column labels of the competition training frame.
print(train.columns)

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Remove the row id and the columns that are not used as model features.
# Reassigning with errors='ignore' keeps the cell re-runnable (no KeyError on a second run).
train = train.drop(
    columns=['id', 'Over18', 'EmployeeCount', 'StandardHours'],
    errors='ignore',
)

In [None]:
# Apply the same column removal to the test frame so it matches the training columns.
# Reassigning with errors='ignore' keeps the cell re-runnable (no KeyError on a second run).
test = test.drop(
    columns=['id', 'Over18', 'EmployeeCount', 'StandardHours'],
    errors='ignore',
)

In [None]:
# Append the original IBM rows to the competition rows (columns are aligned by name).
# Both frames were read with a default 0..n-1 index, so ignore_index=True is needed to
# give the combined frame unique row labels.
# NOTE: this cell is not idempotent - re-running it appends `original` a second time.
train = pd.concat([train, original], axis=0, ignore_index=True)

In [None]:
# Column labels of the training frame at this point in the notebook.
train.columns

In [None]:
from dataprep.eda import create_report, plot, plot_correlation, plot_missing , plot_diff
# Build dataprep's automated EDA report for the training frame and display it.
report =create_report(train)
report.show()

In [None]:
# Every predictor column fed to the boosters, in the order the models receive them.
features = [
    'Age',
    'BusinessTravel',
    'DailyRate',
    'Department',
    'DistanceFromHome',
    'Education',
    'EducationField',
    'EnvironmentSatisfaction',
    'Gender',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [None]:
# Columns treated as categorical by CatBoost and by the WOE encoder.
# Several are integer-coded (e.g. 'Education', 'JobLevel', 'YearsAtCompany').
cat_features = [
    'BusinessTravel',
    'Department',
    'Education',
    'EducationField',
    'EnvironmentSatisfaction',
    'Gender',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'NumCompaniesWorked',
    'OverTime',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
]

Brace yourselves .. **Outliers** are coming 

In [None]:
# Three largest 'Education' values, to spot implausible entries.
train[['Education']].sort_values(by='Education', ascending=False).head(3)

Looks like we found an outlier! Let's search for another one.


In [None]:
# Three largest 'DailyRate' values, to spot implausible entries.
train[['DailyRate']].sort_values(by='DailyRate', ascending=False).head(3)

OK, there is another one.

In [None]:
# Keep only the rows that carry neither of the extreme values spotted above.
keep_rows = (train.DailyRate != 3921) & (train.Education != 15)
train = train[keep_rows]


Here comes my favorite part: checking for **multicollinearity** using a heatmap.

In [None]:
# Correlation matrix of the numeric columns only. The frame still contains string columns
# (e.g. 'Department'), which make DataFrame.corr() raise on pandas >= 2.0, so they are
# excluded explicitly; this form also works on older pandas versions.
corrmat = train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(16, 12))
# Draw on the axes created above rather than relying on the implicit "current axes".
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

Hmm... `JobLevel` and `MonthlyIncome` are highly correlated (0.92), which is something to consider.


In [None]:
# Separate the binary target from the predictor columns.
y = train['Attrition']
X = train.drop(columns='Attrition')


In [None]:

from sklearn.model_selection import train_test_split, KFold

# Hold out 25% of the rows for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

**Ridge Regression** — as we have many features, Ridge will use its power (regularization).

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, roc_auc_score

# String-valued columns that are one-hot encoded; numeric columns are standardised instead.
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime']

# Preprocessing steps
numerical_transformer = StandardScaler()
# handle_unknown='ignore': a category that is absent from a training fold but present in the
# matching validation fold (or in the test set) is encoded as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer; the numeric columns are taken from X's dtypes.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X.select_dtypes(include='number').columns),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocessing followed by the Ridge regressor
ridge_pipeline = Pipeline([
  ('preprocessor', preprocessor),
  ('ridge', Ridge())
])

# 10-fold shuffled cross-validation with a fixed seed
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Regularisation strengths to try
param_grid = {'ridge__alpha':[0.01, 0.1, 1, 10, 100]}

# Select alpha by ROC-AUC, the metric reported below, rather than the regressor's default R^2.
# make_scorer(roc_auc_score) scores the continuous output of predict(), which is what a
# Ridge regressor exposes (it has no predict_proba / decision_function).
grid_search = GridSearchCV(
    ridge_pipeline,
    param_grid,
    cv=kfold,
    n_jobs=-1,
    scoring=make_scorer(roc_auc_score),
)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out validation split.
y_val_pred = grid_search.predict(X_val)
roc_auc = roc_auc_score(y_val, y_val_pred)
print("ROC-AUC:", roc_auc)




Preparing data for the **boosters**

In [None]:
from catboost import CatBoostClassifier, Pool

from category_encoders import WOEEncoder
# CatBoost pools for the training and validation splits, sharing the same categorical columns.
train_pool, val_pool = (
    Pool(frame, target, cat_features=cat_features)
    for frame, target in ((X_train, y_train), (X_val, y_val))
)

In [None]:
# CatBoost pool built from the full training frame (all rows, `features` columns only).
cv_pool = Pool(
    data=train[features],
    label=train['Attrition'],
    cat_features=cat_features,
)

In [None]:
# Weight-of-evidence encoder used to prepare inputs for XGBoost.
# category_encoders takes the columns to encode as a constructor argument (`cols`), so they
# are declared here instead of relying on dtype-based auto-detection at fit time.
woe = WOEEncoder(cols=cat_features, drop_invariant=True, randomized=True)

In [None]:
# Cast every categorical column of the training frame to str in one assignment.
train[cat_features] = train[cat_features].astype(str)

In [None]:
# Fit the WOE encoder on the training features against the binary target.
# NOTE(review): `cols` is passed to fit() here rather than to the WOEEncoder constructor -
# confirm that the installed category_encoders version honours it at this call site.
woe.fit(train[features],train['Attrition'], cols = cat_features)

In [None]:
# WOE-encoded training features; this is the matrix XGBoost is fitted on below.
X_1 = woe.transform(train[features])

**XGBoost**

In [None]:
from xgboost import XGBClassifier
# Hyper-parameters for the XGBoost classifier.
xgb_tuned_params = {
    'subsample': 0.6,
    'scale_pos_weight': 5,
    'n_estimators': 400,
    'max_depth': 3,
    'learning_rate': 0.03,
    'lambda': 5,
    'colsample_bytree': 0.4,
}

# Fit on the WOE-encoded training features.
clf = XGBClassifier(eval_metric='auc', **xgb_tuned_params)
clf.fit(X_1, train['Attrition'])

**CatBoost**

In [None]:
# Hyper-parameters for the CatBoost classifier.
cb_params = dict(
    depth=3,
    l2_leaf_reg=1,
    iterations=400,
    subsample=0.6,
    rsm=0.6,
    learning_rate=0.1,
)

# Fit silently on the pool built from the full training frame.
model = CatBoostClassifier(eval_metric='AUC', verbose=0, **cb_params)
model.fit(cv_pool)

In [None]:
# CatBoost reads the test features directly from the frame.
test_pool = Pool(test[features], cat_features=cat_features)

# The WOE encoder was fitted on string-typed categorical columns, so the test features need
# the same cast. Cast a copy rather than `test` itself: `test` keeps its original dtypes for
# the Ridge pipeline that predicts on it later, and the cell can be re-run safely.
test_encoded = test[features].astype({col: str for col in cat_features})
X_test = woe.transform(test_encoded)

In [None]:
# Test-set probabilities from each booster: the second column of predict_proba,
# i.e. the predicted probability of class 1 (Attrition).
cb_preds = model.predict_proba(test_pool)[:,1]
xgb_preds = clf.predict_proba(X_test)[:,1]

In [None]:
def preds_plot(preds, title='Distribution of predictions', bins=100):
    """Show a histogram of predicted values.

    Parameters
    ----------
    preds : array-like
        Values to plot (the notebook passes predicted probabilities).
    title : str, optional
        Figure title. Defaults to the original fixed title.
    bins : int, optional
        Number of histogram bins. Defaults to the original fixed value of 100.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Use an explicit Axes so every call targets this figure, not whatever is "current".
    fig, ax = plt.subplots(figsize=(15, 7))
    ax.set_title(title,
                 size=25, y=1.03, fontname='Calibri',
                 fontweight='bold', color='#444444')
    sns.histplot(preds, color='#72bfd6', bins=bins, ax=ax)

    # Strip axis labels, y ticks and three of the spines for a minimal look.
    ax.set_ylabel('')
    ax.set_xlabel('')
    plt.setp(ax.get_xticklabels(), fontname='Calibri', size=12)
    ax.set_yticks([])
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)
    plt.show()

In [None]:
# Histogram of the XGBoost test-set probabilities.
preds_plot(xgb_preds)

In [None]:
# Histogram of the CatBoost test-set probabilities.
preds_plot(cb_preds)

In [None]:
# Ridge predictions for the test set (despite the name, these are not validation predictions).
# Ridge is a regressor, so these are unbounded scores rather than probabilities.
y_val_test = grid_search.predict(test)

In [None]:
# Weighted blend of the three models: 70% CatBoost, 10% Ridge, 20% XGBoost (weights sum to 1).
submission['Attrition']=(0.7*cb_preds)+(0.1*y_val_test)+(0.2*xgb_preds)

In [None]:
# Display the submission frame with the blended 'Attrition' column.
submission

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
submission.to_csv('submission.csv', index=False)