# Import all the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Load data from the csv file and validate

In [None]:
#Read the raw attrition records from the CSV file
employeeData = pd.read_csv('Data/EmployeeAttrition.csv')

#Print the column names, dtypes and non-null counts
employeeData.info()

#Plot the null mask as a heatmap (no colour bar) so missing cells are visible
sns.heatmap(employeeData.isna(), cbar=False)

#(rows, columns) of the loaded frame; shown as the cell output
employeeData.shape

# Exploring the data through visualization

In [None]:
#Histogram of the employees' ages
employeeData['Age'].plot(kind='hist')

In [None]:
#Histogram of the employees' monthly income
employeeData['MonthlyIncome'].plot(kind='hist')

In [None]:
#Count the employees in each Attrition category
sns.countplot(data=employeeData, x='Attrition')

In [None]:
#Attrition counts, split by gender
sns.countplot(data=employeeData, x='Attrition', hue='Gender')

In [None]:
#Attrition counts, split by marital status
sns.countplot(data=employeeData, x='Attrition', hue='MaritalStatus')

In [None]:
#Attrition counts, split by environment satisfaction level
sns.countplot(data=employeeData, x='Attrition', hue='EnvironmentSatisfaction')

In [None]:
#Attrition counts, split by education level
sns.countplot(data=employeeData, x='Attrition', hue='Education')

# Getting the data ready for testing

In [None]:
#One-hot encode every categorical (string) feature column. drop_first=True
#removes the first level of each column, so k categories give k-1 indicators.
categoricalColumns = ['Gender', 'BusinessTravel', 'Department', 'MaritalStatus',
                      'Over18', 'OverTime', 'EducationField', 'JobRole']
(Gender, BusinessTravel, Department, MaritalStatus,
 Over18, OverTime, EducationField, JobRole) = (
    pd.get_dummies(employeeData[column], drop_first=True)
    for column in categoricalColumns
)

#The target is encoded on its own; it is not concatenated with the features below
Attrition = pd.get_dummies(employeeData['Attrition'], drop_first=True)

#Append the indicator columns to the right of the original frame
encodedFrames = [Gender, BusinessTravel, Department, MaritalStatus,
                 Over18, OverTime, EducationField, JobRole]
employeeDataMod = pd.concat([employeeData] + encodedFrames, axis=1)
employeeDataMod.head()

#Remove the original string columns that the indicators replace, together with
#the EmployeeNumber and EmployeeCount columns
obsoleteColumns = ['Gender', 'BusinessTravel', 'Department', 'EmployeeNumber',
                   'EmployeeCount', 'JobRole', 'MaritalStatus',
                   'EducationField', 'Over18', 'OverTime']
employeeDataMod.drop(columns=obsoleteColumns, inplace=True)

#Inspect the resulting structure; the preview below is the cell output
employeeDataMod.info()
employeeDataMod.head()

# Testing the model

In [None]:
#Take the independent and dependent variables info from the polished data
X = employeeDataMod.drop(['Attrition'], axis=1)
y = Attrition

#Split the input and output data to testing and training sets.
#The split happens BEFORE scaling so that no statistic of the test rows can
#leak into the features the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

#Normalize the data to avoid ConvergenceWarning. The scaler learns its mean and
#standard deviation from the training rows only; the same transformation is
#then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Train the model (y_train is a single-column frame, so flatten it to 1-D)
logreg = LogisticRegression()
logreg.fit(X_train, y_train.values.ravel())

#Test the model
y_pred = logreg.predict(X_test)

#To verify the test results, two metrics are chosen.
#The weighted F1 is computed over every class present in the data. Restricting
#it to the labels that were actually predicted would silently leave out a
#class the model never predicts and overstate the score.
print(f"Accuracy score = {accuracy_score(y_test, y_pred)}")
print(f"F1 score = {f1_score(y_test, y_pred, average='weighted')}")

# Remarks

Other metrics can also be used, since no target threshold was defined for any specific metric. The accuracy and F1 scores are remarkably good. It should be noted that I tuned the parameters beforehand to improve accuracy.

                                                   ***The end***