##*Welcome to the exercise for the course Information Systems*

---



#1. Data Preprocessing

##1.1 Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

##1.2 Importing the Dataset

In [None]:
# Read the employee records from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='Employee.csv')

# Dump the frame to the cell output for a first look at the raw columns.
print(dataset)

      Education  JoiningYear       City  PaymentTier  Age  Gender EverBenched  \
0     Bachelors         2017  Bangalore            3   34    Male          No   
1     Bachelors         2013       Pune            1   28  Female          No   
2     Bachelors         2014  New Delhi            3   38  Female          No   
3       Masters         2016  Bangalore            3   27    Male          No   
4       Masters         2017       Pune            3   24    Male         Yes   
...         ...          ...        ...          ...  ...     ...         ...   
4648  Bachelors         2013  Bangalore            3   26  Female          No   
4649    Masters         2013       Pune            2   37    Male          No   
4650    Masters         2018  New Delhi            3   27    Male          No   
4651  Bachelors         2012  Bangalore            3   30    Male         Yes   
4652  Bachelors         2015  Bangalore            3   33    Male         Yes   

      ExperienceInCurrentDo

##1.3 Exploring the data to see what we are dealing with

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
dataset.info()

##1.4 Looking for Missing Data

In [None]:
# Count the missing entries in every column (shown as the cell result).
dataset.isna().sum()

##1.5 Isolating the variable we want to predict (dependent variable)

In [None]:
# Select the target by name rather than by position: later cells append
# encoded columns to `dataset`, so re-running this cell with iloc[:, -1]
# would silently pick one of those columns instead of the label.
y = dataset['LeaveOrNot'].values

##1.6 Creating the matrix of independent variables

###1.6.1 Label encoder for Education, Gender and EverBenched

In [None]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder is refit for each categorical column; every encoded
# result is stored beside the original column under "<column> Cat".
label_encoder = LabelEncoder()
for source_column in ('Education', 'EverBenched', 'Gender'):
    encoded = label_encoder.fit_transform(dataset[source_column])
    dataset[source_column + ' Cat'] = encoded

print(dataset)

###1.6.2 OneHotEncoder for City





In [None]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one hot encoder
# (not used below: the encoding itself is done with pd.get_dummies)
enc = OneHotEncoder()

# one hot encode the City feature. dtype=int keeps the indicator columns
# numeric (0/1): pandas >= 2.0 returns bool columns by default, and a frame
# that mixes bool with int/float columns converts to an object array, which
# Keras cannot turn into a tensor when the network is trained later on.
dum_city_dataset = pd.get_dummies(dataset['City'], dtype=int)

# concatenate the indicator columns onto the original frame
dataset = pd.concat([dataset, dum_city_dataset], axis='columns')

dataset.head()

###1.6.3 Deleting the Columns we don't need

In [None]:
# Feature matrix: everything except the raw categorical columns (their
# encoded counterparts stay), the target and the 'Pune' indicator.
# NOTE(review): dropping 'Pune' presumably makes it the reference city so
# the city indicators are not redundant -- confirm.
columns_to_drop = ['Education', 'City', 'Gender', 'EverBenched', 'LeaveOrNot', 'Pune']
X = dataset.drop(columns=columns_to_drop)

X

##1.7 Splitting the dataset into the training set and the test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#2. Naive Bayes Prediction Model

##2.1 Training the Naive Bayes Model on the Training Set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

##2.2 Predicting the Test Set Results

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

##2.3 Making the Confusion Matrix and Calculating the Accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Share of test rows classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

#3. K-NN Prediction Model

##3.1 Standardization of certain features

In [None]:
from sklearn.preprocessing import StandardScaler

# Only these three columns are standardized; the remaining columns are
# left as they are.
numeric_columns = ['JoiningYear', 'Age', 'ExperienceInCurrentDomain']

# Work on explicit copies so the column assignments below never write into
# (or warn about) slices of the original feature frame.
X_train = X_train.copy()
X_test = X_test.copy()

sc = StandardScaler()

# Learn mean / standard deviation from the training split only ...
X_train[numeric_columns] = sc.fit_transform(X_train[numeric_columns])

# ... and reuse exactly those statistics for the test split. Refitting the
# scaler here would scale the test rows with their own statistics, leaking
# test information and putting the two splits on different scales.
X_test[numeric_columns] = sc.transform(X_test[numeric_columns])

In [None]:
# Show the training features after standardization.
print(X_train)

In [None]:
# Show the test features after standardization.
print(X_test)

##3.2 Training the K-NN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours under the Minkowski metric with p=2
# (i.e. Euclidean distance).
knn_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_settings)
classifier.fit(X_train, y_train)

##3.3 Predicting the Test set results

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

##3.4 Making the Confusion Matrix and calculating accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the test split, displayed as the cell result.
accuracy_score(y_true=y_test, y_pred=y_pred)

#4. Random Forests Prediction Model

##4.1 Training the Random Forest Classification model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees using the entropy split criterion; the seed is fixed
# so the fitted forest is reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

##4.2 Predicting the Test set results

In [None]:
# Predict labels for the test split, then print the predictions (left
# column) side by side with the true labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

##4.3 Making the Confusion Matrix and calculating accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Share of test rows classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

#5. Logistic Regression Prediction Model

##5.1 Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, fitted on the training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

##5.2 Predicting the Test set results

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

##5.3 Making the Confusion Matrix and calculating accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the test split, displayed as the cell result.
accuracy_score(y_true=y_test, y_pred=y_pred)

#6. Support Vector Machine Prediction Model

##6.1 Training the SVM model on the Training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed,
# fitted on the training split.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X=X_train, y=y_train)

##6.2 Predicting the Test set results

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

##6.3 Making the Confusion Matrix and calculating accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Share of test rows classified correctly (shown as the cell result).
accuracy_score(y_true=y_test, y_pred=y_pred)

#7. Artificial Neural Networks

##7.1 Building the ANN

###7.1.1 Initializing the ANN

In [None]:
# Start from an empty Keras Sequential model; layers are added one by one.
ann = tf.keras.models.Sequential()

###7.1.2 Adding the input layer and the first hidden layer

In [None]:
# First hidden layer: 8 fully connected units with ReLU activation.
# No input shape is specified here.
ann.add(tf.keras.layers.Dense(units=8, activation='relu'))

###7.1.3 Adding the second hidden layer

In [None]:
# Second hidden layer: 6 fully connected units with ReLU activation.
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

###7.1.4 Adding the output layer

In [None]:
# Output layer: a single sigmoid unit, i.e. one value in (0, 1) per row.
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

##7.2 Training the ANN

###7.2.1 Compiling the ANN

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching
# the single sigmoid output), and accuracy reported as a metric.
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

###7.2.2 Training the ANN on the Training set

In [None]:
# Train on the training split for 100 epochs in mini-batches of 32 rows.
ann.fit(X_train, y_train, batch_size = 32, epochs = 100)

##7.3 Making the predictions and evaluating the model

###7.3.1 Predicting the Test set results

In [None]:
# Threshold the network outputs at 0.5 to obtain boolean class predictions.
y_pred = ann.predict(X_test) > 0.5

# Print the predictions (left column) beside the true labels (right column).
print(np.column_stack((y_pred, y_test)))

###7.3.2 Making the Confusion Matrix and calculating accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Tabulate true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the test split, displayed as the cell result.
accuracy_score(y_true=y_test, y_pred=y_pred)