In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder,TargetEncoder
from sklearn.metrics import confusion_matrix

In [2]:
# Read the salary dataset from the notebook's working directory and display it.
salaries_path = 'salaries.csv'
df = pd.read_csv(salaries_path)
df

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,abc pharma,sales executive,masters,0
7,abc pharma,computer programmer,bachelors,0
8,abc pharma,business manager,bachelors,0
9,abc pharma,business manager,masters,1


In [3]:
# Features: the three categorical columns. Target: the binary salary flag column.
X = df.loc[:, ['company', 'job', 'degree']]
Y = df['salary_more_then_100k']

In [4]:
# Encode the categorical features into a numeric matrix:
#   company, job -> one-hot columns; degree -> single ordinal column
#   (masters = 0, bachelors = 1, as fixed by the explicit category order).
ct = ColumnTransformer(
    transformers=[
        ('encoder1', OneHotEncoder(), ['company']),
        ('encoder2', OneHotEncoder(), ['job']),
        ('encoder3', OrdinalEncoder(categories=[['masters', 'bachelors']]), ['degree']),
    ],
    # Always return a dense ndarray: the following cells slice and concatenate
    # the result with numpy, which would break if the stacked output came back
    # sparse (the default threshold makes that depend on the data's density).
    sparse_threshold=0,
)
# Fit on the raw frame rather than on `X`: this cell rebinds `X`, so reading
# from `df` keeps the cell idempotent when it is re-run.
X = ct.fit_transform(df[['company', 'job', 'degree']])
X

# 001: google, 100: abc pharma, 010: facebook
# 001: sales executive, 100: business manager, 010: computer programmer
# 1: bachelors, 0: masters

array([[0., 0., 1., 0., 0., 1., 1.],
       [0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0.]])

In [5]:
# Split the encoded matrix into its per-feature column groups and drop the first
# one-hot column of the company and job groups.
# Column ranges are read from the fitted transformer instead of hard-coded
# offsets, so they stay correct if the number of categories changes.
groups = ct.output_indices_
n_encoded = groups['encoder3'].stop  # encoder3 is the last transformer in `ct`
# `X` is rebound to the reduced matrix in a later cell; fail loudly instead of
# slicing the wrong columns if this cell is re-run after that.
assert X.shape[1] == n_encoded, (
    f"expected the {n_encoded}-column output of `ct`, got {X.shape[1]} columns; "
    "re-run the encoding cell first"
)
X1 = X[:, groups['encoder1']][:, 1:]  # drop abc pharma
X2 = X[:, groups['encoder2']][:, 1:]  # drop business manager
X3 = X[:, groups['encoder3']]
print(X1, X2, X3, sep='\n\n')

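# Encoding before the first column of each one-hot group is dropped:
# 001: google, 100: abc pharma, 010: facebook
# 001: sales executive, 100: business manager, 010: computer programmer
# 1: bachelors, 0: masters
# After the drop (as printed below): 01: google, 00: abc pharma, 10: facebook
# 01: sales executive, 00: business manager, 10: computer programmer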

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]

[[0. 1.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 0.]
 [0. 0.]
 [0. 1.]
 [0. 1.]
 [0. 0.]
 [0. 0.]
 [1. 0.]
 [1. 0.]]

[[1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]]


In [6]:
# Reassemble the reduced feature groups side by side into one feature matrix.
reduced_groups = [X1, X2, X3]
X = np.concatenate(reduced_groups, axis=1)
X

array([[0., 1., 0., 1., 1.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 1., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 0.]])

In [7]:
# Show the target labels as a plain numpy array.
Y.to_numpy()

array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [8]:
# Use half of the rows for training and the rest for testing; the fixed seed
# keeps the split repeatable.
split_options = dict(train_size=0.5, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

### DecisionTreeClassifier

In [9]:
# Fit a decision tree on the training half.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so an unseeded tree (and the scores reported below) can differ
# from run to run.
DTC = DecisionTreeClassifier(random_state=0)
DTC.fit(X_train, Y_train)

In [14]:
# Training split: predicted labels, mean accuracy, and per-class probabilities.
Yp_train = DTC.predict(X_train)
train_accuracy = DTC.score(X_train, Y_train)
train_proba = DTC.predict_proba(X_train)
print(train_accuracy, train_proba, Yp_train, sep='\n')

1.0
[[0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]
[1 0 1 1 1 0 1 1]


In [15]:
# Held-out split: predicted labels, mean accuracy, and per-class probabilities.
Yp_test = DTC.predict(X_test)
test_accuracy = DTC.score(X_test, Y_test)
test_proba = DTC.predict_proba(X_test)
print(test_accuracy, test_proba, Yp_test, sep='\n')

0.625
[[0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]
[1 1 0 1 1 0 0 1]
