# MACHINE LEARNING

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# Read the Adult census-income CSV from the working directory and preview
# the first rows to sanity-check the parse.
data = pd.read_csv(filepath_or_buffer='adult.csv')
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
# Column dtypes and non-null counts: used to check for missing values before preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### DATA PREPROCESSING

I will first convert '?' into 'Not available'.

In [5]:
# Treat the '?' placeholder as a missing value, then give every missing entry
# the explicit label 'Not available'.
# DataFrame.replace matches values column by column, so the integer columns are
# never compared element-wise against a string. The boolean-mask form
# `data[data == '?'] = np.nan` performs that comparison over the whole frame
# and emits a comparison warning on the numeric columns.
data = data.replace('?', np.nan).fillna('Not available')
data.head()

  result = method(y)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Not available,77053,HS-grad,9,Widowed,Not available,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Not available,186061,Some-college,10,Widowed,Not available,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
# Split the frame into three views: categorical features, numeric features
# and the target column.
cat_col = data.loc[:, [
    'workclass', 'education', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'native.country',
]]

num_cal = data.loc[:, [
    'age', 'fnlwgt', 'education.num',
    'capital.gain', 'capital.loss', 'hours.per.week',
]]

# A list selector keeps the target as a one-column DataFrame (not a Series).
target_col = data.loc[:, ['income']]

I applied dummy (one-hot) encoding to the categorical columns only, converting their string values into numeric indicator columns.

In [7]:
# One indicator column per category level; drop_first=True omits the first
# level of each feature so the indicators are not perfectly collinear.
cat_col = pd.get_dummies(data=cat_col, drop_first=True)
cat_col.head(5)

Unnamed: 0,workclass_Local-gov,workclass_Never-worked,workclass_Not available,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


I also standardized the numerical columns. Variables that are measured on different scales do not contribute equally to the analysis and might end up creating a bias.

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize every numeric feature to zero mean and unit variance.
# The column labels and the row index are taken from the incoming frame rather
# than from a second hard-coded list: the labels cannot drift from the
# selection made earlier, and the rows stay aligned when the frames are
# concatenated column-wise later on.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics leak into the features; consider fitting on the training
# rows only.
sc = StandardScaler()
num_cal = pd.DataFrame(
    data=sc.fit_transform(num_cal),
    columns=num_cal.columns,
    index=num_cal.index,
)
num_cal.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,3.769612,-1.067997,-0.42006,-0.14592,10.593507,-0.035429
1,3.183112,-0.539169,-0.42006,-0.14592,10.593507,-1.817204
2,2.01011,-0.03522,-0.03136,-0.14592,10.593507,-0.035429
3,1.130359,-0.468215,-2.363558,-0.14592,9.461864,-0.035429
4,0.177296,0.709482,-0.03136,-0.14592,9.461864,-0.035429


I merged the three groups of columns (numeric, target and categorical) into a single dataset that is ready for the ML algorithms below.

In [9]:
# Stitch the scaled numeric features, the target and the dummy-encoded
# categorical features back together side by side (rows matched on index).
data = pd.concat(
    [num_cal, target_col, cat_col],
    axis='columns',
)
data.head(5)

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,income,workclass_Local-gov,workclass_Never-worked,workclass_Not available,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
0,3.769612,-1.067997,-0.42006,-0.14592,10.593507,-0.035429,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,3.183112,-0.539169,-0.42006,-0.14592,10.593507,-1.817204,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2.01011,-0.03522,-0.03136,-0.14592,10.593507,-0.035429,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1.130359,-0.468215,-2.363558,-0.14592,9.461864,-0.035429,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.177296,0.709482,-0.03136,-0.14592,9.461864,-0.035429,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Setting feature and target arrays: everything except `income` is a feature,
# `income` itself is the label.
y = data['income'].values
x = data.drop(columns=['income']).values

### LOGISTIC REGRESSION

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=38,
)

In [12]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, including any raised by later cells.
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression, trained on the training split
# (fit returns the estimator itself, so it can be chained).
logr = LogisticRegressionCV(random_state=38).fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = logr.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[6908,  536],
       [ 930, 1395]])

In [13]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.85

### SUPPORT VECTOR MACHINE

In [14]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the training split.
svc = SVC(kernel='linear', random_state=38).fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[6944,  500],
       [1005, 1320]])

In [15]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.85

### KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours vote over the 5 closest training rows (Minkowski metric).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski').fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[6729,  715],
       [ 903, 1422]])

In [17]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.83

### DECISION TREE

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree that chooses splits by entropy; the seed fixes tie-breaking.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=38).fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[6515,  929],
       [ 868, 1457]])

In [19]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.82

### RANDOM FOREST 

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(random_state=38).fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[6934,  510],
       [ 994, 1331]])

In [21]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.85

### GRADIENT BOOSTING CLASSIFIER

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with library-default hyperparameters and a fixed seed.
gbc = GradientBoostingClassifier(random_state=38).fit(X_train, y_train)

# Predict the held-out rows and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = gbc.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[7041,  403],
       [ 911, 1414]])

In [23]:
# Share of test rows where the current y_pred matches y_test, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.87

### DIMENSION REDUCTION

### PCA

In [86]:
from sklearn.decomposition import PCA

# Project the features onto 30 principal components. PCA is fit on the
# training features alone (no labels are passed, unlike LDA) and the same
# projection is then applied to the test features.
pca = PCA(n_components=30)
X_train2 = pca.fit_transform(X_train)
X_test2 = pca.transform(X_test)

# Baseline: logistic regression on the full feature set.
logr = LogisticRegressionCV(random_state=38).fit(X_train, y_train)
y_pred = logr.predict(X_test)

# Same model family trained on the PCA-reduced features.
logr_pca = LogisticRegressionCV(random_state=38).fit(X_train2, y_train)
y_pred2 = logr_pca.predict(X_test2)

# Confusion matrix and accuracy of the baseline (before PCA).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

0.85

In [87]:
# Confusion matrix and accuracy of the model trained on PCA-reduced features.
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
round(accuracy_score(y_true=y_test, y_pred=y_pred2), 2)

0.85

In [88]:
# Agreement between the two models rather than accuracy against ground truth:
# the predictions held in y_pred take the place of the "true" labels and
# y_pred2 is scored against them.
cm3 = confusion_matrix(y_true=y_pred, y_pred=y_pred2)
round(accuracy_score(y_true=y_pred, y_pred=y_pred2), 2)

0.98

In [89]:
# Show the three confusion matrices one after another, separated by blank lines.
print(
    'cm = {}\n\n'
    'cm2 = {}\n\n'
    'cm3 = {}'.format(cm, cm2, cm3)
)

cm = [[6908  536]
 [ 930 1395]]

cm2 = [[6903  541]
 [ 969 1356]]

cm3 = [[7736  102]
 [ 136 1795]]


### LDA

In [90]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# LDA can produce at most min(n_features, n_classes - 1) discriminant axes.
# The target here has two classes (the confusion matrices are 2x2), so the
# only valid projection is onto a single component. Requesting 15 is rejected
# with a ValueError by current scikit-learn releases.
lda = LDA(n_components=1)

# Unlike PCA, LDA is supervised: fitting needs the training labels as well.
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Logistic regression trained on the LDA-reduced feature.
logr_lda = LogisticRegressionCV(random_state=38)
logr_lda.fit(X_train_lda, y_train)

y_pred_lda = logr_lda.predict(X_test_lda)

# Agreement between the LDA-based predictions and whatever predictions are
# currently held in y_pred (expected: the logistic-regression baseline from
# the PCA section). This is not accuracy against the true labels.
round(accuracy_score(y_pred, y_pred_lda), 2)

0.96

In [91]:
# Test-set accuracy of the model trained on the LDA-reduced feature, to two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_pred_lda), 2)

0.84