In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
df_census = pd.read_csv('../Bases de dados/census.csv')
x_census = df_census.iloc[:,0:14].values
y_census = df_census.iloc[:,14].values

In [3]:
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital_status = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_native_country = LabelEncoder()

x_census[:,1] = label_encoder_workclass.fit_transform(x_census[:,1])
x_census[:,3] = label_encoder_education.fit_transform(x_census[:,3])
x_census[:,5] = label_encoder_marital_status.fit_transform(x_census[:,5])
x_census[:,6] = label_encoder_occupation.fit_transform(x_census[:,6])
x_census[:,7] = label_encoder_relationship.fit_transform(x_census[:,7])
x_census[:,8] = label_encoder_race.fit_transform(x_census[:,8])
x_census[:,9] = label_encoder_sex.fit_transform(x_census[:,9])
x_census[:,13] = label_encoder_native_country.fit_transform(x_census[:,13])

In [4]:
x_census

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [5]:
scaler_census = StandardScaler()
x_census_scaler = scaler_census.fit_transform(x_census)
x_census

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [6]:
x_census_train,x_census_test,y_census_train,y_census_test = train_test_split(x_census_scaler,y_census,test_size=0.15,random_state=0)

In [7]:
x_census_train.shape,y_census_train.shape

((27676, 14), (27676,))

In [8]:
x_census_test.shape,y_census_test.shape

((4885, 14), (4885,))

In [9]:
x_census_test

array([[-0.84908045,  0.09005041, -0.11993902, ..., -0.21665953,
         0.28852962,  0.29156857],
       [-0.84908045,  0.09005041,  0.25298957, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [-0.99570562,  0.09005041,  0.62989738, ..., -0.21665953,
        -0.03542945,  0.29156857],
       ...,
       [-1.28895595,  0.09005041, -0.12885437, ..., -0.21665953,
        -1.97918382,  0.29156857],
       [-0.6291427 ,  0.09005041, -1.03415505, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [ 0.25060831,  0.09005041,  0.11100726, ..., -0.21665953,
        -0.03542945,  0.29156857]])

PCA (Principal Components Analysis)

In [10]:
pca = PCA(n_components=8)

In [11]:
x_census_train_pca = pca.fit_transform(x_census_train)
x_census_test_pca = pca.transform(x_census_test)

In [12]:
x_census_train_pca.shape, x_census_test_pca.shape

((27676, 8), (4885, 8))

In [13]:
x_census_test

array([[-0.84908045,  0.09005041, -0.11993902, ..., -0.21665953,
         0.28852962,  0.29156857],
       [-0.84908045,  0.09005041,  0.25298957, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [-0.99570562,  0.09005041,  0.62989738, ..., -0.21665953,
        -0.03542945,  0.29156857],
       ...,
       [-1.28895595,  0.09005041, -0.12885437, ..., -0.21665953,
        -1.97918382,  0.29156857],
       [-0.6291427 ,  0.09005041, -1.03415505, ..., -0.21665953,
        -0.03542945,  0.29156857],
       [ 0.25060831,  0.09005041,  0.11100726, ..., -0.21665953,
        -0.03542945,  0.29156857]])

In [14]:
pca.explained_variance_ratio_

array([0.151561  , 0.10109701, 0.08980379, 0.08076277, 0.07627678,
       0.07357646, 0.06772289, 0.06690789])

In [15]:
pca.explained_variance_ratio_.sum()

np.float64(0.7077085943199322)

In [16]:
randomForest_Pca = RandomForestClassifier(n_estimators=40,criterion='entropy',random_state=0)

In [17]:
randomForest_Pca.fit(x_census_train_pca,y_census_train)

In [18]:
predictPca = randomForest_Pca.predict(x_census_test_pca)

In [19]:
accuracy_score(y_census_test,predictPca)

0.8343909928352098

Kernel PCA

In [20]:
kpca = KernelPCA(n_components=8,random_state=0,kernel='rbf')

In [21]:
x_census_train_kpca = kpca.fit_transform (x_census_train)
x_census_test_kpca = kpca.transform(x_census_test)

In [22]:
x_census_train_kpca

array([[ 0.18894283,  0.05517031, -0.08653531, ...,  0.41350358,
         0.06383854, -0.25147161],
       [ 0.16165577, -0.07855932, -0.03858728, ..., -0.01994034,
        -0.00338877, -0.29452204],
       [-0.085163  , -0.27486159,  0.12920466, ...,  0.19029531,
         0.04692292,  0.13187157],
       ...,
       [-0.27872683,  0.40033469,  0.20104327, ..., -0.17392304,
        -0.16108872,  0.00683154],
       [ 0.21829898, -0.12392098, -0.02312817, ..., -0.12231089,
         0.33364887,  0.25177643],
       [ 0.24961467,  0.42418174,  0.12141394, ..., -0.00408799,
        -0.18258341, -0.08239723]])

In [23]:
randomForest_KPCA = RandomForestClassifier(n_estimators=40,random_state=0,criterion='entropy')
randomForest_KPCA.fit(x_census_train_kpca,y_census_train)

In [24]:
predictKpca = randomForest_KPCA.predict(x_census_test_kpca)
accuracy_score(y_census_test,predictKpca)

0.8235414534288639

LDA (Linear Discriminant Analysis)

In [28]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [29]:
x_census_train_lda = lda.fit_transform(x_census_train,y_census_train)
x_census_test_lda = lda.transform(x_census_test)

In [30]:
x_census_train_lda.shape,x_census_test_lda.shape

((27676, 1), (4885, 1))

In [32]:
randomForest_LDA =RandomForestClassifier(n_estimators=40,random_state=0,criterion='entropy')
randomForest_LDA.fit(x_census_train_lda,y_census_train)

In [33]:
predictLda = randomForest_LDA.predict(x_census_test_lda)
accuracy_score(y_census_test,predictLda)

0.7334698055271238