# Redução de dimensionalidade

In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt 
import seaborn as srn
srn.set()

In [15]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Preparação da base de dados

### Base census

In [3]:
# Load the census data set.
# A forward slash is used on purpose: '\c' is not a valid escape sequence
# (newer Python versions warn about it) and a backslash is only a path
# separator on Windows, so the original literal was not portable.
base_census = pd.read_csv('dados/census.csv')
print(base_census.shape)
base_census.head()

(32561, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Columns 0..13 are the predictors; column 14 is the target.
n_feature_columns = 14
feature_frame = base_census.iloc[:, 0:n_feature_columns]
target_column = base_census.iloc[:, n_feature_columns]

X_census = feature_frame.values
y_census = target_column.values
X_census.shape, y_census.shape

((32561, 14), (32561,))

In [5]:
# Label-encode the categorical columns: each one gets its own encoder so the
# learned category -> integer mapping stays available per attribute.

label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

# (column position in X_census, encoder fitted on that column)
categorical_encoders = (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_country),
)

# The columns are overwritten in place with their integer codes.
for column_index, column_encoder in categorical_encoders:
    X_census[:, column_index] = column_encoder.fit_transform(X_census[:, column_index])

X_census.shape

(32561, 14)

In [6]:
# Standardize every column of the encoded feature matrix.
# NOTE(review): the scaler is fitted on the full matrix, before the train/test
# split performed in the next cell, so test rows contribute to the scaling
# statistics. Consider fitting on the training split only.
scaler_census = StandardScaler().fit(X_census)
X_census = scaler_census.transform(X_census)
X_census.shape

(32561, 14)

In [7]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
census_split = train_test_split(
    X_census,
    y_census,
    test_size=0.15,
    random_state=0,
)
X_census_train, X_census_test, y_census_train, y_census_test = census_split

print('train:', X_census_train.shape, y_census_train.shape)
print('test:', X_census_test.shape, y_census_test.shape)

train: (27676, 14) (27676,)
test: (4885, 14) (4885,)


## PCA - Principal Component Analysis

In [8]:
pca = PCA(n_components=8)   # reduce the 14 scaled features to 8 principal components

# The projection is learned from the training split only; the test split is
# mapped with that same fitted projection.
X_census_train_pca = pca.fit_transform(X_census_train)
X_census_test_pca = pca.transform(X_census_test)

In [9]:
# Sanity check: both splits should now have 8 columns (n_components=8).
X_census_train_pca.shape, X_census_test_pca.shape

((27676, 8), (4885, 8))

In [10]:
pca.explained_variance_ratio_  # share of the total variance captured by each principal component (not by each original attribute)

array([0.151561  , 0.10109701, 0.08980379, 0.08076277, 0.07627678,
       0.07357646, 0.06772289, 0.06690789])

In [11]:
pca.explained_variance_ratio_.sum()   # the 8 retained components together explain about 71% of the variance

0.7077085943199352

In [12]:
# Random forest (40 trees, fixed seed) trained on the PCA-projected training split.
random_forest_census_pca = RandomForestClassifier(random_state=0, n_estimators=40)
random_forest_census_pca.fit(X=X_census_train_pca, y=y_census_train)

RandomForestClassifier(n_estimators=40, random_state=0)

In [13]:
# Predict the class of every row in the PCA-projected test split.
# NOTE: `previsoes` is reassigned by the Kernel PCA and LDA sections further
# down, so the accuracy cell must be run right after this one.
previsoes = random_forest_census_pca.predict(X_census_test_pca)
previsoes.shape

(4885,)

In [14]:
# Fraction of test rows whose predicted class matches the true label (PCA pipeline).
accuracy_score(y_census_test, previsoes)

0.8364380757420675

## Kernel PCA

(parecido com o **kernel trick** do SVM)

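Não estou conseguindo rodar esta parte! (Provavelmente por custo de memória/tempo: o KernelPCA exato constrói uma matriz de kernel n × n sobre as 27.676 amostras de treino.)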

In [None]:
# Exact KernelPCA builds an n_samples x n_samples kernel matrix during fit
# (27676 x 27676 float64 values, roughly 6 GB, for this training split), which
# is the likely reason this cell could not be run. The kernel model is
# therefore fitted on a reproducible random subsample of the training rows,
# and the data is projected in batches to keep peak memory bounded.
KPCA_FIT_SAMPLES = 3000    # training rows used to fit the kernel model
KPCA_BATCH_ROWS = 2000     # rows projected per transform() call

# random_state makes the iterative eigen-solver reproducible.
kpca = KernelPCA(n_components=8, kernel='rbf', random_state=0)

kpca_fit_rows = numpy.random.RandomState(0).choice(
    X_census_train.shape[0],
    size=min(KPCA_FIT_SAMPLES, X_census_train.shape[0]),
    replace=False,
)
kpca.fit(X_census_train[kpca_fit_rows])


def _kpca_transform_in_batches(data):
    """Project ``data`` with the fitted ``kpca``, one batch of rows at a time."""
    batch_starts = range(0, data.shape[0], KPCA_BATCH_ROWS)
    return numpy.vstack([
        kpca.transform(data[start:start + KPCA_BATCH_ROWS])
        for start in batch_starts
    ])


X_census_train_kpca = _kpca_transform_in_batches(X_census_train)
X_census_test_kpca = _kpca_transform_in_batches(X_census_test)

In [None]:
# Sanity check: both projections should have 8 columns (n_components=8).
X_census_train_kpca.shape, X_census_test_kpca.shape

In [None]:
# Random forest (40 trees, entropy criterion, fixed seed) trained on the
# Kernel PCA projection of the training split.
random_forest_census_kpca = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    n_estimators=40,
)
random_forest_census_kpca.fit(X=X_census_train_kpca, y=y_census_train)

In [None]:
# Predict the class of every row in the Kernel PCA-projected test split.
# NOTE: this overwrites the `previsoes` produced in the PCA section.
previsoes = random_forest_census_kpca.predict(X_census_test_kpca)
previsoes.shape

In [None]:
# Fraction of test rows whose predicted class matches the true label (Kernel PCA pipeline).
accuracy_score(y_census_test, previsoes)

## LDA - Linear Discriminant Analysis

- Supervisionado

In [20]:
# LDA is supervised, so fitting needs the labels. n_components is capped at
# min(n_classes - 1, n_features); with two classes that leaves a single axis.
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(X_census_train, y_census_train)

# Project both splits with the discriminant learned from the training data.
X_census_train_lda = lda.transform(X_census_train)
X_census_test_lda = lda.transform(X_census_test)

In [21]:
# Sanity check: both projections should have a single column (n_components=1).
X_census_train_lda.shape, X_census_test_lda.shape

((27676, 1), (4885, 1))

In [22]:
# Random forest (40 trees, entropy criterion, fixed seed) trained on the
# one-dimensional LDA projection of the training split.
random_forest_census_lda = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    n_estimators=40,
)
random_forest_census_lda.fit(X=X_census_train_lda, y=y_census_train)

RandomForestClassifier(criterion='entropy', n_estimators=40, random_state=0)

In [23]:
# Predict the class of every row in the LDA-projected test split.
# NOTE: this overwrites the `previsoes` produced in the earlier sections.
previsoes = random_forest_census_lda.predict(X_census_test_lda)
previsoes.shape

(4885,)

In [24]:
# Fraction of test rows whose predicted class matches the true label (LDA pipeline).
accuracy_score(y_census_test, previsoes)

0.7334698055271238

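Mesmo assim, é bem alto considerando que só há um atributo... (Ressalva: convém comparar com a acurácia de sempre prever a classe majoritária antes de concluir que o resultado é bom.)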