# PCA 예제 
* 10000개 feature를 갖고 있는 arcene data set을 이용하여 PCA실습
 1. 데이터 불러오기
 2. 데이터와 레이블 합치기
 3. Logistic Model 생성
 4. PCA(주성분 분석)
 5. 주성분 분석을 한 후 Logistic Model 생성
 6. 차원 축소 하기 전후 모델 성능 비교


In [56]:
# 필요한 모듈 불러오기
import numpy as np
import pandas as pd

1. 데이터 불러오기

In [6]:
# First attempt: read the space-separated training features with pandas'
# default header handling (no explicit column names are supplied).
datapath = './data/arcene_train.data'
data1 = pd.read_csv(datapath, sep=' ')

# NOTE(review): in a notebook cell only the last expression is rendered, so
# the bare `data1.shape` below presumably produces no visible output.
data1.shape
data1.head()

Unnamed: 0,0,71,0.1,95,0.2,538,404,20,0.3,0.4,...,570.9,86.12,0.5762,36.31,0.5763,80.18,0.5764,0.5765,524,Unnamed: 10000
0,0,41,82,165,60,554,379,0,71,0,...,605,69,7,473,0,57,0,284,423,
1,0,0,1,40,0,451,402,0,0,0,...,593,28,0,24,0,90,0,34,508,
2,0,56,44,275,14,511,470,0,0,0,...,600,0,26,86,0,102,0,0,469,
3,105,0,141,348,0,268,329,0,0,1,...,0,0,0,0,190,301,0,0,354,
4,38,62,0,251,75,515,0,9,85,300,...,454,0,36,2,369,194,18,59,340,


※ 파일에 헤더가 없어 첫 번째 데이터 행이 컬럼명으로 읽혔고, 마지막 컬럼은 모두 NA 값이므로 `names`로 컬럼명을 직접 지정하고 마지막 컬럼을 삭제함

In [7]:
# Second attempt: supply explicit column names (f0 .. f10000) so the first
# data row is no longer consumed as a header.
datapath = './data/arcene_train.data'
data1 = pd.read_csv(
    datapath,
    sep=' ',
    names=list(map('f{}'.format, range(10001))),
)

# The extra last column (f10000) holds only NA values, so discard it.
data1 = data1.drop(columns=['f10000'])

data1.shape
data1.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f9990,f9991,f9992,f9993,f9994,f9995,f9996,f9997,f9998,f9999
0,0,71,0,95,0,538,404,20,0,0,...,255,570,86,0,36,0,80,0,0,524
1,0,41,82,165,60,554,379,0,71,0,...,213,605,69,7,473,0,57,0,284,423
2,0,0,1,40,0,451,402,0,0,0,...,235,593,28,0,24,0,90,0,34,508
3,0,56,44,275,14,511,470,0,0,0,...,91,600,0,26,86,0,102,0,0,469
4,105,0,141,348,0,268,329,0,0,1,...,813,0,0,0,0,190,301,0,0,354


In [33]:
# Helper for loading a feature file; used to load and merge the train and valid data
def read_data_file(filepath, n_features=10000):
    """Read a header-less, space-separated feature file into a DataFrame.

    The file is parsed with ``n_features + 1`` column names because an extra
    trailing column is expected after the real features (presumably caused by
    a trailing separator at the end of each row); that extra column is
    dropped before returning.

    Parameters
    ----------
    filepath : str or path-like
        Location of the feature file.
    n_features : int, default 10000
        Number of real feature columns per row. The default matches the
        column count previously hard-coded for this data set.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``f0`` .. ``f{n_features - 1}``.
    """
    # One name per feature plus one for the extra trailing column.
    names = ['f{}'.format(x) for x in range(n_features + 1)]

    data = pd.read_csv(filepath, sep=' ', names=names)

    # Drop the extra trailing column so only the real features remain.
    del data[names[-1]]

    return data

# Load the training and validation feature files with the same helper.
data1 = read_data_file('./data/arcene_train.data')
data2 = read_data_file('./data/arcene_valid.data')

# Stack the validation rows under the training rows; ignore_index=True
# discards the per-file row labels and renumbers the combined rows from 0.
merged_data = pd.concat([data1,data2],ignore_index=True)

merged_data.head()

In [36]:
# Same approach for the label files
def read_label_file(filepath):
    """Read a header-less label file into a DataFrame with one 'class' column."""
    column_names = ['class']
    return pd.read_csv(filepath, names=column_names)

# Load the training and validation label files with the same helper.
label1 = read_label_file('./data/arcene_train.labels')
label2 = read_label_file('./data/arcene_valid.labels')

# Concatenate in the same order as the features (train first, then valid)
# and renumber the rows from 0 so labels can later be aligned by position.
merged_label = pd.concat([label1,label2],ignore_index=True)

merged_label.head()

In [38]:
# Build the final data set: axis=1 places the 'class' label column next to
# the feature columns, matching rows by their index.
final_data = pd.concat([merged_data,merged_label],axis=1)
final_data

3. Logistic Model 생성

In [None]:
from sklearn.model_selection import KFold
# LogisticRegression is provided by sklearn.linear_model
# (there is no `sklearn.linear_regression` module).
from sklearn.linear_model import LogisticRegression

In [62]:
# Baseline: 10-fold cross-validated logistic regression on all raw features.
features = ['f{}'.format(x) for x in range(0,10000)]

# A fixed random_state makes the shuffled folds, and therefore the reported
# accuracy, reproducible from run to run; reusing the same seed in other
# experiments evaluates them on identical folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Test accuracy of each fold.
accrs = []

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):

    print('fold{}'.format(fold_idx))

    # kf.split yields positional indices, hence iloc.
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    train_y = train_d['class']
    train_x = train_d[features]

    test_y = test_d['class']
    test_x = test_d[features]

    model = LogisticRegression(solver='lbfgs', max_iter=300)
    model.fit(train_x, train_y)

    # score() returns the mean accuracy on the held-out fold.
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average accuracy over all folds.
print(np.average(accrs))

fold1
fold2
fold3
fold4
fold5
fold6
fold7
fold8
fold9
fold10
0.9099999999999999


In [64]:
from sklearn import preprocessing

# Re-encode the 'class' column in place as integer codes 0 .. n_classes-1;
# le_class keeps the fitted mapping back to the original label values.
le_class = preprocessing.LabelEncoder()
final_data['class'] = le_class.fit_transform(final_data['class'])

In [68]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# 10-fold cross-validated logistic regression on PCA-reduced features.
features = ['f{}'.format(x) for x in range(0,10000)]

# A fixed random_state makes the shuffled folds, and therefore the reported
# accuracy, reproducible from run to run; reusing the same seed in other
# experiments evaluates them on identical folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Test accuracy of each fold.
accrs = []

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):

    print('fold{}'.format(fold_idx))

    # kf.split yields positional indices, hence iloc.
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    # A float n_components keeps as many principal components as are needed
    # to explain that fraction (99%) of the variance.
    pca = PCA(n_components=0.99)

    # Fit the projection on the training fold only, so nothing from the
    # held-out fold influences the components.
    train_y = train_d['class']
    train_x = pca.fit_transform(train_d[features])

    # Shows how many components were retained for this fold.
    # (pca.explained_variance_ratio_ and pca.singular_values_ can be
    # inspected here for more detail.)
    print(train_x.shape)

    # Project the held-out fold with the projection learned on the training fold.
    test_y = test_d['class']
    test_x = pca.transform(test_d[features])

    model = LogisticRegression(solver='lbfgs', max_iter=300)
    model.fit(train_x, train_y)

    # score() returns the mean accuracy on the held-out fold.
    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average accuracy over all folds.
print(np.average(accrs))

fold1
(180, 146)
fold2
(180, 146)
fold3
(180, 146)
fold4
(180, 146)
fold5
(180, 146)
fold6
(180, 146)
fold7
(180, 146)
fold8
(180, 146)
fold9
(180, 146)
fold10
(180, 146)
0.9


※ 146개 feature로 비슷한 성능을 달성! 