In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import math 

from sklearn.ensemble import IsolationForest

In [2]:
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

## Feature engineering

##### FiberID 별로 관측된 천체 타입의 종류가 다름을 인지하고 이에 따른 Typenum 변수 생성
##### 특히 640? 이후로는 모든 변수가 QSO임을 확인하고 이를 제외하고 train하기로 마음먹음

In [3]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))



In [4]:
# Make Fiber tpye
fiber = []
label_dict = {}

for i in list(set(train['fiberID'])):
    train2 = train[train['fiberID']==i]
    #test2 = test[test['fiberID']==i]
    #idx2 = test2.index
    idx1 = train2.index
    #print(set(train.loc[idx1]['type_num']))
    if tuple(set(train.loc[idx1]['type_num'])) not in fiber:
        fiber.append(tuple(set(train.loc[idx1]['type_num'])))
    label_dict[i] = list(set(train.loc[idx1]['type_num']))
    
    
fiber_type = {}
for k in range(len(fiber)):
    fiber_type[fiber[k]] = k
    
fiber_dict = {}  
for j in list(set(train['fiberID'])):
    train2 = train[train['fiberID']==j]
    #test2 = test[test['fiberID']==i]
    idx1 = train2.index
    
    #print(set(train.loc[idx1]['type_num']))
    fiber_dict[j] = fiber_type[tuple(set(train.loc[idx1]['type_num']))]



In [5]:
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = train.append(test)

In [6]:
all_data['fiberType'] = all_data['fiberID']
Fill_Type = lambda x: fiber_dict[x]
all_data['fiberType'] = all_data['fiberType'].apply(Fill_Type)

측정 별 변수

In [7]:
psfMags = ['psfMag_u','psfMag_g','psfMag_r','psfMag_i','psfMag_z']
fiberMags = ['fiberMag_u','fiberMag_g','fiberMag_r','fiberMag_i','fiberMag_z']
petroMags = ['petroMag_u','petroMag_g','petroMag_r','petroMag_i','petroMag_z']
modelMags = ['modelMag_u','modelMag_g','modelMag_r','modelMag_i','modelMag_z']

파장 별 변수

In [8]:
u_values = ['psfMag_u','fiberMag_u','petroMag_u','modelMag_u']
g_values = ['psfMag_g','fiberMag_g','petroMag_g','modelMag_g']
r_values = ['psfMag_r','fiberMag_r','petroMag_r','modelMag_r']
i_values = ['psfMag_i','fiberMag_i','petroMag_i','modelMag_i']
z_values = ['psfMag_z','fiberMag_z','petroMag_z','modelMag_z']

## 파장 변수 Engineering
##### 강경수님의 토론 글을 읽고 구글링을 통해 u-r,u-g,g-r,g-z,g-i 등의 변수가  천체 분류에 효과적이라는 사실을 알아냄  50위권>10위권 상승


##### 관련 자료 
##### http://dspace.ewha.ac.kr/handle/2015.oak/211793
##### http://www.koreascience.or.kr/article/CFKO201023859828644.pub

##### *** 기타 변수들은 임의로 만들어봄

In [9]:
u_r = ['psfMag_u-r', 'fiberMag_u-r','petroMag_u-r','modelMag_u-r']
u_g = ['psfMag_u-g','fiberMag_u-g',  'petroMag_u-g','modelMag_u-g']
g_r = ['psfMag_g-r', 'fiberMag_g-r',  'petroMag_g-r', 'modelMag_g-r']      
g_z = ['psfMag_g-z','fiberMag_g-z', 'petroMag_g-z','modelMag_g-z'] 
g_i = ['psfMag_g-i','fiberMag_g-i','petroMag_g-i', 'modelMag_g-i']

u_i = ['psfMag_u-i','fiberMag_u-i','petroMag_u-i', 'modelMag_u-i']
u_z = ['psfMag_u-z','fiberMag_u-z','petroMag_u-z', 'modelMag_u-z']
r_i = ['psfMag_r-i','fiberMag_r-i','petroMag_r-i', 'modelMag_r-i']
r_z = ['psfMag_r-z','fiberMag_r-z','petroMag_r-z', 'modelMag_r-z']
i_z = ['psfMag_i-z','fiberMag_i-z','petroMag_i-z', 'modelMag_i-z']

In [10]:
sumur = ['psfMag_u+r', 'fiberMag_u+r','petroMag_u+r','modelMag_u+r']
sumug = ['psfMag_u+g','fiberMag_u+g',  'petroMag_u+g','modelMag_u+g']
sumgr = ['psfMag_g+r', 'fiberMag_g+r',  'petroMag_g+r', 'modelMag_g+r']      
sumgz = ['psfMag_g+z','fiberMag_g+z', 'petroMag_g+z','modelMag_g+z'] 
sumgi = ['psfMag_g+i','fiberMag_g+i','petroMag_g+i', 'modelMag_g+i']

sumui = ['psfMag_u+i','fiberMag_u+i','petroMag_u+i', 'modelMag_u+i']
sumuz = ['psfMag_u+z','fiberMag_u+z','petroMag_u+z', 'modelMag_u+z']
sumri = ['psfMag_r+i','fiberMag_r+i','petroMag_r+i', 'modelMag_r+i']
sumrz = ['psfMag_r+z','fiberMag_r+z','petroMag_r+z', 'modelMag_r+z']
sumiz = ['psfMag_i+z','fiberMag_i+z','petroMag_i+z', 'modelMag_i+z']

In [11]:
cols = all_data.columns.drop(['type','type_num','fiberType','fiberID'])
pca = PCA(n_components = 2)

x = all_data[cols].values
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['all_pca1','all_pca2'])
all_data = pd.concat([all_data,principalDf],axis =1)

In [12]:
for i in range(4):
    all_data[str(u_values[i])+'-'+'r'] = all_data[u_values[i]]-all_data[r_values[i]]
    all_data[str(u_values[i])+'-'+'g'] = all_data[u_values[i]]-all_data[g_values[i]]
    all_data[str(g_values[i])+'-'+'r'] = all_data[g_values[i]]-all_data[r_values[i]]
    all_data[str(g_values[i])+'-'+'z'] = all_data[g_values[i]]-all_data[z_values[i]]
    all_data[str(g_values[i])+'-'+'i'] = all_data[g_values[i]]-all_data[i_values[i]]
    
    all_data[str(u_values[i])+'-'+'i'] = all_data[u_values[i]]-all_data[i_values[i]]
    all_data[str(u_values[i])+'-'+'z'] = all_data[u_values[i]]-all_data[z_values[i]]
    all_data[str(r_values[i])+'-'+'z'] = all_data[r_values[i]]-all_data[z_values[i]]
    all_data[str(r_values[i])+'-'+'i'] = all_data[r_values[i]]-all_data[i_values[i]]
    all_data[str(i_values[i])+'-'+'z'] = all_data[i_values[i]]-all_data[z_values[i]]
        
        
        
    all_data[str(u_values[i])+'+'+'r'] = all_data[u_values[i]]+all_data[r_values[i]]
    all_data[str(u_values[i])+'+'+'g'] = all_data[u_values[i]]+all_data[g_values[i]]
    all_data[str(g_values[i])+'+'+'r'] = all_data[g_values[i]]+all_data[r_values[i]]
    all_data[str(g_values[i])+'+'+'z'] = all_data[g_values[i]]+all_data[z_values[i]]
    all_data[str(g_values[i])+'+'+'i'] = all_data[g_values[i]]+all_data[i_values[i]]
    
    all_data[str(u_values[i])+'+'+'i'] = all_data[u_values[i]]+all_data[i_values[i]]
    all_data[str(u_values[i])+'+'+'z'] = all_data[u_values[i]]+all_data[z_values[i]]
    all_data[str(r_values[i])+'+'+'z'] = all_data[r_values[i]]+all_data[z_values[i]]
    all_data[str(r_values[i])+'+'+'i'] = all_data[r_values[i]]+all_data[i_values[i]]
    all_data[str(i_values[i])+'+'+'z'] = all_data[i_values[i]]+all_data[z_values[i]]

In [13]:
val_groups = [psfMags,fiberMags,petroMags,modelMags,u_values,g_values,r_values,i_values,z_values,u_r,u_g,g_r,g_z,g_i,u_i,u_z,r_i,r_z,i_z,sumur, sumug, sumgr,  sumgz, sumgi, sumui, sumuz, sumri ,sumrz ,sumiz]
group_names = ['psf','fiber','petro','model','u','g','r','i','z','u-r','u-g','g-r','g-z','g-i','u-i','u-z','r-i','r-z','i-z','sumur', 'sumug', 'sumgr',  'sumgz', 'sumgi', 'sumui', 'sumuz', 'sumri' ,'sumrz' ,'sumiz']

In [14]:
pca = PCA(n_components = 3)

for i in range(len(val_groups)):
    
    all_data[str(group_names[i])+'_mean'] = all_data[val_groups[i]].mean(axis =1)
    all_data[str(group_names[i])+'_std'] = all_data[val_groups[i]].std(axis =1)

    x = all_data[val_groups[i]].values
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents
             , columns = [str(group_names[i]) + '_pca1', str(group_names[i]) + '_pca2',str(group_names[i]) + '_pca3'])
    #print(group_names[i])
    #print('explained variance ratio :', pca.explained_variance_ratio_)

    
    
    all_data = pd.concat([all_data,principalDf],axis =1)

#### 트리 구조 특성상 더 잘 학습시키기 위해 45 Rotation한 변수 생성

##### ***행렬 곱을 이용해 더 효과적인 코드를 짰지만 무슨 이유인지 성능이 좋지 않아 이 방법을 씀

In [16]:
def RotateXY(x,y,xc=0,yc=0,angle=0,units="DEGREES"):  
    """Rotate an xy cooordinate about a specified origin  
  
    x,y      xy coordinates  
    xc,yc   center of rotation  
    angle   angle  
    units    "DEGREES" (default) or "RADIANS"  
    """  
    import math  
    x = x - xc  
    y = y - yc  
    if units == "DEGREES":  
        angle = math.radians(angle)  
    xr = (x * math.cos(angle)) - (y * math.sin(angle)) + xc  
    yr = (x * math.sin(angle)) + (y * math.cos(angle)) + yc  
    return xr, yr  

In [17]:
import datetime
for i in range(4):
    start = datetime.datetime.now()
    u_g_1 = []
    u_g_2 = []
    g_r_1 = []
    g_r_2 = []
    g_z_1 = []
    g_z_2 = []
    g_i_1 = []
    g_i_2 = []
    r_i_1 = []
    r_i_2 = []
        
    for k in range(len(all_data)):
        
        Rotate = RotateXY(all_data[str(u_values[i])+'-'+'g'].loc[k],all_data[str(r_values[i])].loc[k],xc=0,yc=0,angle=45,units="DEGREES")
        u_g_1.append(Rotate[0])
        u_g_2.append(Rotate[1])
        
        Rotate = RotateXY(all_data[str(g_values[i])+'-'+'r'].loc[k],all_data[str(r_values[i])].loc[k],xc=0,yc=0,angle=45,units="DEGREES")
        g_r_1.append(Rotate[0])
        g_r_2.append(Rotate[1])
        
        
        Rotate = RotateXY(all_data[str(g_values[i])+'-'+'z'].loc[k],all_data[str(r_values[i])].loc[k],xc=0,yc=0,angle=45,units="DEGREES")
        g_z_1.append(Rotate[0])
        g_z_2.append(Rotate[1])
        
        Rotate = RotateXY(all_data[str(g_values[i])+'-'+'i'].loc[k],all_data[str(r_values[i])].loc[k],xc=0,yc=0,angle=45,units="DEGREES")
        g_i_1.append(Rotate[0])
        g_i_2.append(Rotate[1])
        
        Rotate = RotateXY(all_data[str(r_values[i])+'-'+'i'] .loc[k],all_data[str(r_values[i])].loc[k],xc=0,yc=0,angle=45,units="DEGREES")
        r_i_1.append(Rotate[0])
        r_i_2.append(Rotate[1])   
        
    ug1= pd.DataFrame(data = u_g_1
             , columns = [str(u_values[i])+'-'+'g'+'r451'])
    ug2= pd.DataFrame(data = u_g_2
             , columns = [str(u_values[i])+'-'+'g'+'r452'])
       
    gr1= pd.DataFrame(data = g_r_1
             , columns = [str(g_values[i])+'-'+'r'+'r451'])
    gr2= pd.DataFrame(data = g_r_2
             , columns = [str(g_values[i])+'-'+'r'+'r452'])
        
    gz1= pd.DataFrame(data = g_z_1
             , columns = [str(g_values[i])+'-'+'z'+'r451'])
    gz2= pd.DataFrame(data = g_z_2
             , columns = [str(g_values[i])+'-'+'z'+'r452'])
        
        
    gi1= pd.DataFrame(data = g_i_1
             , columns = [str(g_values[i])+'-'+'i'+'r451'])
    gi2= pd.DataFrame(data = g_i_2
             , columns = [str(g_values[i])+'-'+'i'+'r452'])   
    
    ri1= pd.DataFrame(data = r_i_1
             , columns = [str(r_values[i])+'-'+'i'+'r451'])
    ri2= pd.DataFrame(data = r_i_2
             , columns = [str(r_values[i])+'-'+'i'+'r452'])
             
    
    all_data = pd.concat([all_data,ug1,ug2,gr1,gr2,gz1,gz2,gi1,gi1,ri1,ri2],axis =1)
    runtime = datetime.datetime.now() - start
    break    

1
2
0:01:05.265786
3


In [19]:
train= all_data[:ntrain]
test= all_data[ntrain:]


## Train set에서 FiberType ==34인것 제외

##### output이 모두 QSO인 FiberID들은 train에서 제외하고 추후에 QSO 확률 1,나머지 0으로 채워넣음

In [20]:
y = train['type_num']
train = train.drop(columns=['type','type_num'], axis=1)
train1 = train[train['fiberType'] != 34]
train2 = train[train['fiberType'] == 34]
train1_idx = train1.index 
train2_idx = train2.index
y1 = y.loc[train1_idx]

test = test.drop(['type','type_num'], axis=1)
test1 = test[test['fiberType'] != 34]
test2 = test[test['fiberType'] == 34]
test1_idx = test1.index
test2_idx = test2.index

## 학습 및 예측 (cv = 20)

In [None]:
from sklearn.model_selection import StratifiedKFold

kfold = 20
kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state = 42)
preds =[]
for train_index, test_index in kf.split(train1, y1):
    x_train, x_val = train1.iloc[train_index], train1.iloc[test_index]
    y_train, y_val = y1.iloc[train_index], y1.iloc[test_index]
    
    clf = lgb.LGBMClassifier(n_estimators = 10000, max_depth=10, learning_rate=0.009, objective='multiclass',num_class = 19
                             ,num_leaves = 1024,random_state=42,metric = 'multi_logloss',boost_from_average = False,
                            feature_fraction =0.21815809919426804,bagging_fraction = 0.8055711236002633,lambda_l1 = 0.0029896841778409566,
                             lambda_l2 = 0.06905300928232105,min_split_gain = 0.05931126989247223,min_child_weight = 10)
    clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)],
            early_stopping_rounds=50, verbose=100)
    
    
    
    # fiberID ==34 인 test_set에 QSO =1로 채워넣음
    ps = pd.DataFrame(data=clf.predict_proba(test), index=test.index)
    sam = sample_submission.copy()
    sam.loc[test1_idx] =ps.loc[test1_idx]
    sam.loc[test2_idx] = 0
    sam.loc[test2_idx,'QSO'] = 1
    

    preds.append(sam)

Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 1.05194	valid_1's multi_logloss: 1.0832
[200]	training's multi_logloss: 0.612898	valid_1's multi_logloss: 0.662993
[300]	training's multi_logloss: 0.431397	valid_1's multi_logloss: 0.496419
[400]	training's multi_logloss: 0.346796	valid_1's multi_logloss: 0.423711
[500]	training's multi_logloss: 0.302346	valid_1's multi_logloss: 0.389637
[600]	training's multi_logloss: 0.275687	valid_1's multi_logloss: 0.372187
[700]	training's multi_logloss: 0.257368	valid_1's multi_logloss: 0.362578
[800]	training's multi_logloss: 0.243561	valid_1's multi_logloss: 0.357024
[900]	training's multi_logloss: 0.232336	valid_1's multi_logloss: 0.353606
[1000]	training's multi_logloss: 0.223131	valid_1's multi_logloss: 0.351478
[1100]	training's multi_logloss: 0.214751	valid_1's multi_logloss: 0.350107
[1200]	training's multi_logloss: 0.206839	valid_1's multi_logloss: 0.349227
[1300]	training's multi_logloss: 0.1991

[100]	training's multi_logloss: 1.05158	valid_1's multi_logloss: 1.09038
[200]	training's multi_logloss: 0.612504	valid_1's multi_logloss: 0.672551
[300]	training's multi_logloss: 0.431011	valid_1's multi_logloss: 0.507331
[400]	training's multi_logloss: 0.346274	valid_1's multi_logloss: 0.435634
[500]	training's multi_logloss: 0.301848	valid_1's multi_logloss: 0.402435
[600]	training's multi_logloss: 0.275074	valid_1's multi_logloss: 0.386061
[700]	training's multi_logloss: 0.256645	valid_1's multi_logloss: 0.377086
[800]	training's multi_logloss: 0.24277	valid_1's multi_logloss: 0.371988
[900]	training's multi_logloss: 0.231512	valid_1's multi_logloss: 0.368848
[1000]	training's multi_logloss: 0.222031	valid_1's multi_logloss: 0.366927
[1100]	training's multi_logloss: 0.213666	valid_1's multi_logloss: 0.365834
[1200]	training's multi_logloss: 0.205669	valid_1's multi_logloss: 0.365087
[1300]	training's multi_logloss: 0.197826	valid_1's multi_logloss: 0.364601
[1400]	training's multi_

[400]	training's multi_logloss: 0.346497	valid_1's multi_logloss: 0.434098
[500]	training's multi_logloss: 0.302129	valid_1's multi_logloss: 0.399713
[600]	training's multi_logloss: 0.275573	valid_1's multi_logloss: 0.381787
[700]	training's multi_logloss: 0.257636	valid_1's multi_logloss: 0.371856
[800]	training's multi_logloss: 0.243927	valid_1's multi_logloss: 0.365916
[900]	training's multi_logloss: 0.232721	valid_1's multi_logloss: 0.362224
[1000]	training's multi_logloss: 0.223509	valid_1's multi_logloss: 0.359772
[1100]	training's multi_logloss: 0.215194	valid_1's multi_logloss: 0.358175
[1200]	training's multi_logloss: 0.207292	valid_1's multi_logloss: 0.357195
[1300]	training's multi_logloss: 0.199876	valid_1's multi_logloss: 0.356561
[1400]	training's multi_logloss: 0.192267	valid_1's multi_logloss: 0.356095
[1500]	training's multi_logloss: 0.185094	valid_1's multi_logloss: 0.355959
[1600]	training's multi_logloss: 0.178266	valid_1's multi_logloss: 0.355953
Early stopping, be

In [None]:
pred = sum(preds)/20
pred.index = sample_submission.index
pred.columns = sample_submission.columns
pred.to_csv('LGBM_20_Final.csv', index=True)