In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

warnings.filterwarnings('ignore')

## 데이터셋 로드

In [2]:
# Read the training set, the test set and the submission template
# from the current working directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './sample_submission.csv')
)

In [3]:
# Separate the label: pop() removes 'nerdiness' from train in place and returns it.
# Not idempotent - a second run raises KeyError because the column is already gone.
target = train.pop('nerdiness')

## Feature 전처리

In [4]:
# Remove the 'country' column from both splits.
train, test = (frame.drop(['country'], axis=1) for frame in (train, test))

1. 이상치 처리

In [5]:
# Outlier handling.
# familysize: values above 14 are capped at 14.
# age: values strictly above 100 are replaced with the most frequent age in train
#      (the test split reuses the train mode as well).

# Replacing outliers with the mode cannot change the mode, so it is computed once up front.
age_mode = train['age'].mode()[0]

for frame in (train, test):
    too_large_family = frame['familysize'] > 14
    frame.loc[too_large_family, 'familysize'] = 14

    implausible_age = frame['age'] > 100
    frame.loc[implausible_age, 'age'] = age_mode

2. 결측치 처리

In [6]:
# Missing-value handling: in each column listed below, NaN is replaced with that
# column's most frequent value in train. The test split is filled with the train
# mode too, never with its own.
mode_filled_cols = [
    'education', 'urban', 'engnat', 'age', 'religion', 'voted',
    'orientation', 'familysize', 'gender', 'married', 'ASD',
]

for col in mode_filled_cols:
    # mode() ignores NaN, so filling train's gaps with the mode leaves the mode unchanged.
    train_mode = train[col].mode()[0]
    for frame in (train, test):
        frame.loc[frame[col].isna(), col] = train_mode

In [7]:
# Missing answers to the Q1..Q26 items are all replaced with 2.
q_items = ['Q{}'.format(k) for k in range(1, 27)]

train[q_items] = train[q_items].fillna(2)
test[q_items] = test[q_items].fillna(2)

In [8]:
# Safety net: any missing value still left in either split becomes 0.
train, test = train.fillna(0), test.fillna(0)

In [9]:
# Sanity check: per-column count of missing values left in train (expected to be all zeros).
train.isna().sum()

index          0
Q1             0
Q2             0
Q3             0
Q4             0
              ..
orientation    0
voted          0
married        0
familysize     0
ASD            0
Length: 68, dtype: int64

3. feature engineering

In [10]:
# Build the VCL (vocabulary check list) score.
# VCL6, VCL9 and VCL12 are words that do not exist, so they are left out of the sum.
# A respondent who claims to know two or more of them gets the score pinned to 13.

def score_vocab(df):
    """Add a 'VCL_Score' column to ``df`` in place and return the same frame.

    The score is the sum of the thirteen items VCL1-5, 7, 8, 10, 11 and 13-16.
    For every row where at least two of VCL6, VCL9 and VCL12 equal 1, the score
    is overwritten with 13, which is the number of items in the sum.
    """
    counted_items = ['VCL{}'.format(k)
                     for k in (1, 2, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15, 16)]
    fake_items = ['VCL6', 'VCL9', 'VCL12']

    # Plain left-to-right addition (not DataFrame.sum) so that a NaN in any item
    # still propagates into the score.
    total = df[counted_items[0]]
    for item in counted_items[1:]:
        total = total + df[item]
    df['VCL_Score'] = total

    # "Any pair of fake words claimed" is the same as "at least two claimed".
    fake_claims = (df[fake_items] == 1).sum(axis=1)
    df.loc[fake_claims >= 2, 'VCL_Score'] = 13
    return df

# Add the VCL_Score feature to both splits.
train, test = score_vocab(train), score_vocab(test)

In [11]:
# Derive five personality-trait scores from the ten TIPI items.
# The even-numbered items are reverse-coded as (6 - value) and overwritten in place,
# so running this cell a second time flips them back to the raw values.
# NOTE(review): the constant 6 presumes a 1-5 answer scale - confirm against the data dictionary.
reversed_items = ['TIPI2', 'TIPI4', 'TIPI6', 'TIPI8', 'TIPI10']

for frame in (train, test):
    frame[reversed_items] = 6 - frame[reversed_items]

    # TIPI_result<k> is the mean of item k and item k+5, taken after the reverse-coding above.
    for k in range(1, 6):
        first_item = 'TIPI{}'.format(k)
        second_item = 'TIPI{}'.format(k + 5)
        frame['TIPI_result{}'.format(k)] = (frame[first_item] + frame[second_item]) / 2

In [12]:
# Binarise the elapsed time at the train median:
# 1 when the value is strictly above the median (longer time), 0 otherwise.
# The test split is cut at the train median as well.
mid = np.median(train['introelapse'])

for frame in (train, test):
    frame['n_introelapse'] = (frame['introelapse'] > mid).astype(int)

In [13]:
# testelapse: 1 when strictly above the train median, 0 otherwise (both splits).
mid = np.median(train['testelapse'])

for frame in (train, test):
    frame['n_testelapse'] = (frame['testelapse'] > mid).astype(int)

In [14]:
# surveyelapse: 1 when strictly above the train median, 0 otherwise (both splits).
mid = np.median(train['surveyelapse'])

for frame in (train, test):
    frame['n_surveyelapse'] = (frame['surveyelapse'] > mid).astype(int)

In [15]:
# Drop features judged to have little relation to nerdiness. The raw elapse
# columns go as well; their binarised n_* versions stay.
unused_cols = ['hand', 'introelapse', 'testelapse', 'surveyelapse']

train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [16]:
# Q_score: row-wise total of the answers to Q1..Q26.
Q_columns = ['Q' + str(n) for n in range(1, 27)]

for frame in (train, test):
    frame['Q_score'] = frame[Q_columns].sum(axis=1)

## Model

In [17]:
# Features are every column from 'Q1' through the last one; columns to the left of 'Q1' are left out.
feature_slice = slice('Q1', None)
X_train = train.loc[:, feature_slice]
X_test = test.loc[:, feature_slice]
y_train = target

X_train.shape

(15000, 73)

In [18]:
# Random forest with 3000 trees and a fixed seed; fit() returns the fitted estimator.
rf_clf = RandomForestClassifier(random_state=333, n_estimators=3000).fit(X_train, y_train)

# Keep column 1 of predict_proba, i.e. the probability of the second entry in rf_clf.classes_
# (presumably the positive "nerdy" label - verify against the label encoding).
rf_proba = rf_clf.predict_proba(X_test)
rf_pred = rf_proba[:, 1]

In [19]:
# Extra-trees ensemble with 3000 trees and a fixed seed; fit() returns the fitted estimator.
et_clf = ExtraTreesClassifier(random_state=3, n_estimators=3000).fit(X_train, y_train)

# Keep column 1 of predict_proba, i.e. the probability of the second entry in et_clf.classes_
# (presumably the positive "nerdy" label - verify against the label encoding).
et_proba = et_clf.predict_proba(X_test)
et_pred = et_proba[:, 1]

In [20]:
# Weighted blend of the two models' probabilities: 70% extra-trees, 30% random forest.
et_weight, rf_weight = 0.7, 0.3
y_preds = et_weight * et_pred + rf_weight * rf_pred

출력물

In [21]:
# Put the blended probabilities into the submission template and write it out
# without the index column.
output_path = './ensemble_nl.csv'
submission['nerdiness'] = y_preds
submission.to_csv(output_path, index=False)