In [3]:
import numpy as np
import pandas as pd
import random
import os

def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports the seed as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``;
            its ``str()`` form is stored in the environment.
    """
    random.seed(seed)
    # NOTE(review): hash randomization is normally fixed at interpreter
    # start-up, so this presumably only influences child processes — confirm.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(42)  # fix the seed


In [4]:
# Read the raw training and test tables from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [5]:
# 'Income' is the regression target; 'ID' is dropped from both feature frames.
target_col, id_col = 'Income', 'ID'

train_X = train.drop(columns=[target_col, id_col])
train_y = train[target_col]

test_X = test.drop(columns=[id_col])

In [9]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Object-dtype columns of the raw training frame are label-encoded.
# 'ID' is not part of train_X/test_X, so it is filtered out here.
encoding_list = [
    col for col in train.dtypes[train.dtypes == 'object'].index.tolist()
    if col != 'ID'
]

for col in encoding_list:
    le = LabelEncoder()

    # Always encode from the untouched raw frames. Re-encoding train_X/test_X
    # from their own (already integer-coded) contents would, on a second run of
    # this cell, stringify the codes ('0', '1', '10', '2', ...) and permute
    # them, so results would depend on how many times the cell was executed.
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)

    le.fit(train_values)
    train_X[col] = le.transform(train_values)

    # Categories that occur only in the test set are appended after the
    # classes learned from train, so transform() can map them to new codes
    # instead of raising on unseen labels.
    known_classes = set(le.classes_)
    unseen = [case for case in np.unique(test_values) if case not in known_classes]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)

    test_X[col] = le.transform(test_values)


In [14]:
# Display the test feature frame after label encoding.
test_X

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,Household_Status,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,79,1,17,0,0,6,6,4,0,5,44,4,2,39,40,40,5,0,0,0,1
1,47,1,15,0,0,6,6,4,9,5,31,0,2,39,39,39,4,0,0,0,1
2,18,0,17,0,52,11,3,4,0,5,31,0,2,39,39,39,5,0,0,0,1
3,39,0,1,2,30,4,3,4,0,1,55,7,2,39,39,39,2,0,0,0,2
4,6,1,11,0,0,6,6,4,7,5,35,2,2,39,39,39,4,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,31,1,10,0,52,7,4,4,0,5,44,4,2,39,39,39,5,0,0,0,1
9996,27,1,12,0,52,11,10,4,0,1,42,4,2,39,39,39,2,0,0,0,1
9997,18,1,4,0,7,11,3,1,0,5,31,0,2,39,39,39,4,0,0,0,1
9998,9,1,11,0,0,6,6,4,0,5,35,2,2,39,39,39,4,0,0,0,1


In [21]:
from sklearn.metrics import mean_squared_error

# Hyperparameters for the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.01,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)

# Fit on the full training set, then predict Income for the test features.
xgb = XGBRegressor(**xgb_params)
xgb.fit(train_X, train_y)
pred_y = xgb.predict(test_X)


In [22]:
# Load the submission template and set its 'Income' column to the predictions.
submission = pd.read_csv('../data/sample_submission.csv').assign(Income=pred_y)
submission

Unnamed: 0,ID,Income
0,TEST_0000,79.032654
1,TEST_0001,262.018463
2,TEST_0002,489.889496
3,TEST_0003,664.498718
4,TEST_0004,74.370285
...,...,...
9995,TEST_9995,768.988098
9996,TEST_9996,747.635559
9997,TEST_9997,691.246216
9998,TEST_9998,74.364258


In [23]:
# Make sure the output directory exists before writing, so a fresh checkout
# (Restart Kernel -> Run All) does not fail on a missing '../output' folder.
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/xgb_submission.csv', index=False)