In [1]:
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings   
import re

In [3]:
! pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/b5/26/9842333adbb8f17bcb3d699400a8b1ccde0af0b6de8d07224e183728acdf/bayesian_optimization-1.1.0-py3-none-any.whl
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.1.0


In [4]:
!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/1f/cb/a8ec24334c35a7d0c87b4e4e056bd2137573c7c1bd81c760b79a2f370254/lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


In [2]:
train =pd.read_csv('train.csv')

In [48]:
train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,120,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,120,0,T,Ability,(1360) - TrainSCV
4,0,1,840,0,T,Camera,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
67091771,38871,0,531,0,Z,Camera,"at (139.578125, 62.58203125)"
67091772,38871,0,532,1,T,GetControlGroup,
67091773,38871,0,532,0,Z,Camera,"at (122.42578125, 45.4296875)"
67091774,38871,0,532,0,Z,Camera,"at (122.42578125, 43.25390625)"


In [19]:
sample=train.iloc[0:10000,:]

In [20]:
sample

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.00,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.00,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
9995,4,0,4.11,1,Z,Selection,"['Egg [4100001]', 'Egg [4180001]', 'Egg [41C00..."
9996,4,0,4.11,0,T,Camera,"at (60.98046875, 112.71875)"
9997,4,0,4.11,0,T,Camera,"at (58.5, 115.19921875)"
9998,4,0,4.12,0,T,Camera,"at (53.375, 120.32421875)"


In [3]:
#Feature Engineering
def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError

def convert(time):
    second = int(time * 100) % 100
    minutes = int(time)
    tot_sec = minutes * 60 + second

    return tot_sec

def extract_loc(row):
    if row['event'] == 'Camera':
        parsed_list = re.split('; |, | |\(|\)',row['event_contents'])
        row['Camera_X'] = float(parsed_list[2])
        row['Camera_Y'] = float(parsed_list[3])
    else:
        row['Camera_X'] = float('NaN')
        row['Camera_Y'] = float('NaN')
        return row
    return row



def convert_outlier(time):
    if time >= 20.0:
        return 50.0
    return time

def extract_camera_x(col):
    parsed_list = re.split('; |, | |\(|\)',col)
    return float(parsed_list[2])

def extract_camera_y(col):
    parsed_list = re.split('; |, | |\(|\)',col)
    return float(parsed_list[3])

def data_preparation_3(df, answer=False):
    
    #game 시간 
    
    train_game_time = df['time'].groupby(df['game_id'])
    train_game_time=train_game_time.max()
    train_game_time=np.ceil(train_game_time)
    train_game_time=pd.DataFrame(train_game_time)
    
    #ability 분리
    ability = df[df['event']=='Ability']
    ability_list = ability['event_contents'].apply(lambda x : x.split(';') )
    split_ability =pd.DataFrame(ability_list.tolist(), columns=['ability_1', 'ability_2','ability_3'],index=ability_list.index)
    ability_df = pd.concat([ability,split_ability],axis=1)
    ability_df=ability_df[['ability_1','player','game_id']]

    del ability,ability_list,split_ability
    
    ablity_unique=ability_df['ability_1'].unique()
    ability_count = ability_df.groupby(['game_id', 'player']).ability_1.value_counts()
    
    del ability_df
    
    unique_ability_0, unique_ability_1  = {}, {}
        
    for ability in ablity_unique:
        unique_ability_0['P0_' + ability] = 0
        unique_ability_1['P1_' + ability] = 0
    
    
    #df['time'] = df['time'].apply(convert)
    #print('finish time')
    #df = parallelize_dataframe(df, add_feature, n_cores=8)
    tmp_df = df[df['event']=='Camera']
    print('finish adding new df')
    tmp_df['Camera_X'] = tmp_df['event_contents'].apply(extract_camera_x)
    print('finish getting camera_x')
    tmp_df['Camera_Y'] = tmp_df['event_contents'].apply(extract_camera_y)
    print('finish getting camera_y')
    df = pd.concat([df, tmp_df['Camera_X'], tmp_df['Camera_Y']], axis=1, sort=False)
    print('finish getting x, y')
#     df['time_diff'] = df.sort_values(['game_id', 'player', 'time']).groupby(['game_id', 'player'])['time'].diff()
#     df['time_diff'] = df['time_diff'].fillna(0.0)
#     df['time_diff'] = df['time_diff'].apply(convert_outlier)
#     df['time_diff'] = df['time_diff'].astype(int).astype(str)
    

    


    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
#     times = []
#     for i in range(20):
#         times.append(str(i))
#     times.append(str(50))

    unique_event_0, unique_event_1, delta_event = {}, {}, {}


    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
    
    
#     unique_time_0, unique_time_1, delta_time = {}, {}, {}
    
    
#     for time in times:
#         unique_time_0['P0_' + time] = 0
#         unique_time_1['P1_' + time] = 0
#         delta_time['delta_' + time] = 0

    species = df.groupby(['game_id', 'player']).species.unique()
    print('finish species')
    var_x = df.groupby(['game_id', 'player'])['Camera_X'].var()
    print('finish varx')
    var_y = df.groupby(['game_id', 'player'])['Camera_Y'].var()
    print('finish vary')

    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    #time_count = df.groupby(['game_id', 'player']).time_diff.value_counts()

    if answer:
        winners = df.groupby(['game_id']).winner.max()

    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        
    
        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])        
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        
        
        #our new feature!!!
        
#         df_time_count = time_count[game_id].unstack(level=-1)
#         df_time_count.fillna(0)
        if hasattr(ability_count[game_id], 'unstack'):
            df_ability_count = ability_count[game_id].unstack(level=-1)
            df_ability_count = df_ability_count.fillna(0)
            
        #camera variance
        df_P0_camera_x_var = pd.DataFrame([var_x[game_id].loc[0]], columns=['P0_x_var'])
        df_P0_camera_y_var = pd.DataFrame([var_y[game_id].loc[0]], columns=['P0_y_var'])

        df_P1_camera_x_var = pd.DataFrame([var_x[game_id].loc[1]], columns=['P1_x_var'])
        df_P1_camera_y_var = pd.DataFrame([var_y[game_id].loc[1]], columns=['P1_y_var'])

        df_delta_x_var = pd.DataFrame([var_x[game_id].loc[0] - var_x[game_id].loc[1]], columns=['delta_x_var'])
        df_delta_y_var = pd.DataFrame([var_y[game_id].loc[0] - var_y[game_id].loc[1]], columns=['delta_y_var'])

        df = df.drop(['species'], axis=1)
        
        #time diff 
        
        
#         df_PO_time = unique_time_0.copy()
#         for column in df_time_count.columns:
#             df_PO_time['P0_' + column] = df_time_count.loc[0][column]
#         df_PO_time = pd.DataFrame(pd.Series(df_PO_time)).T
#         df_PO_time.fillna(0)
        
        
#         df_P1_time = unique_time_1.copy()
#         for column in df_time_count.columns:
#             df_P1_time['P1_' + column] = df_time_count.loc[1][column]
#         df_P1_time = pd.DataFrame(pd.Series(df_P1_time)).T
#         df_P1_time.fillna(0)
        
#         df_delta_time = delta_time.copy()
#         for column in df_time_count.columns:
#             df_delta_time['delta_' + column] = df_PO_time['P0_' + column][0] - df_P1_time['P1_' + column][0]
#         df_delta_time = pd.DataFrame(pd.Series(df_delta_time)).T
#         df_delta_time.fillna(0)
        
        #ability 에 등장한 건물 ,유닛 액션 전부 원핫 인코딩 

        ## ability가 하나도 없는 게임이 있을수 있다.
        try :
            df_PO_ability = unique_ability_0.copy()
            for column in df_ability_count.columns:
                df_PO_ability['P0_' + column] = df_ability_count.loc[0][column]
            df_PO_ability = pd.DataFrame(pd.Series(df_PO_ability)).T
            df_PO_ability.fillna(0)
        except:
            df_PO_ability = unique_ability_0.copy()
            df_PO_ability = pd.DataFrame(pd.Series(df_PO_ability)).T
            df_PO_ability.fillna(0)
        try:
            df_P1_ability = unique_ability_1.copy()
            for column in df_ability_count.columns:
                df_P1_ability['P1_' + column] = df_ability_count.loc[1][column]
            df_P1_ability = pd.DataFrame(pd.Series(df_P1_ability)).T        
            df_P1_ability.fillna(0)
        except:
            df_P1_ability = unique_ability_1.copy()
            df_P1_ability = pd.DataFrame(pd.Series(df_P1_ability)).T        
            df_P1_ability.fillna(0)
        
        #경기 시간

        df_game_time = pd.DataFrame(pd.Series(train_game_time.loc[game_id].values)).T
        df_game_time.columns=['game_time']
        
        #event per minute

        df_P0_event = unique_event_0.copy()
        temp = 0
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]/train_game_time.loc[game_id].values[0]
            temp= temp + df.loc[0][column]/train_game_time.loc[game_id].values[0]
        df_P0_event['P0_APM']=temp
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        temp = 0
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]/train_game_time.loc[game_id].values[0]
            temp= temp + df.loc[0][column]/train_game_time.loc[game_id].values[0]
        df_P0_event['P1_APM']=temp
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
        
        #event delta
        
        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T
        
        
        
        #result   여기서 전부 통~~~합

        out = pd.concat([df_P0_species, df_PO_ability,df_P0_event, df_P1_species, df_P1_ability,df_P1_event,  df_delta_event,df_game_time ,df_P0_camera_x_var, df_P0_camera_y_var, df_P1_camera_x_var, df_P1_camera_y_var, df_delta_x_var, df_delta_y_var], axis=1)

        out.index = [game_id]
        out.index.name = 'game_id'

        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)

    return x_data, y_data

In [4]:
x_train, y_train = data_preparation_3(train, answer=True)

finish adding new df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


finish getting camera_x


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


finish getting camera_y
finish getting x, y
finish species
finish varx
finish vary


100%|████████████████████████████████████████████████████████████████████████████| 38872/38872 [13:57<00:00, 46.41it/s]


In [5]:
x_train['delta_APM']=x_train['P0_APM']-+x_train['P1_APM']

In [6]:
x_train.to_csv('train_covert.csv')

In [7]:
del train

In [8]:
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [9]:
# 모델과 관련없는 변수 고정
func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits=5, output='score' ) 
# 베이지안 최적화 범위 설정
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 128),        # num_leaves,       범위(16~1024) <--- 지수가 바꾸긴 전꺼임 지수는 128
        'bagging_fraction': (0.1, 1.0),
        'feature_fraction': (0.1, 1.0),
        'learning_rate': (0.001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 1),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=50, n_iter=100) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

|   iter    |  target   | baggin... | colsam... | featur... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------


TypeError: lgb_cv() got an unexpected keyword argument 'bagging_fraction'

In [None]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'], 
    params['learning_rate'], 
    params['n_estimators'], 
    params['subsample'], 
    params['colsample_bytree'], 
    params['reg_alpha'], 
    params['reg_lambda'], 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

In [None]:
test = pd.read_csv('test.csv')
x_test, _ = data_preparation_3(test, answer=False)

In [None]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

submission = pd.read_csv('sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('one_hot_encoding_ability_and_divide_time.csv')
submission.head()

In [73]:
train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,432000,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,432000,0,T,Ability,(1360) - TrainSCV
4,0,1,3024000,0,T,Camera,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
67091771,38871,0,114696000,0,Z,Camera,"at (139.578125, 62.58203125)"
67091772,38871,0,114912000,1,T,GetControlGroup,
67091773,38871,0,114912000,0,Z,Camera,"at (122.42578125, 45.4296875)"
67091774,38871,0,114912000,0,Z,Camera,"at (122.42578125, 43.25390625)"


time    599.0
Name: 1, dtype: float64

Unnamed: 0,time
0,343.0


Unnamed: 0,time
0,343.0


NameError: name 'ttt' is not defined

Unnamed: 0,time
0,26640.0
