In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to smaller dtypes to save memory.

    * ``int16``/``int32``/``int64`` columns are cast to the narrowest of
      ``int8``/``int16``/``int32`` whose range contains the column's minimum
      and maximum (limits inclusive). Columns that need the full ``int64``
      range are left unchanged.
    * ``float64`` columns are cast to ``float32`` unless a value lies outside
      the finite ``float32`` range (this includes +/-inf).
    * Every other column (``int8``, ``float16``, ``float32``, unsigned,
      boolean, object, ...) is left untouched, so a column is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place: the affected columns are
        reassigned on ``df`` itself.
    verbose : bool, default True
        If true, print the final size in Mb and the percentage reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    ``float64`` -> ``float32`` keeps the value range but not the precision.
    """
    downcastable_ints = ('int16', 'int32', 'int64')
    int_candidates = (np.int8, np.int16, np.int32)  # narrowest first

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = str(df[col].dtype)

        if col_type in downcastable_ints:
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # Written as a negated "out of range" test so that an all-NaN
            # (or empty) column, whose min/max are NaN, is still downcast.
            if not (c_min < limits.min or c_max > limits.max):
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Read the three competition files in one pass: training rows, test rows and
# the sample submission that is filled in at the end of the notebook.
train, test, submission = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-dec-2021/train.csv",
        "../input/tabular-playground-series-dec-2021/test.csv",
        "../input/tabular-playground-series-dec-2021/sample_submission.csv",
    ),
)

In [4]:
# External cover-type data, aligned to the competition layout:
#   1. add an Id column numbered from len(train) upwards,
#   2. keep only the columns `train` has, in `train`'s order,
#   3. index the result by Id.
covtype = (
    pd.read_csv("../input/forest-cover-type-dataset/covtype.csv")
    .assign(Id=lambda frame: range(len(train), len(train) + len(frame)))
    .loc[:, train.columns]
    .set_index("Id")
)

In [5]:
# Target as a plain numpy array; features indexed by Id and downcast to
# smaller dtypes. The test frame gets the same treatment.
y = train["Cover_Type"].values
X = reduce_mem_usage(train.drop(columns="Cover_Type").set_index("Id"))
Xt = reduce_mem_usage(test.set_index("Id"))

Mem. usage decreased to 270.84 Mb (83.9% reduction)
Mem. usage decreased to 67.71 Mb (83.9% reduction)


In [6]:
# Extra training rows: every covtype row whose Cover_Type is 4 or 5.
# The row mask is computed once and shared by the features and the labels.
is_aug_class = covtype["Cover_Type"].isin([4, 5])
aug_X = reduce_mem_usage(covtype.loc[is_aug_class, X.columns])
aug_y = covtype.loc[is_aug_class, 'Cover_Type']

Mem. usage decreased to  0.83 Mb (83.9% reduction)


In [7]:
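# Disabled alternative: take the augmentation rows from the competition
# training data (the rows of X labelled 5) instead of the external covtype
# data. Uncommenting these lines overwrites aug_X / aug_y defined above.
#aug_X = X[y==5]
#aug_y = y[y==5]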

In [8]:
# The raw frames are no longer needed once X, y, Xt and the augmentation
# rows exist; drop the names and run a garbage-collection pass.
del train, test, covtype
gc.collect()

72

In [9]:
# Stratified K-fold training of a LightGBM multiclass model.
#
# For every fold:
#   * the augmentation rows (aug_X / aug_y) are added to the training part
#     only -- the validation part contains competition rows exclusively;
#   * training stops early once the validation metrics have not improved
#     for 10 rounds;
#   * test probabilities are accumulated as an average over the folds and
#     validation probabilities are written into the out-of-fold matrix.
FOLDS = 5

cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

# One probability column per distinct label in y.
n_classes = len(np.unique(y))
preds = np.zeros((len(Xt), n_classes))
oof = np.zeros((len(X), n_classes))

for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way and works on older versions too.
    X_train = pd.concat([X_train, aug_X])
    y_train = np.concatenate([y_train, aug_y])

    model = lgbm.LGBMClassifier(objective="multiclass")
    # The `early_stopping_rounds` argument of fit() was removed in
    # LightGBM 4.0; the callbacks below request the same early stopping and
    # keep the per-iteration evaluation log.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_error",
        callbacks=[
            lgbm.early_stopping(stopping_rounds=10),
            lgbm.log_evaluation(period=1),
        ],
    )

    preds += model.predict_proba(Xt) / FOLDS
    oof[val_idx] = model.predict_proba(X_val)



[1]	valid_0's multi_error: 0.425028	valid_0's multi_logloss: 0.719034
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_error: 0.106591	valid_0's multi_logloss: 0.622185
[3]	valid_0's multi_error: 0.0850675	valid_0's multi_logloss: 0.540125
[4]	valid_0's multi_error: 0.0840863	valid_0's multi_logloss: 0.489538
[5]	valid_0's multi_error: 0.0722263	valid_0's multi_logloss: 0.441986
[6]	valid_0's multi_error: 0.0683813	valid_0's multi_logloss: 0.398004
[7]	valid_0's multi_error: 0.0662637	valid_0's multi_logloss: 0.366359
[8]	valid_0's multi_error: 0.0644712	valid_0's multi_logloss: 0.340981
[9]	valid_0's multi_error: 0.063345	valid_0's multi_logloss: 0.318382
[10]	valid_0's multi_error: 0.0622612	valid_0's multi_logloss: 0.300402
[11]	valid_0's multi_error: 0.0616763	valid_0's multi_logloss: 0.285162
[12]	valid_0's multi_error: 0.0610062	valid_0's multi_logloss: 0.272297
[13]	valid_0's multi_error: 0.0600263	valid_0's multi_logloss: 0.259973
[14]	valid_0's 

In [10]:
# Save the out-of-fold probabilities: columns prob_0 .. prob_6 preceded by an
# Id column that numbers the rows 0 .. len(X) - 1.
oof = pd.DataFrame(
    oof,
    columns=[f"prob_{i}" for i in range(7)],
)
oof.insert(0, 'Id', range(len(X)))
oof.to_csv("oof.csv", index=False)

In [11]:
# Convert the fold-averaged probabilities into labels and write the file.
# Bracket assignment is used on purpose: attribute-style assignment
# (`submission.Cover_Type = ...`) only sets a Python attribute -- not a
# column -- if the column is missing from the frame.
# NOTE(review): the `+ 1` assumes probability column k corresponds to label
# k + 1 (consecutive labels starting at 1) -- confirm against model.classes_.
submission["Cover_Type"] = preds.argmax(axis=1) + 1
submission.to_csv("submission.csv", index=False)