In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.calibration import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report

# 2MODELS
Predictions are made in two stages:
1. Classify each sample as either cover types 1-2 or cover types 3-4-5-6-7.
2. Within each group, predict the exact `Cover_Type` with a dedicated model.


In [21]:
from utils import *
from sklearn.ensemble import AdaBoostClassifier

def split_in_two_df(df: pd.DataFrame):
    """Split a frame into the cover-type 1-2 rows and the cover-type 3-7 rows.

    If ``df`` has no ``under2`` column, one is added to ``df`` in place:
    1 where ``Cover_Type`` < 3, 0 otherwise. When the column already exists
    (e.g. first-stage predictions on test data) it is used as is.

    Rows are selected with a boolean mask, so column dtypes are preserved and
    rows that contain NaN in unrelated columns are kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding either ``Cover_Type`` or an existing ``under2`` column.

    Returns
    -------
    tuple
        ``(df, df_under, df_above)`` where ``df_under`` holds the rows with
        ``under2 == 1`` and ``df_above`` the rows with ``under2 == 0``.
        Both subsets are independent copies.
    """
    if "under2" not in df.columns:
        # Split between cover type > 2 and <= 2
        df["under2"] = (df["Cover_Type"] < 3).astype(int)

    # Classes 1-2
    df_under = df[df["under2"] == 1].copy()

    # Classes 3-4-5-6-7
    df_above = df[df["under2"] == 0].copy()

    return df, df_under, df_above


def get_X_y(df: pd.DataFrame, target_col=None):
    """Separate features from a target column without modifying ``df``.

    When ``target_col`` is ``"under2"`` the ``Cover_Type`` column is removed
    from the features, and vice versa, so the other label never leaks into
    the features. A missing sibling column is tolerated.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; it is copied, never mutated.
    target_col : str, optional
        Name of the target column.

    Returns
    -------
    (X, y) if ``target_col`` is a column of ``df``; otherwise ``X`` alone
    (this is how test data without labels is handled).
    """
    X = df.copy()
    if target_col == "under2":
        X = X.drop(columns=["Cover_Type"], errors="ignore")
    elif target_col == "Cover_Type":
        X = X.drop(columns=["under2"], errors="ignore")

    if target_col in X.columns:
        y = X[target_col]
        return X.drop(columns=[target_col]), y

    # No target available: return the features only.
    return X


def train_first_model(X, y, model, test_size=0.25, seed=42, startify=True):
    """Fit the first-stage classifier on all of ``(X, y)`` and return it.

    Parameters
    ----------
    X, y :
        Features and target passed straight to ``model.fit``.
    model :
        Any estimator exposing ``fit(X, y)``.
    test_size, seed, startify :
        Kept only for backward compatibility. The model has always been
        trained on the full data; the hold-out split that these arguments
        used to configure was computed but never used, so it was removed.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    model.fit(X, y)
    return model


def train_second_model(X, y, model, test_size=0.25, seed=42, startify=True):
    """Fit a second-stage classifier on all of ``(X, y)`` with 0-based labels.

    The target is label-encoded before fitting, so the model predicts the
    encoded indices (0..k-1 in sorted label order). The encoder is not
    returned: callers must map predictions back to the original labels
    themselves.

    Parameters
    ----------
    X, y :
        Features and original class labels.
    model :
        Any estimator exposing ``fit(X, y)``.
    test_size, seed, startify :
        Kept only for backward compatibility. The hold-out split they used to
        configure was computed but never used, so it was removed.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    encoder = LabelEncoder()
    model.fit(X, encoder.fit_transform(y))
    return model



# Load training data
df_train = get_data_train()

# Split the data
# split_in_two_df adds the binary `under2` label and returns the 1-2 and 3-7 subsets.
df_train, df_train_under, df_train_above = split_in_two_df(df_train)
X_train_under, y_train_under = get_X_y(
    df_train_under, target_col='Cover_Type')
X_train_above, y_train_above = get_X_y(
    df_train_above, target_col='Cover_Type')
X_train, y_train = get_X_y(df_train, target_col='under2')

# Train first model (stage 1: classes 1-2 vs classes 3-7)
model = AdaBoostClassifier(n_estimators=300, algorithm='SAMME')#xgb.XGBClassifier(n_estimators=200, max_depth=15)
model1 = train_first_model(X_train, y_train, model)

# Train second model (stage 2: one model per group, trained on label-encoded targets)
model2u = AdaBoostClassifier(n_estimators=300, algorithm='SAMME')#xgb.XGBClassifier(n_estimators=200, max_depth=15)
model2u = train_second_model(
    X_train_under, y_train_under, model2u)

model2a = xgb.XGBClassifier(n_estimators=200, max_depth=15, objective="multi:softmax")
model2a = train_second_model(
    X_train_above, y_train_above, model2a)

# Load Testing data
df_test = get_data_test()

# No target column is requested, so get_X_y returns a copy of the features only.
X_test = get_X_y(df_test)
y1_pred = model1.predict(X_test)

# Store the stage-1 prediction so split_in_two_df can route each row to its stage-2 model.
X_test["under2"] = y1_pred

df_test1, df_test_under, df_test_above = split_in_two_df(X_test)
X_test_under = get_X_y(df_test_under)
X_test_under.drop(["under2"], axis=1, inplace=True)
X_test_above = get_X_y(df_test_above)
X_test_above.drop(["under2"], axis=1, inplace=True)

y2u_pred = model2u.predict(X_test_under)
y2a_pred = model2a.predict(X_test_above)

# Convert prediction into Dataframe
# The +1 / +3 offsets undo the 0-based label encoding done in train_second_model
# (assumes every class of each group is present in the training data -- TODO confirm).
pred_under = pd.DataFrame({'Id': X_test_under.Id, 'Cover_Type': y2u_pred+1})
pred_above = pd.DataFrame({'Id': X_test_above.Id, 'Cover_Type': y2a_pred+3})

final_pred = pd.concat([pred_under, pred_above])
final_pred_clean = clean_predictor(y_pred=final_pred.Cover_Type, Id=final_pred.Id)
csv_for_submission(final_pred_clean,"2model+clean")

## Choosing the best model for the first split

In [4]:
from sklearn.model_selection import cross_val_score

# Load training data
df_train = get_data_train()

# Split the data
df_train, df_train_under, df_train_above = split_in_two_df(df_train)

# Binary target: 1 for cover types 1-2, 0 for cover types 3-7.
X_train, y_train = get_X_y(df_train, target_col='under2')

# Train first model
model = xgb.XGBClassifier(n_estimators=200, max_depth=15)

# 5-fold cross-validated balanced accuracy of the stage-1 classifier.
# NOTE(review): the recorded fold scores differ widely from fold to fold -- worth investigating.
cross_val_score(model, X_train, y_train, cv =5, scoring="balanced_accuracy")

array([0.92662037, 0.97048611, 0.54305556, 0.90300926, 0.43726852])

In [18]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Candidate stage-1 classifiers compared by the cross-validation cells below.
# Earlier, wider comparison kept for reference:
# models = [xgb.XGBClassifier(n_estimators=500), xgb.XGBRFClassifier(n_estimators=500), RandomForestClassifier(n_estimators=500), AdaBoostClassifier(n_estimators=500, algorithm='SAMME')]
models=[AdaBoostClassifier(n_estimators=1000, algorithm='SAMME')]

In [19]:
# Report mean and standard deviation of the 5-fold balanced accuracy for each candidate.
for m in models:
    scores = cross_val_score(m, X_train, y_train, cv=5, scoring="balanced_accuracy")
    print(m)
    print('Mean', np.mean(scores))
    print('std', np.std(scores))
    print('\n')

AdaBoostClassifier(algorithm='SAMME', n_estimators=1000)
Mean 0.7262268518518519
std 0.21902431849433232




In [13]:
# Same comparison as the balanced-accuracy cell, scored with plain accuracy.
for m in models:
    scores = cross_val_score(m, X_train, y_train, cv=5, scoring="accuracy")
    print(m)
    print('Mean', np.mean(scores))
    print('std', np.std(scores))
    print('\n')

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Mean 0.6853174603174603
std 0.3106491830859837


XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=None, device=None,
                early_stopping_rounds=None, enable_cate



AdaBoostClassifier(n_estimators=500)
Mean 0.6956349206349206
std 0.2963235029789866




# 2MODELS with LGBM

In [128]:
# Per-class sample counts used to reason about class proportions
# (presumably an estimate of the test-set distribution -- TODO confirm the source).
class_counts = {1: 180425, 2: 240095, 3: 39490, 4: 4099, 5: 39937, 6: 25676, 7: 51290}
n_total = 581012
minority_classes = (3, 4, 5, 6, 7)

# Share of every class in the whole set.
for label, count in class_counts.items():
    print(f"{label}:", count / n_total)

# Number of samples that are neither class 1 nor class 2.
a = n_total - class_counts[1] - class_counts[2]
print("#")
# Share of classes 3-7 inside the "above" group.
for label in minority_classes:
    print(f"{label}:", class_counts[label] / a)
print("#")
# Global target shares rescaled to the "above" group.
target_share = {3: 0.04, 4: 0.01, 5: 0.04, 6: 0.04, 7: 0.04}
for label, share in target_share.items():
    print(f"{label}:", share * n_total / a)
print("#")
# Size of the two stage-1 groups and their ratio.
n_majority = class_counts[1] + class_counts[2]
n_minority = sum(class_counts[label] for label in minority_classes)
print('12', n_majority)
print('3456', n_minority)
print('ratio', n_majority / n_minority)
print("#")
# Class 1 vs class 2.
print("1:", class_counts[1])
print("2:", class_counts[2] / n_total)
print('ratio', class_counts[2] / class_counts[1])

1: 0.3105357548553214
2: 0.4132358712040371
3: 0.06796761512670994
4: 0.0070549317397919495
5: 0.06873696240353039
6: 0.04419185834371751
7: 0.0882770063268917
#
3: 0.24605587817461305
4: 0.025540213842434514
5: 0.24884106372903322
6: 0.15998305211474714
7: 0.31957979213917204
#
3: 0.14480771627246217
4: 0.03620192906811554
5: 0.14480771627246217
6: 0.14480771627246217
7: 0.14480771627246217
#
12 420520
3456 160492
ratio 2.6201929068115546
#
1: 180425
2: 0.4132358712040371
ratio 1.3307191353748096


In [67]:
from lightgbm import LGBMClassifier

# Candidate class-weight tables (keyed by class label).
class_weights_matt = dict(zip(range(1, 8), (0.4, 0.45, 0.04, 0.01, 0.04, 0.04, 0.04)))

# Stage 1 (binary `under2` target).
c_weights_bin = {1: 0.85, 0: 0.17}

# Stage 2, classes 1 vs 2.
c_weights_12 = {1: 0.5, 2: 0.5}

# Stage 2, classes 3-7: class 4 gets a lower weight than the others.
c_weights_34567 = {label: (0.03 if label == 4 else 0.15) for label in range(3, 8)}

coeffs = np.array([2.63, 3.06, 0.43, 0.05, 0.24, 0.27, 0.32])

# Settings shared by the three LightGBM models (number of iterations left at the library default).
_lgbm_shared = dict(boosting_type='gbdt', num_leaves=31, verbose=-1, n_jobs=-1)

# Stage 1: classes 1-2 vs classes 3-7.
model1 = LGBMClassifier(objective='binary', **_lgbm_shared)
# Stage 2 (under): class 1 vs class 2.
model2u = LGBMClassifier(objective='binary', **_lgbm_shared)
# Stage 2 (above): classes 3-7; c_weights_34567 is deliberately not applied here.
model2a = LGBMClassifier(objective='multiclass', num_class=5, **_lgbm_shared)


In [14]:
# Reload the labelled data and keep a random 25% as a local hold-out set (unseeded split).
df_train = get_data_train()
df_train, df_test = train_test_split(df_train, test_size=0.25)

In [69]:
# Preview the preprocessed training frame; the result is displayed, not assigned.
df_train = get_data_train()
preprocess(df_train)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Cover_Type,Soil_Type_Synth,Wilderness_Area_Synth
0,242642,2881,130,22,210,54,1020,250,221,88,342,1,30,1
1,309891,3005,351,14,242,-16,1371,194,215,159,842,1,24,3
2,287847,3226,63,14,618,2,1092,232,210,107,2018,1,29,1
3,516307,3298,317,8,661,60,752,198,233,174,1248,1,23,2
4,124860,3080,35,6,175,26,3705,219,227,144,2673,1,24,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,475155,3328,321,13,323,12,5109,186,227,180,3151,7,38,3
15116,514378,3455,37,5,841,92,939,220,229,146,362,7,40,2
15117,368425,3279,90,14,404,113,1513,240,218,105,1503,7,29,1
15118,537844,3589,357,9,418,52,1868,205,223,155,1657,7,40,2


In [68]:
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from utils import *

def split_in_two_df(df: pd.DataFrame):
    """Split a frame into the cover-type 1-2 rows and the cover-type 3-7 rows.

    If ``df`` has no ``under2`` column, one is added to ``df`` in place:
    1 where ``Cover_Type`` < 3, 0 otherwise. When the column already exists
    (e.g. first-stage predictions on test data) it is used as is.

    Rows are selected with a boolean mask, so column dtypes are preserved and
    rows that contain NaN in unrelated columns are kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding either ``Cover_Type`` or an existing ``under2`` column.

    Returns
    -------
    tuple
        ``(df, df_under, df_above)`` where ``df_under`` holds the rows with
        ``under2 == 1`` and ``df_above`` the rows with ``under2 == 0``.
        Both subsets are independent copies.
    """
    if "under2" not in df.columns:
        # Split between cover type > 2 and <= 2
        df["under2"] = (df["Cover_Type"] < 3).astype(int)

    # Classes 1-2
    df_under = df[df["under2"] == 1].copy()

    # Classes 3-4-5-6-7
    df_above = df[df["under2"] == 0].copy()

    return df, df_under, df_above


def get_X_y(df: pd.DataFrame, target_col=None):
    """Separate features from a target column without modifying ``df``.

    When ``target_col`` is ``"under2"`` the ``Cover_Type`` column is removed
    from the features, and vice versa, so the other label never leaks into
    the features. A missing sibling column is tolerated.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; it is copied, never mutated.
    target_col : str, optional
        Name of the target column.

    Returns
    -------
    (X, y) if ``target_col`` is a column of ``df``; otherwise ``X`` alone
    (this is how test data without labels is handled).
    """
    X = df.copy()
    if target_col == "under2":
        X = X.drop(columns=["Cover_Type"], errors="ignore")
    elif target_col == "Cover_Type":
        X = X.drop(columns=["under2"], errors="ignore")

    if target_col in X.columns:
        y = X[target_col]
        return X.drop(columns=[target_col]), y

    # No target available: return the features only.
    return X


def train_first_model(X, y, model, test_size=0.25, seed=42, startify=True):
    """Fit the first-stage classifier on all of ``(X, y)`` and return it.

    Parameters
    ----------
    X, y :
        Features and target passed straight to ``model.fit``.
    model :
        Any estimator exposing ``fit(X, y)``.
    test_size, seed, startify :
        Kept only for backward compatibility. The model has always been
        trained on the full data; the hold-out split that these arguments
        used to configure was computed but never used, so it was removed.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    model.fit(X, y)
    return model


def train_second_model(X, y, model, test_size=0.25, seed=42, startify=True):
    """Fit a second-stage classifier on the training part of a random split.

    ``(X, y)`` is split with ``train_test_split``; only the training part is
    used for fitting and the held-out part is discarded. Labels are passed to
    the model unchanged (no label encoding).

    Parameters
    ----------
    X, y :
        Features and class labels.
    model :
        Any estimator exposing ``fit(X, y)``.
    test_size : float
        Fraction of the data left out of the fit.
    seed : int
        ``random_state`` of the split.
    startify : bool
        When true, the split is stratified on ``y``.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    stratify_on = y if startify else None
    fit_X, _, fit_y, _ = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=stratify_on)

    model.fit(fit_X, fit_y)
    return model

def get_stack_clf():
    """Build an unfitted stacking classifier.

    Base learners are a random forest, an XGBoost classifier and an XGBoost
    random forest (200 estimators each, seeded with 42); their outputs are
    combined by a logistic regression with ``max_iter=150``.
    """
    SEED = 42
    shared = dict(n_estimators=200, random_state=SEED)
    base_learners = [
        ('rf', RandomForestClassifier(**shared)),
        ('xgb', xgb.XGBClassifier(**shared)),
        ('xgbrf', xgb.XGBRFClassifier(**shared)),
    ]
    meta_learner = LogisticRegression(max_iter=150)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

def get_stack_clf2():
    """Build an unfitted stacking classifier for the multi-class stage.

    Same layout as ``get_stack_clf`` (random forest, XGBoost, XGBoost random
    forest, combined by a logistic regression with ``max_iter=150``), except
    that both XGBoost learners use ``objective="multi:softmax"``.
    """
    SEED = 42
    shared = dict(n_estimators=200, random_state=SEED)
    xgb_shared = dict(shared, objective="multi:softmax")
    base_learners = [
        ('rf', RandomForestClassifier(**shared)),
        ('xgb', xgb.XGBClassifier(**xgb_shared)),
        ('xgbrf', xgb.XGBRFClassifier(**xgb_shared)),
    ]
    meta_learner = LogisticRegression(max_iter=150)
    return StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

def concat_Wilderness_Area(df):
    """Collapse the one-hot ``Wilderness_Area1..4`` columns into one integer column.

    Returns a new frame in which the four indicator columns are replaced by
    ``Wilderness_Area_Synth`` (appended last). The input frame is left
    untouched, so calling this without using the return value has no effect.
    """
    wilderness_areas = [f"Wilderness_Area{i}" for i in range(1, 5)]
    # Dot product of each indicator row with [1, 2, 3, 4]: the index of the
    # active area when exactly one indicator is set, 0 when none is.
    synth = df[wilderness_areas] @ range(1, 5)
    return df.drop(columns=wilderness_areas).assign(Wilderness_Area_Synth=synth)

def concat_Soil_Type(df):
    """Collapse the one-hot ``Soil_Type1..40`` columns into one integer column.

    Returns a new frame in which the forty indicator columns are replaced by
    ``Soil_Type_Synth`` (appended last). The input frame is left untouched,
    so calling this without using the return value has no effect.
    """
    soil_types = [f"Soil_Type{i}" for i in range(1, 41)]
    # Dot product of each indicator row with [1..40]: the index of the active
    # soil type when exactly one indicator is set, 0 when none is.
    synth = df[soil_types] @ range(1, 41)
    return df.drop(columns=soil_types).assign(Soil_Type_Synth=synth)

def preprocess(df):
    """Replace the one-hot soil-type and wilderness-area columns by their synthetic integer columns.

    Soil types are collapsed first, then wilderness areas; the resulting
    frame is returned.
    """
    return concat_Wilderness_Area(concat_Soil_Type(df))


# Load training data
df_train = get_data_train()
df_train = preprocess(df_train)

# Split the data
df_train, df_train_under, df_train_above = split_in_two_df(df_train)
X_train_under, y_train_under = get_X_y(
    df_train_under, target_col='Cover_Type')
X_train_above, y_train_above = get_X_y(
    df_train_above, target_col='Cover_Type')
X_train, y_train = get_X_y(df_train, target_col='under2')

# The synthetic columns are category codes, not quantities.
categorical_cols = ['Wilderness_Area_Synth', 'Soil_Type_Synth']

# Train first model (stage 1: classes 1-2 vs classes 3-7) on the binary target.
# model1 / model2u / model2a are the LightGBM models created in the setup cell.
model1.fit(X_train, y_train, categorical_feature=categorical_cols)

# Train second models. Each one must be fitted on its own subset with the
# Cover_Type target; fitting them on (X_train, y_train) would just train two
# more copies of the binary stage-1 classifier.
model2u.fit(X_train_under, y_train_under, categorical_feature=categorical_cols)
model2a.fit(X_train_above, y_train_above, categorical_feature=categorical_cols)

# Load Testing data
df_test = get_data_test()
df_test = preprocess(df_test)
print(df_test.shape)

# No target column is requested, so get_X_y returns a copy of the features only.
X_test = get_X_y(df_test)
y1_pred = model1.predict(X_test)

# Store the stage-1 prediction so split_in_two_df can route each row to its stage-2 model.
X_test["under2"] = y1_pred

df_test1, df_test_under, df_test_above = split_in_two_df(X_test)
X_test_under = get_X_y(df_test_under)
X_test_under.drop(["under2"], axis=1, inplace=True)
X_test_above = get_X_y(df_test_above)
X_test_above.drop(["under2"], axis=1, inplace=True)

y2u_pred = model2u.predict(X_test_under)
y2a_pred = model2a.predict(X_test_above)

# Convert prediction into Dataframe. The stage-2 models are trained on the
# original Cover_Type labels, so no offset is applied to their predictions.
pred_under = pd.DataFrame({'Id': X_test_under.Id, 'Cover_Type': y2u_pred})
pred_above = pd.DataFrame({'Id': X_test_above.Id, 'Cover_Type': y2a_pred})

final_pred = pd.concat([pred_under, pred_above])
final_pred_clean = clean_predictor(y_pred=final_pred.Cover_Type, Id=final_pred.Id)
csv_for_submission(final_pred_clean,"2model+lgbm")

(581012, 13)


ValueError: y contains previously unseen labels: [   593   3024 134298]

In [57]:
# Repeat the whole two-stage pipeline on 5 random 75/25 splits of the labelled
# data and report accuracy plus a per-class report for each split.
for i in range(5):
    df_train = get_data_train()
    # Unseeded split: every iteration evaluates on a different hold-out set.
    df_train, df_test = train_test_split(df_train, test_size=0.25)

    # Stage 1: classes 1-2 vs classes 3-7.
    model = LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        num_leaves=31,
        num_iterations=100,
        verbose=-1,
        n_jobs=-1
    )
    # Stage 2 (under): class 1 vs class 2.
    model2u = LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        num_leaves=31,
        num_iterations=100,
        verbose=-1,
        n_jobs=-1
    )
    # Stage 2 (above): classes 3-7, weighted with c_weights_34567.
    model2a = LGBMClassifier(
        objective='multiclass',
        num_class=5,
        class_weight=c_weights_34567,
        boosting_type='gbdt',
        num_leaves=31,
        num_iterations=100,
        verbose=-1,
        n_jobs=-1
    )

    # Split the data
    df_train, df_train_under, df_train_above = split_in_two_df(df_train)
    X_train_under, y_train_under = get_X_y(
        df_train_under, target_col='Cover_Type')
    X_train_above, y_train_above = get_X_y(
        df_train_above, target_col='Cover_Type')
    X_train, y_train = get_X_y(df_train, target_col='under2')

    # Train first model
    # model = xgb.XGBClassifier(n_estimators=200, max_depth=15)
    # model = get_stack_clf()
    model1 = train_first_model(X_train, y_train, model)

    # Train second model
    # model2u = xgb.XGBClassifier(n_estimators=200, max_depth=15)
    # model2u = get_stack_clf()
    model2u = train_second_model(
        X_train_under, y_train_under, model2u)

    # model2a = xgb.XGBClassifier(objective="multi:softmax")
    # model2a = get_stack_clf2()
    model2a = train_second_model(
        X_train_above, y_train_above, model2a)

    # Load Testing data
    # df_test = get_data_test()

    # X_test = get_X_y(df_test)
    # Here the hold-out part of the labelled data plays the role of the test set.
    X_test = df_test.drop(["Cover_Type"], axis=1)
    y_test = df_test["Cover_Type"]
    y1_pred = model1.predict(X_test)

    # Store the stage-1 prediction so split_in_two_df can route each row to its stage-2 model.
    X_test["under2"] = y1_pred

    df_test1, df_test_under, df_test_above = split_in_two_df(X_test)
    X_test_under = get_X_y(df_test_under)
    X_test_under.drop(["under2"], axis=1, inplace=True)
    X_test_above = get_X_y(df_test_above)
    X_test_above.drop(["under2"], axis=1, inplace=True)

    y2u_pred = model2u.predict(X_test_under)
    y2a_pred = model2a.predict(X_test_above)

    # Convert prediction into Dataframe
    pred_under = pd.DataFrame({'Id': X_test_under.Id, 'Cover_Type': y2u_pred})
    pred_above = pd.DataFrame({'Id': X_test_above.Id, 'Cover_Type': y2a_pred})

    final_pred = pd.concat([pred_under, pred_above])
    # Both sides are sorted by index so that truth and prediction line up row by row.
    print("Accuracy",accuracy_score(pd.DataFrame(y_test).sort_index()["Cover_Type"], final_pred.sort_index()["Cover_Type"]))
    print(classification_report(pd.DataFrame(y_test).sort_index()["Cover_Type"], final_pred.sort_index()["Cover_Type"]))




Accuracy 0.8748677248677249
              precision    recall  f1-score   support

           1       0.80      0.77      0.79       548
           2       0.78      0.69      0.73       533
           3       0.86      0.88      0.87       520
           4       0.98      0.96      0.97       578
           5       0.87      0.95      0.91       526
           6       0.85      0.90      0.87       547
           7       0.96      0.98      0.97       528

    accuracy                           0.87      3780
   macro avg       0.87      0.87      0.87      3780
weighted avg       0.87      0.87      0.87      3780





Accuracy 0.8751322751322751
              precision    recall  f1-score   support

           1       0.80      0.77      0.79       558
           2       0.78      0.68      0.72       551
           3       0.88      0.88      0.88       514
           4       0.97      0.96      0.96       551
           5       0.88      0.95      0.91       566
           6       0.87      0.92      0.89       530
           7       0.94      0.97      0.96       510

    accuracy                           0.88      3780
   macro avg       0.87      0.88      0.87      3780
weighted avg       0.87      0.88      0.87      3780





Accuracy 0.8637566137566137
              precision    recall  f1-score   support

           1       0.78      0.77      0.78       548
           2       0.75      0.67      0.71       531
           3       0.84      0.88      0.86       543
           4       0.97      0.95      0.96       562
           5       0.87      0.93      0.90       508
           6       0.88      0.88      0.88       547
           7       0.94      0.96      0.95       541

    accuracy                           0.86      3780
   macro avg       0.86      0.86      0.86      3780
weighted avg       0.86      0.86      0.86      3780





Accuracy 0.8767195767195767
              precision    recall  f1-score   support

           1       0.82      0.79      0.81       559
           2       0.79      0.72      0.76       548
           3       0.86      0.86      0.86       535
           4       0.96      0.96      0.96       533
           5       0.89      0.94      0.91       540
           6       0.86      0.89      0.88       535
           7       0.95      0.98      0.96       530

    accuracy                           0.88      3780
   macro avg       0.88      0.88      0.88      3780
weighted avg       0.88      0.88      0.88      3780





Accuracy 0.8613756613756614
              precision    recall  f1-score   support

           1       0.79      0.76      0.77       552
           2       0.78      0.69      0.73       554
           3       0.84      0.87      0.86       543
           4       0.97      0.94      0.96       541
           5       0.88      0.94      0.91       517
           6       0.83      0.87      0.85       542
           7       0.94      0.96      0.95       531

    accuracy                           0.86      3780
   macro avg       0.86      0.86      0.86      3780
weighted avg       0.86      0.86      0.86      3780



In [50]:
# Accuracy for the y_test / final_pred currently held in the kernel (set by an earlier cell).
print("Accuracy",accuracy_score(pd.DataFrame(y_test).sort_index()["Cover_Type"], final_pred.sort_index()["Cover_Type"]))

0.8611111111111112

In [54]:
# Per-class precision / recall for the y_test / final_pred currently held in the kernel.
print(classification_report(pd.DataFrame(y_test).sort_index()["Cover_Type"], final_pred.sort_index()["Cover_Type"]))

              precision    recall  f1-score   support

           1       0.78      0.75      0.76       560
           2       0.77      0.68      0.72       547
           3       0.85      0.87      0.86       542
           4       0.95      0.95      0.95       535
           5       0.89      0.93      0.91       512
           6       0.85      0.89      0.87       527
           7       0.93      0.97      0.95       557

    accuracy                           0.86      3780
   macro avg       0.86      0.86      0.86      3780
weighted avg       0.86      0.86      0.86      3780



## Grid search pour obtenir les best models

### Split 12 vs 34567

In [124]:
from sklearn.model_selection import KFold

# Estimate the accuracy of the stage-1 classifier (classes 1-2 vs 3-7)
# over 10 random 75/25 splits of the labelled data.
# df_train = get_data_train()
# kf = KFold(n_splits=10, shuffle=True)
acc = []
# for i, (train_index, test_index)  in enumerate(kf.split(df_train)):
for i in range(10):
    # Class imbalance is handled with scale_pos_weight only. The former
    # `is_unbalanced=True` argument was a misspelling of LightGBM's
    # `is_unbalance`, which is documented as mutually exclusive with
    # scale_pos_weight, so it was removed.
    model1 = LGBMClassifier(
        objective='binary',
        scale_pos_weight=3,
        boosting_type='gbdt',
        num_leaves=50,
        n_estimators=300,
        verbose=-1,
        n_jobs=-1,
        # early_stopping=5
    )
    df_train = get_data_train()
    df_train, df_test = train_test_split(df_train, test_size=0.25)
    # df_train, df_val = train_test_split(df_train, test_size=0.25)

    # Split the data
    df_train, df_train_under, df_train_above = split_in_two_df(df_train)
    X_train, y_train = get_X_y(df_train, target_col='under2')
    # NOTE(review): the validation part is currently unused (eval_set is
    # commented out below), so the model only sees 75% of the training part.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
    # Train first model
    model1.fit(X_train, y_train)#, eval_set=[(X_val, y_val)])#, early_stopping=10)
    # Predict on the hold-out set, using the same binary target definition.
    X_test = df_test.drop(["Cover_Type"], axis=1)
    y_test = (df_test["Cover_Type"]< 3).astype(int)
    y1_pred = model1.predict(X_test)
    acc.append(accuracy_score(y_test, y1_pred))
    print("i", acc[-1])
print(f"Final accuracy {np.mean(acc):.3f} +/- {np.std(acc):.3f}")

i 0.9629629629629629
i 0.9645502645502646
i 0.9677248677248678
i 0.9650793650793651
i 0.9611111111111111
i 0.9674603174603175
i 0.9674603174603175
i 0.9653439153439154
i 0.9616402116402116
i 0.9656084656084656
Final accuracy 0.965 +/- 0.002


### Split 1 vs 2

In [141]:
# Estimate the accuracy of the stage-2 "under" classifier (class 1 vs class 2)
# over 10 random 75/25 splits of the class 1-2 rows.
acc = []
# for i, (train_index, test_index)  in enumerate(kf.split(df_train)):
for i in range(10):
    # Class imbalance is handled with scale_pos_weight only. The former
    # `is_unbalanced=True` argument was a misspelling of LightGBM's
    # `is_unbalance`, which is documented as mutually exclusive with
    # scale_pos_weight, so it was removed.
    model2u = LGBMClassifier(
        objective='binary',
        scale_pos_weight=1.3,
        boosting_type='gbdt',
        num_leaves=50,
        n_estimators=500,
        verbose=-1,
        n_jobs=-1,
        # early_stopping=5
    )

    df_train = get_data_train()
    # df_train = preprocess(df_train)
    df_train, df_train_under, df_train_above = split_in_two_df(df_train)
    X_train_under, y_train_under = get_X_y(
        df_train_under, target_col='Cover_Type')
    # Shift labels 1/2 to 0/1 so that class 2 is the positive class.
    y_train_under = y_train_under -1

    X_train_under, X_test, y_train_under, y_test = train_test_split(X_train_under, y_train_under, test_size=0.25)

    model2u.fit(X_train_under, y_train_under)#, categorical_feature=['Wilderness_Area_Synth', 'Soil_Type_Synth'])

    y1_pred = model2u.predict(X_test)
    acc.append(accuracy_score(y_test, y1_pred))
    print("i", acc[-1])
print(f"Final accuracy {np.mean(acc):.3f} +/- {np.std(acc):.3f}")

i 0.8518518518518519
i 0.85
i 0.8611111111111112
i 0.8592592592592593
i 0.8574074074074074
i 0.8555555555555555
i 0.8601851851851852
i 0.8481481481481481
i 0.8694444444444445
i 0.8546296296296296
Final accuracy 0.857 +/- 0.006


### Split 3 vs 4 vs 5 vs 6 vs 7

In [149]:
# Estimate the accuracy of the stage-2 "above" classifier (classes 3-7)
# over 10 random splits of the class 3-7 rows.
acc = []
# for i, (train_index, test_index)  in enumerate(kf.split(df_train)):
for i in range(10):
    model2a = LGBMClassifier(
        objective='multiclass',
        num_class=5,
        # Weights are keyed by the original Cover_Type labels.
        class_weight={
            3: 0.15,
            4: 0.03,
            5: 0.15,
            6: 0.15,
            7: 0.15
        },
        boosting_type='gbdt',
        num_leaves=50,
        n_estimators=500,
        verbose=-1,
        n_jobs=-1,
        # early_stopping=5
    )

    df_train = get_data_train()
    df_train = preprocess(df_train)
    df_train, df_train_under, df_train_above = split_in_two_df(df_train)
    X_train_above, y_train_above = get_X_y(
        df_train_above, target_col='Cover_Type')
    # A leftover `y_train_under = y_train_under -1` was removed here: it came
    # from the 1-vs-2 cell, is unrelated to this model, and shifted a stale
    # variable from another cell on every iteration.

    X_train_above, X_test, y_train_above, y_test = train_test_split(X_train_above, y_train_above, test_size=0.25)
    # NOTE(review): the validation part is currently unused (eval_set is
    # commented out below), so the model only sees 75% of the training part.
    X_train_above, X_val, y_train_above, y_val = train_test_split(X_train_above, y_train_above, test_size=0.25)

    model2a.fit(X_train_above, y_train_above)#, eval_set=[(X_val, y_val)])#, categorical_feature=['Wilderness_Area_Synth', 'Soil_Type_Synth'])

    y1_pred = model2a.predict(X_test)
    acc.append(accuracy_score(y_test, y1_pred))
    print("i", acc[-1])
print(f"Final accuracy {np.mean(acc):.3f} +/- {np.std(acc):.3f}")

i 0.9514814814814815
i 0.9422222222222222
i 0.9477777777777778
i 0.9437037037037037
i 0.9477777777777778
i 0.9503703703703704
i 0.9403703703703704
i 0.947037037037037
i 0.9418518518518518
i 0.947037037037037
Final accuracy 0.946 +/- 0.004


### Final pred

In [151]:
def get3model():
    """Create the three unfitted LightGBM models of the two-stage pipeline.

    Returns
    -------
    tuple
        ``(model1, model2u, model2a)``:
        ``model1`` separates classes 1-2 from classes 3-7 (binary),
        ``model2u`` separates class 1 from class 2 (binary),
        ``model2a`` separates classes 3-7 (multiclass, weights keyed by the
        original ``Cover_Type`` labels).

    Notes
    -----
    Class imbalance in the binary models is handled with ``scale_pos_weight``
    only. The former ``is_unbalanced=True`` argument was a misspelling of
    LightGBM's ``is_unbalance``, which is documented as mutually exclusive
    with ``scale_pos_weight``, so it was removed.

    NOTE(review): ``early_stopping=5`` is passed to every model, while the
    fits in this notebook provide no ``eval_set`` -- confirm this is intended.
    """
    # Settings shared by the three models.
    shared = dict(
        boosting_type='gbdt',
        num_leaves=50,
        n_estimators=500,
        verbose=-1,
        n_jobs=-1,
        early_stopping=5,
    )
    model1 = LGBMClassifier(objective='binary', scale_pos_weight=3, **shared)
    model2u = LGBMClassifier(objective='binary', scale_pos_weight=1.3, **shared)
    model2a = LGBMClassifier(
        objective='multiclass',
        num_class=5,
        class_weight={
            3: 0.15,
            4: 0.03,
            5: 0.15,
            6: 0.15,
            7: 0.15
        },
        **shared
    )
    return model1, model2u, model2a

In [156]:
# Fresh, unfitted models for the final submission.
model1, model2u, model2a = get3model()

# Load training data
df_train = get_data_train()
df_train = preprocess(df_train)

# Split the data
df_train, df_train_under, df_train_above = split_in_two_df(df_train)
X_train_under, y_train_under = get_X_y(
    df_train_under, target_col='Cover_Type')
X_train_above, y_train_above = get_X_y(
    df_train_above, target_col='Cover_Type')
X_train, y_train = get_X_y(df_train, target_col='under2')


# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.25)
# Train first model (stage 1: classes 1-2 vs classes 3-7)
model1.fit(X_train, y_train)#, eval_set=[(X_val, y_val)])#, categorical_feature=['Wilderness_Area_Synth', 'Soil_Type_Synth'])

# X_train_under, X_val, y_train_under, y_val = train_test_split(X_train_under, y_train_under, test_size=.25)
# Train second model (stage 2: class 1 vs class 2, original labels)
model2u.fit(X_train_under, y_train_under)#, eval_set=[(X_val, y_val)])#, categorical_feature=['Wilderness_Area_Synth', 'Soil_Type_Synth'])

# X_train_above, X_val, y_train_above, y_val = train_test_split(X_train_above, y_train_above, test_size=.25)
# Stage 2: classes 3-7, original labels
model2a.fit(X_train_above, y_train_above)#, eval_set=[(X_val, y_val)])#, categorical_feature=['Wilderness_Area_Synth', 'Soil_Type_Synth'])

# Load Testing data
df_test = get_data_test()
df_test = preprocess(df_test)

# No target column is requested, so get_X_y returns a copy of the features only.
X_test = get_X_y(df_test)
y1_pred = model1.predict(X_test)

# Store the stage-1 prediction so split_in_two_df can route each row to its stage-2 model.
X_test["under2"] = y1_pred

df_test1, df_test_under, df_test_above = split_in_two_df(X_test)
X_test_under = get_X_y(df_test_under)
X_test_under.drop(["under2"], axis=1, inplace=True)
X_test_above = get_X_y(df_test_above)
X_test_above.drop(["under2"], axis=1, inplace=True)

y2u_pred = model2u.predict(X_test_under)
y2a_pred = model2a.predict(X_test_above)

# Convert prediction into Dataframe
pred_under = pd.DataFrame({'Id': X_test_under.Id, 'Cover_Type': y2u_pred})
pred_above = pd.DataFrame({'Id': X_test_above.Id, 'Cover_Type': y2a_pred})

final_pred = pd.concat([pred_under, pred_above])
# accuracy_score(y_test, final_pred)
final_pred_clean = clean_predictor(y_pred=final_pred.Cover_Type, Id=final_pred.Id)
# NOTE(review): the name used here ("2model + lgbm") differs from the file read
# in the next cell ('Output/2model+lgbm.csv') -- confirm which file is intended.
csv_for_submission(final_pred_clean,"2model + lgbm")

In [159]:
# Distribution of the predicted cover types in the saved submission file.
pd.read_csv('Output/2model+lgbm.csv')['Cover_Type'].value_counts()

2    254536
1    210652
3     38069
7     27269
6     23744
5     23192
4      3550
Name: Cover_Type, dtype: int64

In [161]:
# Per-class sample counts, named in French (un = class 1 ... sept = class 7);
# presumably taken from a submission's value counts -- TODO confirm the source.
un = 211178
deux = 253790
trois = 38299
quatre = 3609
cinq = 23177
six = 23428
sept = 27531
nb_cl = [un, deux, trois, quatre, cinq, six, sept]

# Weight of each class: total / (number of classes * class count).
# Keys are 0-based positions in nb_cl, not Cover_Type labels.
n_all = np.sum(nb_cl)
c_w = {}
for i, e in enumerate(nb_cl):
    c_w[i] = n_all / (7 * e)
    print(c_w[i])


0.3930414829466814
0.32704879737465736
2.1672031720335854
22.998535407512964
3.58121043645486
3.5428425083538624
3.014845602619385


In [162]:
# Display the class-weight table computed in the previous cell.
c_w

{0: 0.3930414829466814,
 1: 0.32704879737465736,
 2: 2.1672031720335854,
 3: 22.998535407512964,
 4: 3.58121043645486,
 5: 3.5428425083538624,
 6: 3.014845602619385}

## Pipeline pour modifier soil type et wilderness

In [4]:
from utils import * 
# Reload the raw training data (still holding the one-hot soil / wilderness columns).
df_train = get_data_train()

In [7]:
def concat_Wilderness_Area(df):
    """Collapse the one-hot ``Wilderness_Area1..4`` columns into one integer column.

    Returns a new frame in which the four indicator columns are replaced by
    ``Wilderness_Area_Synth`` (appended last). The input frame is left
    untouched, so calling this without using the return value has no effect.
    """
    wilderness_areas = [f"Wilderness_Area{i}" for i in range(1, 5)]
    # Dot product of each indicator row with [1, 2, 3, 4]: the index of the
    # active area when exactly one indicator is set, 0 when none is.
    synth = df[wilderness_areas] @ range(1, 5)
    return df.drop(columns=wilderness_areas).assign(Wilderness_Area_Synth=synth)

def concat_Soil_Type(df):
    """Collapse the one-hot ``Soil_Type1..40`` columns into one integer column.

    Returns a new frame in which the forty indicator columns are replaced by
    ``Soil_Type_Synth`` (appended last). The input frame is left untouched,
    so calling this without using the return value has no effect.
    """
    soil_types = [f"Soil_Type{i}" for i in range(1, 41)]
    # Dot product of each indicator row with [1..40]: the index of the active
    # soil type when exactly one indicator is set, 0 when none is.
    synth = df[soil_types] @ range(1, 41)
    return df.drop(columns=soil_types).assign(Soil_Type_Synth=synth)

def preprocess(df):
    """Replace the one-hot soil-type and wilderness-area columns by their synthetic integer columns.

    Soil types are collapsed first, then wilderness areas; the resulting
    frame is returned.
    """
    return concat_Wilderness_Area(concat_Soil_Type(df))

In [None]:
# Inline version of the preprocessing applied to both frames: collapse the
# one-hot indicator columns into one integer column each, then drop them.
# Relies on df_test / df_train already existing in the kernel.
soil_types = [f"Soil_Type{i}" for i in range(1, 41)]
wilderness_areas = [f"Wilderness_Area{i}" for i in range(1,5)]
# Dot product of each indicator row with [1..n]: index of the active indicator.
df_test["Wilderness_Area_Synth"] = df_test[wilderness_areas] @ range(1,5)
df_train["Wilderness_Area_Synth"] = df_train[wilderness_areas] @ range(1,5)
df_test["Soil_Type_Synth"] = df_test[soil_types] @ range(1,41)
df_train["Soil_Type_Synth"] = df_train[soil_types] @ range(1,41)
df_train = df_train.drop(columns=wilderness_areas + soil_types)
df_test = df_test.drop(columns=wilderness_areas + soil_types)