<a href="https://colab.research.google.com/github/JONNY-ME/Landslide-Prevention-and-Innovation-challenge/blob/main/Landslide_prediction_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a name = "Libraries"></a>
## 1. Import relevant libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install catboost --quiet

In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings raised by the model libraries.
warnings.filterwarnings('ignore')

<a name = "Load"></a>
## 2. Load files

In [None]:
# Load the train table, the test table and the sample submission from the Drive folder.
# `path` must keep its trailing slash: file names are appended to it directly.
path = '/content/drive/MyDrive/landslide-prevention-and-innovation-challenge/'
Train, Test, sample_submission = (
    pd.read_csv(f'{path}{stem}.csv')
    for stem in ('Train', 'Test', 'SampleSubmission')
)

<a name = "Preview"></a>
## 3. Preview files

In [None]:
# (rows, columns) of the raw train and test tables, printed on one line.
print(*(frame.shape for frame in (Train, Test)))

In [None]:
# Stack train and test so every engineered feature is computed the same way on both.
# The `train` flag (1 = row from Train, 0 = row from Test) is what later splits them apart.
# ignore_index=True gives the combined frame unique row labels; without it the labels of
# the Train rows and the Test rows overlap, so any label-based lookup would hit two rows.
data = pd.concat([Train.assign(train=1), Test.assign(train=0)], ignore_index=True)
print(data.shape)
data.head(5)

In [None]:
# Row-wise summary statistics over every column whose name ends with a base feature name.
base_fts = ['elevation', 'lsfactor', 'placurv', 'procurv', 'sdoif', 'slope', 'twi', 'aspect']

for feat in base_fts:
    # The column list is fixed before any new `<feat>_*` column is written.
    feat_cols = [name for name in data.columns if name.endswith(feat)]
    for stat in ('mean', 'std', 'max', 'min'):
        data[f'{feat}_{stat}'] = getattr(data[feat_cols], stat)(axis=1)
    data[f'{feat}_range'] = data[f'{feat}_max'] - data[f'{feat}_min']
    data[f'{feat}_sum'] = data[feat_cols].sum(axis=1)

# The geology columns get a different set of aggregates: mode, non-null count and mean.
geology_cols = [name for name in data.columns if name.endswith('geology')]
geology_block = data[geology_cols]
data['geology_mode'] = geology_block.mode(axis=1)[0]   # first (smallest) mode per row
data['geology_count'] = geology_block.count(axis=1)
data['geology_mean'] = geology_block.mean(axis=1)

print(data.shape)
data

In [None]:
# Cell ids compared against cell 13.
# NOTE(review): presumably the four cells directly adjacent to cell 13 in the sampling
# grid — confirm against the dataset description.
neighbours = ["12", "18", "14", "8"]

# Difference between cell 13 and each neighbour cell, for every base feature.
for ft in base_fts:
    for cell in neighbours:
        data[f"{ft}_{cell}_diff"] = data[f"13_{ft}"] - data[f"{cell}_{ft}"]


def _neighbour_cols(feature):
    """Return the columns of `data` named exactly "<cell>_<feature>" for a neighbour cell.

    Matching the full name (instead of testing whether a cell id occurs anywhere in the
    column name) keeps e.g. "8" from also selecting "18_<feature>" or any other column
    that merely contains that digit. Columns are returned in `data`'s own column order.
    """
    wanted = {f"{cell}_{feature}" for cell in neighbours}
    return [col for col in data.columns if col in wanted]


# Row-wise statistics over the neighbour cells only.
for ft in base_fts:
    cols = _neighbour_cols(ft)
    data[f'neighbours_{ft}_mean'] = data[cols].mean(axis=1)
    data[f'neighbours_{ft}_std'] = data[cols].std(axis=1)
    data[f'neighbours_{ft}_max'] = data[cols].max(axis=1)
    data[f'neighbours_{ft}_min'] = data[cols].min(axis=1)
    data[f'neighbours_{ft}_sum'] = data[cols].sum(axis=1)

# Geology gets mode / non-null count / mean instead of the numeric statistics.
for ft in ['geology']:
    cols = _neighbour_cols(ft)
    data[f'neighbours_{ft}_mode'] = data[cols].mode(axis=1)[0]
    data[f'neighbours_{ft}_count'] = data[cols].count(axis=1)
    data[f'neighbours_{ft}_mean'] = data[cols].mean(axis=1)

print(data.shape)

In [None]:
# Undo the earlier stacking: rows flagged train == 1 form the training frame, rows
# flagged train == 0 the test frame. The test frame also drops `Label`.
is_train_row = data['train'] == 1
is_test_row = data['train'] == 0
train_df = data.loc[is_train_row].drop(columns=['train'])
test_df = data.loc[is_test_row].drop(columns=['train', 'Label'])

train_df.shape, test_df.shape

In [None]:
# Columns left out of the model inputs. Names that do not exist in `test_df` are
# ignored by Index.difference, and repeated names have no extra effect.
# NOTE(review): 'Sample_ID' is not in this list, so it appears to stay in as a model
# feature — confirm that this is intended.
excluded_cols = [
    '17_elevation', '23_elevation', '8_aspect', '17_aspect',
    '21_aspect', '4_procurv', '4_lsfactor', '18_lsfactor',
    '19_lsfactor', '21_twi', '2_geology', '5_geology', '7_geology',
    '12_geology', '13_geology', '14_geology', '19_geology',
    '23_geology', '24_geology', '8_sdoif', '10_sdoif', '12_sdoif',
    '15_sdoif', '17_sdoif', '18_sdoif', '24_sdoif', 'elevation_min',
    'sdoif_sum', 'slope_sum', 'aspect_mean', 'geology_count',
    'geology_12_diff', 'geology_18_diff', 'geology_14_diff',
    'geology_8_diff', 'neighbours_lsfactor_mean',
    'neighbours_lsfactor_sum', 'neighbours_placurv_max',
    'neighbours_placurv_range', 'neighbours_procurv_quantile',
    'neighbours_twi_mean', 'neighbours_twi_quantile',
    'neighbours_aspect_mean', 'neighbours_aspect_max',
    'neighbours_aspect_quantile', 'neighbours_geology_count',
    '7_geology', '11_geology', '15_geology', '20_geology',
    'placurv_sum', 'sdoif_mean', 'sdoif_sum', 'sdoif_quantile',
    'slope_sum', 'aspect_sum',
    'neighbours_elevation_quantile',
    'neighbours_sdoif_mean', 'neighbours_sdoif_max',
    'neighbours_sdoif_sum', 'neighbours_sdoif_quantile',
]

# Every remaining test column is a model input; the same columns are taken from train.
main_cols = test_df.columns.difference(excluded_cols)
X = train_df[main_cols]
y = train_df['Label']
test = test_df[main_cols]

<a name = "Model"></a>
## 9. Model training

In [None]:
import time 
from sklearn.metrics import f1_score

def squared_mean(results):
  """Element-wise geometric mean of a sequence of arrays.

  Despite the name nothing is squared: the arrays are multiplied together
  element-wise and the len(results)-th root of the product is returned.

  Args:
    results: non-empty sequence of numpy arrays; each one must broadcast into
      the shape of the first.

  Returns:
    A new float array with the shape of results[0]; the inputs are not modified.
  """
  n_arrays = len(results)
  product = np.ones(results[0].shape)
  for arr in results:
    # Accumulate in place into the float buffer.
    np.multiply(product, arr, out=product)
  return np.power(product, 1 / n_arrays)

class My_model:
    """Ensemble that fits several classifiers and blends their predicted probabilities.

    The per-model probability arrays are combined with `squared_mean`.
    """

    def __init__(self, models):
        # models: list of unfitted estimators exposing fit() and predict_proba().
        self.models = models

    @staticmethod
    def _fit(model, X, y, eval_set):
        """Fit `model`, passing as many of the optional fit() keywords as it accepts.

        Tries (eval_set, verbose=False), then (eval_set) alone, then a plain fit(X, y).
        Some estimators reject `verbose` or `eval_set` in fit(); dropping only the
        rejected keyword keeps the validation set available where it is supported.
        Each failed attempt is reported instead of being swallowed silently; an error
        raised by the final plain fit propagates to the caller.
        """
        optional_kwargs = (
            {"eval_set": eval_set, "verbose": False},
            {"eval_set": eval_set},
        )
        for kwargs in optional_kwargs:
            try:
                model.fit(X, y, **kwargs)
                return
            except Exception as err:
                print(f"fit with ({', '.join(kwargs)}) failed: "
                      f"{type(err).__name__}: {err} -- retrying with fewer arguments")
        model.fit(X, y)

    def fit_eval_pred(self, X, y, eval_set, val, test):
        """Fit every model, score it on the validation data and predict val and test.

        Args:
            X, y: training features and labels.
            eval_set: list of (features, labels) tuples handed to fit() where supported;
                the labels of the first tuple are used for the printed F1 score.
            val: validation features passed to predict_proba().
            test: test features passed to predict_proba().

        Returns:
            (results_eval, results_test): the blended probability arrays for `val` and
            `test`, as returned by `squared_mean` over the per-model predictions.
        """
        results_eval = []
        results_test = []
        for i, model in enumerate(self.models):
            print(f"[Training]............................... Model_{i+1}")
            st = time.time()
            self._fit(model, X, y, eval_set)
            print(f"[Prediction]............................. Model_{i+1}")
            p = model.predict_proba(val)
            # The metric is F1 on the arg-max class (it was previously mislabelled "Log loss").
            print(f"F1 score = {f1_score(eval_set[0][1], np.argmax(p, axis=1))}", end=" "*6)
            results_eval.append(p)
            results_test.append(model.predict_proba(test))
            print(f"Time {time.time() - st :.2f}sec")

        results_eval = squared_mean(results_eval)
        results_test = squared_mean(results_test)

        return results_eval, results_test

In [None]:
import time
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import imblearn.over_sampling as ovs


seed = 21

def get_models(seed=21):
    """Build a fresh, unfitted pair of gradient-boosting classifiers.

    Args:
        seed: value passed as `random_state` to both estimators.

    Returns:
        [CatBoostClassifier, LGBMClassifier], in that order.
    """
    catboost_clf = CatBoostClassifier(
        iterations=10000,
        learning_rate=0.1,
        verbose=200,
        random_state=seed,
        use_best_model=True,
        early_stopping_rounds=200,
    )

    lightgbm_clf = LGBMClassifier(
        n_estimators=10000,
        random_state=seed,
        early_stopping_round=100,
        verbose=-1,
        learning_rate=0.06918642648131483,
        num_leaves=20,
        max_depth=11,
        min_data_in_leaf=200,
        max_bin=200,
        lambda_l1=10,
        lambda_l2=20,
        min_gain_to_split=0.00011324914950008869,
        bagging_fraction=0.6000000000000001,
        bagging_freq=1,
        feature_fraction=0.8,
    )

    return [catboost_clf, lightgbm_clf]

In [None]:
results = []       # blended test-set probabilities, one array per fold
ns = 10            # number of stratified folds
tot = 0            # running sum of the per-fold F1 scores
threshold = .35    # probability cut-off above which class 1 is predicted

skf = StratifiedKFold(n_splits=ns, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(100*"-", f"\nFold-{fold+1}")

    # split() yields positional indices, so rows are selected by position (.iloc);
    # .loc would only be correct while the row labels happen to equal the positions.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample the training part only; the validation fold is left untouched.
    sampler = ovs.SMOTE(random_state=42)
    X_train, y_train = sampler.fit_resample(X_train, y_train)

    # Fresh models per fold; the validation fold serves as eval set and as scoring set.
    main_model = My_model(get_models(seed))
    pred, T_pred = main_model.fit_eval_pred(X_train, y_train, [(X_test, y_test)], X_test, test)

    # Column 1 holds the probability of class 1.
    score = f1_score(y_test, (pred[:, 1] >= threshold).astype(int))

    print(f"f1 score = {score}")
    tot += score

    results.append(T_pred)

    print(100*"-", "\n\n\n")

# This is the mean of the per-fold F1 scores (it was previously mislabelled "log loss").
print(f"Average F1 score = {tot / ns}")

In [None]:
# Top-20 feature importances of the first model in the ensemble (index 0 of the list
# returned by get_models, i.e. the CatBoost classifier). `main_model` is rebound on
# every fold, so this reflects the model fitted in the last fold only.
# NOTE(review): a bare `0.7207142105978677` literal used to open this cell; it had no
# effect (presumably a pasted score) and was removed.
pd.DataFrame({
    "feature":X.columns,
    "values" : main_model.models[0].feature_importances_,
}).sort_values("values", ascending=False)[:20]


<a name = "Predictions"></a>
## 10. Test set predictions

In [None]:
# Average the per-fold test probabilities, then predict class 1 wherever the
# class-1 probability reaches the 0.35 cut-off.
fold_mean_proba = np.mean(results, axis=0)[:, 1]
test_pred = pd.Series((fold_mean_proba >= .35).astype('int64'))
test_pred.value_counts()

<a name = "Submission"></a>
## 11. Creating a submission file

In [None]:
# Pair each test Sample_ID with its predicted label and write the submission CSV.
submission_columns = {'Sample_ID': Test.Sample_ID, 'Label': test_pred}
sub_file = pd.DataFrame(submission_columns)
sub_file.to_csv('/content/submission1.csv', index = False)
sub_file.head()