# **Import**

In [None]:
import folium
import json
from folium import plugins
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from scipy.stats import f_oneway
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import clear_output
import scipy as sp
import scipy.sparse
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler,MaxAbsScaler,RobustScaler, MinMaxScaler,FunctionTransformer, maxabs_scale
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.feature_selection import chi2
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import PCA 
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder

# **Functions**

In [None]:
# FUNCTION FOR PREPROCESSING

# Group classes in landslide_size
def combine_landslide_size(df_size):
  """Fold the 'catastrophic' size class into 'very_large'.

  The replacement is applied in place on ``df_size`` and that same object
  is returned.
  """
  merged_classes = {'catastrophic': 'very_large'}
  df_size.replace(merged_classes, inplace=True)
  return df_size

# Combine features with high correlation
def combine_features(df, list_features):
  """Return the row-wise mean of the columns named in ``list_features``."""
  selected = df[list_features]
  row_means = selected.mean(axis='columns')
  return row_means

# Combine text description columns
def combine_descriptions(df, description_columns):
  """Concatenate several text columns into one 'description' column.

  Every column in ``description_columns`` has its missing values replaced
  by an empty string (written back to ``df``), then the columns are joined
  in the given order with a single space appended after each one.

  Args:
    df: DataFrame holding the text columns; it gains a 'description' column.
    description_columns: names of the text columns to join, in order.

  Returns:
    The new ``df['description']`` Series.
  """
  combined = pd.Series('', index=df.index)
  for col in description_columns:
    # Fill by assignment (not a chained inplace call) so the fill really
    # reaches df, and do it for every column: one NaN would otherwise turn
    # the whole concatenated text of that row into NaN.
    df[col] = df[col].fillna('')
    combined = combined + df[col] + " "
  df['description'] = combined
  return df['description']

# Combine soil columns into one column
def combine_soils(df, soil_columns):
  """Collapse several soil columns into one by taking the per-row mode.

  Args:
    df: DataFrame containing the soil columns; it is not modified.
    soil_columns: names of the columns to combine.

  Returns:
    A Series named 'soil', aligned with ``df.index``, holding for each row
    the most frequent value across ``soil_columns``. Ties resolve to the
    first entry of ``Series.mode()``; a row with no non-missing value
    yields NaN.
  """
  def _first_mode(row):
    modes = row.mode()
    return modes.iloc[0] if not modes.empty else np.nan

  # Iterate over the rows themselves instead of range(n) + .loc[i], so the
  # result is correct for any index (filtered, shuffled, non-integer).
  soil_values = [_first_mode(row) for _, row in df[soil_columns].iterrows()]
  return pd.Series(soil_values, index=df.index, name='soil')

In [None]:
# PREPROCESSING FUNCTION

def preprocessing(df):
  """Apply the extra preprocessing steps to the landslide frame.

  Steps, in order: merge landslide-size classes, replace two groups of
  highly correlated columns by their row-wise mean, join the text columns
  into 'description', reduce the soil-texture columns to 'soil_texture',
  then drop the source columns and 'stations'.

  ``df`` is modified in place and also returned.
  """

  # Merge classes of the target column
  df['landslide_size'] = combine_landslide_size(df['landslide_size'])

  # Each group of highly correlated columns is replaced by one averaged column
  population_columns = ['population_density_2000', 'population_density_2005', 'population_density_2010', 'population_density_2015', 'population_density_2020']
  dew_temp_avg_columns = ['dew', 'feelslikemin', 'feelslikemax', 'feelslike', 'tempmin', 'tempmax', 'temp']
  averaged_groups = {
    'population_density_avg': population_columns,
    'dew_temp_avg': dew_temp_avg_columns,
  }
  for averaged_name, members in averaged_groups.items():
    df[averaged_name] = combine_features(df, members)

  # One text column instead of three
  description_columns = ['event_description', 'event_title', 'location_description']
  df['description'] = combine_descriptions(df, description_columns)

  # One soil texture column instead of one per 'soil_texture_*' column
  soil_columns = ['soil_texture_0', 'soil_texture_10', 'soil_texture_30', 'soil_texture_60', 'soil_texture_100', 'soil_texture_200']
  df['soil_texture'] = combine_soils(df, soil_columns)

  # Remove the columns that were just summarised, then 'stations'
  obsolete_columns = [*population_columns, *dew_temp_avg_columns, *description_columns, *soil_columns]
  df.drop(columns=obsolete_columns, inplace=True)
  df.drop(columns='stations', inplace=True)

  return df

In [None]:
# FUNCTIONS FOR FEATURE SELECTION

# Chi-square: categorical features and target
def chisquare(df, cate_cols):
  """Rank categorical columns against 'landslide_size' with a chi-square test.

  Every column in ``cate_cols`` is label-encoded on its own;
  'landslide_size' (which must be among them) is the target and the rest
  are scored with ``chi2`` through ``SelectKBest(k='all')``.

  Returns:
    DataFrame with columns 'Column name', 'Chi-square' and 'P-values',
    sorted by score (descending) then by name, with a fresh 0-based index.
  """

  encoded = df[cate_cols].apply(LabelEncoder().fit_transform)
  target = encoded['landslide_size']
  predictors = encoded.drop('landslide_size', axis=1)

  # k='all' keeps every column; the selector is only used for its scores
  selector = SelectKBest(chi2, k='all')
  selector.fit(predictors, target)

  kept = selector.get_support()
  rows = list(zip(predictors.columns.values[kept], selector.scores_[kept], selector.pvalues_[kept]))

  ranking = pd.DataFrame(data=rows, columns=['Column name', 'Chi-square', 'P-values'])
  return ranking.sort_values(['Chi-square', 'Column name'], ascending=[False, True]).reset_index(drop=True)

# ANOVA: numerical features and target
def anova(df, num_cols):
  """Rank numerical columns against 'landslide_size' with an ANOVA F-test.

  Args:
    df: DataFrame holding the numerical columns and 'landslide_size'.
    num_cols: names of the numerical columns to score. If 'event_date' is
      among them it is converted to int64 before scoring.

  Returns:
    DataFrame with columns 'Column name', 'F-Scores' and 'P-values',
    sorted by score (descending) then by name, with a fresh 0-based index.
  """

  # Explicit copy: assigning into a slice of df raises SettingWithCopyWarning
  # and is not guaranteed to take effect.
  df_num = df[num_cols].copy()
  if 'event_date' in df_num.columns:
    # 'int64' rather than int: the width of int is platform-dependent and a
    # datetime column cannot be cast to a 32-bit integer.
    df_num['event_date'] = df_num['event_date'].astype('int64')
  # Kept so the notebook output is cleared exactly as before
  clear_output()

  y_array = LabelEncoder().fit_transform(df['landslide_size'])

  # k='all' keeps every column; the selector is only used for its scores
  selector = SelectKBest(f_classif, k='all')
  selector.fit(df_num.values, y_array)

  support = selector.get_support()
  names = df_num.columns.values[support]
  scores = selector.scores_[support]
  p_values = selector.pvalues_[support]

  df_anova = pd.DataFrame(data=list(zip(names, scores, p_values)), columns=['Column name', 'F-Scores', 'P-values'])
  df_anova = df_anova.sort_values(['F-Scores', 'Column name'], ascending=[False, True]).reset_index(drop=True)

  return df_anova

In [None]:
# FEATURE SELECTION FUNCTION

def feature_selection(df, num_threshold, cate_threshold):
  """Pick the best-ranked categorical and numerical columns of ``df``.

  Object-dtype columns are ranked by ``chisquare`` and all other columns by
  ``anova``. The rankings are cut with a label slice ``.loc[:threshold]``,
  which is inclusive: with the 0-based index those helpers return, up to
  ``threshold + 1`` columns are kept from each ranking.

  Returns:
    List of the selected column names (order not guaranteed).
  """

  categorical = df.select_dtypes('object').columns
  numerical = df.select_dtypes(exclude='object').columns

  chi2_ranking = chisquare(df, categorical)
  anova_ranking = anova(df, numerical)

  best_categorical = chi2_ranking.loc[:cate_threshold, 'Column name']
  best_numerical = anova_ranking.loc[:num_threshold, 'Column name']

  return list(set(best_categorical) | set(best_numerical))

In [None]:
# FEATURE ENGINEERING FUNCTION

def feature_engineering(X, y):
  """Turn the selected features into scaled train/test matrices.

  Steps: convert 'event_date' (if present) to int64, one-hot encode the
  object columns other than 'description', split 70/30 with
  ``random_state=0``, TF-IDF encode 'description' (fitted on the training
  part only), stack the text and tabular features, and scale with
  ``MaxAbsScaler`` (also fitted on the training part only).

  Args:
    X: feature DataFrame; must contain a 'description' column. It is not
      modified.
    y: target values aligned with ``X``.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` where the X parts are sparse
    matrices with the text features first.
  """

  # X is usually a slice of a larger frame; copy so the assignments below
  # neither warn nor leak into the caller's data.
  X = X.copy()

  # Convert datetime to integer ('int64': the width of int is platform-dependent)
  if 'event_date' in X.columns:
    X['event_date'] = X['event_date'].astype('int64')

  # One-hot Encoding for categorical features (except 'description' column)
  cate_cols = [col for col in X.columns if X[col].dtype == 'object' and col != 'description']
  X = pd.get_dummies(X, columns=cate_cols)

  # Split train-test set
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

  # TFIDF transform 'description' column
  vec = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')
  text_train_vec = vec.fit_transform(X_train['description'])
  text_test_vec = vec.transform(X_test['description'])

  # Cast the tabular part to float: recent pandas emits boolean dummy
  # columns, and a frame mixing bool and numeric columns would otherwise be
  # stacked as an object array.
  tabular_train = X_train.drop(columns=['description']).astype('float64')
  tabular_test = X_test.drop(columns=['description']).astype('float64')

  X_train = sp.sparse.hstack([text_train_vec, tabular_train])
  X_test = sp.sparse.hstack([text_test_vec, tabular_test])

  # Scale features
  scaler = MaxAbsScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, X_test, y_train, y_test

In [None]:
# OVERSAMPLING FOR IMBALANCED CLASSES

def oversampling(X_train, y_train, random_state=None):
  """Oversample the training set with ADASYN.

  Args:
    X_train: training features.
    y_train: training labels.
    random_state: seed forwarded to ADASYN. The default ``None`` keeps the
      previous unseeded behaviour; pass an int for reproducible resampling.

  Returns:
    ``(X_train_os, y_train_os)``, the resampled features and labels.
  """

  oversampling_technique = ADASYN(random_state=random_state)
  X_train_os, y_train_os = oversampling_technique.fit_resample(X_train, y_train)

  return X_train_os, y_train_os

In [None]:
# MODEL DEVELOPMENT FUNCTION

def model_development(X_train, X_test, y_train, y_test):
  """Fit four baseline classifiers and score them on the test set.

  The classifiers are RandomForest, PassiveAggressive, LogisticRegression
  (``max_iter=500``) and SVC, all with ``random_state=42``.

  Returns:
    DataFrame with one row per classifier and the columns 'model' (the
    classifier's string form), 'accuracy' and 'f1-macro'.
  """

  classifiers = [RandomForestClassifier(random_state=42),
                 PassiveAggressiveClassifier(random_state=42),
                 LogisticRegression(max_iter=500, random_state=42),
                 SVC(random_state=42)]

  # Collect plain rows and build the frame once: DataFrame.append no longer
  # exists in pandas 2.x, and appending row by row copied the frame each time.
  score_rows = []
  for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    score_rows.append({'model': str(classifier),
                       'accuracy': accuracy_score(y_test, y_pred),
                       'f1-macro': f1_score(y_test, y_pred, average="macro")})

  return pd.DataFrame(score_rows, columns=['model', 'accuracy', 'f1-macro'])

In [None]:
# LANDSLIDE SIZE CLASSIFICATION FUNCTION

def size_classification(df, num_threshold, cate_threshold, isOversampled):
  """Run the whole landslide-size pipeline and return the model scores.

  Args:
    df: raw feature DataFrame, passed to ``preprocessing``.
    num_threshold: cut-off passed to ``feature_selection`` for numerical columns.
    cate_threshold: cut-off passed to ``feature_selection`` for categorical columns.
    isOversampled: when truthy, the training set is oversampled before fitting.

  Returns:
    The score table produced by ``model_development``.
  """

  # Additional preprocessing, then hide whatever it printed
  prepared = preprocessing(df)
  clear_output()

  # Keep only the best-ranked columns as model inputs
  chosen_columns = feature_selection(prepared, num_threshold, cate_threshold)

  # Encode, split and scale; then hide whatever that printed
  splits = feature_engineering(prepared[chosen_columns], prepared['landslide_size'])
  clear_output()
  X_train, X_test, y_train, y_test = splits

  # Optional oversampling of the training set only
  if isOversampled:
    X_train, y_train = oversampling(X_train, y_train)

  return model_development(X_train, X_test, y_train, y_test)

# **Implementation**

## **Before oversampling**

In [None]:
# Baseline run: reload the data and train without oversampling
df = pd.read_csv(
  '/content/GLC_features_preprocessed.csv',
  parse_dates=['event_date'],
  na_filter=False,
)

result = size_classification(df, num_threshold=15, cate_threshold=8, isOversampled=0)

result

Unnamed: 0,model,accuracy,f1-macro
0,RandomForestClassifier(random_state=42),0.713623,0.481892
1,PassiveAggressiveClassifier(random_state=42),0.715407,0.519439
2,"LogisticRegression(max_iter=500, random_state=42)",0.718973,0.511298
3,SVC(random_state=42),0.71719,0.472977


## **After oversampling**

In [None]:
# Same run as the baseline, but with the training set oversampled
df = pd.read_csv(
  '/content/GLC_features_preprocessed.csv',
  parse_dates=['event_date'],
  na_filter=False,
)

result = size_classification(df, num_threshold=15, cate_threshold=8, isOversampled=1)

result

Unnamed: 0,model,accuracy,f1-macro
0,RandomForestClassifier(random_state=42),0.716476,0.498321
1,PassiveAggressiveClassifier(random_state=42),0.711484,0.520779
2,"LogisticRegression(max_iter=500, random_state=42)",0.71505,0.521931
3,SVC(random_state=42),0.706847,0.477818


## **GridSearch for Logistic Regression**

In [None]:
# Rebuild the oversampled training data outside size_classification so the
# train/test matrices stay available for the grid search
df = pd.read_csv(
  '/content/GLC_features_preprocessed.csv',
  parse_dates=['event_date'],
  na_filter=False,
)

# Additional preprocessing
df = preprocessing(df)

clear_output()

# Columns used as model inputs
features_selected = feature_selection(df, num_threshold=15, cate_threshold=8)

# Independent and dependent features
X, y = df[features_selected], df['landslide_size']

# Encode, split and scale
X_train, X_test, y_train, y_test = feature_engineering(X, y)

clear_output()

# Oversample the training set only
X_train, y_train = oversampling(X_train, y_train)

# The default solver (lbfgs) supports only the l2 penalty, so with a single
# grid every l1 candidate fails to fit. The l2 candidates stay on the default
# solver and the l1 candidates get 'saga', which supports l1.
C_values = np.logspace(-3, 3, 7)
grid = [
  {"C": C_values, "penalty": ["l2"]},
  {"C": C_values, "penalty": ["l1"], "solver": ["saga"]},
]
# random_state only matters for the stochastic saga solver
logreg = LogisticRegression(random_state=42)
logreg_cv = GridSearchCV(logreg, grid, cv=10)

logreg_cv.fit(X_train, y_train)

y_pred = logreg_cv.predict(X_test)

# Accuracy and macro F1 of the best estimator on the held-out test set
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average="macro")

(0.7171897289586305, 0.5225880890952899)