In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


In [81]:
# Load the cleaned landslide catalogue; the path is relative to the notebook's
# working directory.
landslide_csv_path = "../data/landslideCleaned.csv"
df = pd.read_csv(landslide_csv_path)

In [82]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

Unnamed: 0,event_id,event_date,event_year,event_month,event_time,event_hour,event_am_pm,landslide_category,landslide_trigger,landslide_size,...,fatality_count,injury_count,country_name,country_code,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,longitude,latitude
0,8253,2014-12-21 08:10:00,2014,December,08:10:00,8,AM,landslide,unknown,unknown,...,0.0,0.0,United States,US,Oregon,12883.0,Saint Helens,0.87786,-122.8178,45.8647
1,9958,2017-07-03 14:33:00,2017,July,14:33:00,14,PM,landslide,downpour,small,...,0.0,0.0,unknown,unknown,unknown,-1.0,unknown,-1.0,105.529997,23.14383
2,955,2009-01-02 20:30:00,2009,January,20:30:00,20,PM,landslide,downpour,medium,...,0.0,1.0,Philippines,PH,Bicol,6721.0,Matnog,5.5353,124.0419,12.5655
3,9810,2017-05-19 20:14:00,2017,May,20:14:00,20,PM,mudslide,rain,small,...,0.0,0.0,unknown,unknown,unknown,-1.0,unknown,-1.0,-83.712126,36.770029
4,9962,2017-07-03 14:33:00,2017,July,14:33:00,14,PM,landslide,downpour,small,...,0.0,0.0,unknown,unknown,unknown,-1.0,unknown,-1.0,104.453085,22.690403


In [83]:
# Summarise column dtypes and non-null counts before any transformation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5351 entries, 0 to 5350
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   event_id                   5351 non-null   int64  
 1   event_date                 5351 non-null   object 
 2   event_year                 5351 non-null   int64  
 3   event_month                5351 non-null   object 
 4   event_time                 5351 non-null   object 
 5   event_hour                 5351 non-null   int64  
 6   event_am_pm                5351 non-null   object 
 7   landslide_category         5351 non-null   object 
 8   landslide_trigger          5351 non-null   object 
 9   landslide_size             5351 non-null   object 
 10  landslide_setting          5351 non-null   object 
 11  fatality_count             5351 non-null   float64
 12  injury_count               5351 non-null   float64
 13  country_name               5351 non-null   objec

In [84]:
# Converting feature variables into numerical columns for regression modelling.
#
# Each source column below is cast to pandas' categorical dtype and its integer
# category codes are written to the paired target column. The mapping is
# explicit because the target names do not follow one pattern
# ('landslide_category' -> 'landslide_code').
CATEGORY_CODE_COLUMNS = {
    'landslide_category': 'landslide_code',
    'landslide_size': 'landslide_size_code',
    'landslide_trigger': 'landslide_trigger_code',
    'landslide_setting': 'landslide_setting_code',
}

for source_col, code_col in CATEGORY_CODE_COLUMNS.items():
    df[source_col] = df[source_col].astype('category')
    df[code_col] = df[source_col].cat.codes

# 'country_code' is the one column replaced by its own codes rather than given
# a companion column. Guard the conversion so the cell is safe to re-run:
# re-casting the integer codes to category would rebuild the categories from
# whatever values are still present in the frame, so running this again after
# rows have been filtered out could silently renumber the countries.
if not pd.api.types.is_numeric_dtype(df['country_code']):
    df['country_code'] = df['country_code'].astype('category').cat.codes

In [85]:
# Re-inspect the frame to confirm the integer *_code columns were appended.
df.head()

Unnamed: 0,event_id,event_date,event_year,event_month,event_time,event_hour,event_am_pm,landslide_category,landslide_trigger,landslide_size,...,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,longitude,latitude,landslide_code,landslide_size_code,landslide_trigger_code,landslide_setting_code
0,8253,2014-12-21 08:10:00,2014,December,08:10:00,8,AM,landslide,unknown,unknown,...,Oregon,12883.0,Saint Helens,0.87786,-122.8178,45.8647,5,4,15,2
1,9958,2017-07-03 14:33:00,2017,July,14:33:00,14,PM,landslide,downpour,small,...,unknown,-1.0,unknown,-1.0,105.529997,23.14383,5,3,3,2
2,955,2009-01-02 20:30:00,2009,January,20:30:00,20,PM,landslide,downpour,medium,...,Bicol,6721.0,Matnog,5.5353,124.0419,12.5655,5,2,3,12
3,9810,2017-05-19 20:14:00,2017,May,20:14:00,20,PM,mudslide,rain,small,...,unknown,-1.0,unknown,-1.0,-83.712126,36.770029,6,3,12,13
4,9962,2017-07-03 14:33:00,2017,July,14:33:00,14,PM,landslide,downpour,small,...,unknown,-1.0,unknown,-1.0,104.453085,22.690403,5,3,3,2


In [86]:
# Keep only rows where both the population and the gazetteer distance are
# non-negative. In the head() preview these two columns hold -1.0 on the rows
# whose location fields read 'unknown', so this appears to drop events without
# a resolved location -- confirm against the data-cleaning step.
population_ok = df['admin_division_population'] >= 0
distance_ok = df['gazeteer_distance'] >= 0
df = df[population_ok & distance_ok]

In [87]:
# List every available column name before choosing the modelling subset.
df.columns

Index(['event_id', 'event_date', 'event_year', 'event_month', 'event_time',
       'event_hour', 'event_am_pm', 'landslide_category', 'landslide_trigger',
       'landslide_size', 'landslide_setting', 'fatality_count', 'injury_count',
       'country_name', 'country_code', 'admin_division_name',
       'admin_division_population', 'gazeteer_closest_point',
       'gazeteer_distance', 'longitude', 'latitude', 'landslide_code',
       'landslide_size_code', 'landslide_trigger_code',
       'landslide_setting_code'],
      dtype='object')

In [88]:
# Numeric columns retained for the regression frame, in the order they will
# appear in it.
columns_to_keep = [
    # casualty counts
    'fatality_count',
    'injury_count',
    # location descriptors
    'country_code',
    'admin_division_population',
    'gazeteer_distance',
    'longitude',
    'latitude',
    # integer category codes
    'landslide_code',
    'landslide_size_code',
    'landslide_trigger_code',
    'landslide_setting_code',
]

In [89]:
# Project the frame down to the selected modelling columns (all rows kept).
regression_data = df.loc[:, columns_to_keep]

In [90]:
# Check row count, dtypes and non-null counts of the modelling frame.
regression_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3876 entries, 0 to 4299
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   fatality_count             3876 non-null   float64
 1   injury_count               3876 non-null   float64
 2   country_code               3876 non-null   int8   
 3   admin_division_population  3876 non-null   float64
 4   gazeteer_distance          3876 non-null   float64
 5   longitude                  3876 non-null   float64
 6   latitude                   3876 non-null   float64
 7   landslide_code             3876 non-null   int8   
 8   landslide_size_code        3876 non-null   int8   
 9   landslide_trigger_code     3876 non-null   int8   
 10  landslide_setting_code     3876 non-null   int8   
dtypes: float64(6), int8(5)
memory usage: 230.9 KB


In [91]:
# Fit an ordinary least-squares baseline that predicts the fatality count from
# every other retained column, then score it on a held-out 20% split.
#
# NOTE(review): several predictors are integer category codes; a linear model
# reads them as ordered magnitudes. Consider one-hot encoding them -- confirm
# whether that is intended for this baseline.
TARGET_COLUMN = 'fatality_count'
y = regression_data[TARGET_COLUMN]
X = regression_data.drop(columns=[TARGET_COLUMN])

# Fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics: RMSE is derived from MSE so it is in the target's units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")

R² Score: 0.1352
Mean Squared Error: 166.91
Root Mean Squared Error: 12.92
