In [1]:
import os
import pandas as pd
pd.set_option("display.max_columns", None)
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.wkt import loads
from shapely.geometry import Point, MultiPolygon, Polygon

In [2]:
df1 = pd.read_csv('city_dataset.csv')
df2 = pd.read_csv('barangays_dataset.csv')
df3 = gpd.read_file('geospatial data (PH Landslides)/philippines_landslides.shp')
df4 = pd.read_csv("monthly_median_soil_moisture_with_coords_city_year_month_2015_2024.csv")
df5 = pd.read_csv("HeavyRainfallPH.csv")

In [3]:
print("Shape of city_dataset: ", df1.shape)
print("Shape of barangays_dataset: ", df2.shape)
print("Shape of philippines_landslides: ", df3.shape)
print("Shape of soil_moisture: ", df4.shape)
print("Shape of RainFall: ", df5.shape)

Shape of city_dataset:  (1444, 5)
Shape of barangays_dataset:  (42058, 7)
Shape of philippines_landslides:  (675, 33)
Shape of soil_moisture:  (316492, 9)
Shape of RainFall:  (4736, 6)


In [4]:
df1.head(2)

Unnamed: 0,city,region,province,city_code,WKT
0,ABORLAN,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,MULTIPOLYGON (((118.511668551 9.33087966600004...
1,ABRA DE ILOG,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,"POLYGON ((120.633791242 13.2827529140001, 120...."


In [5]:
df2.head(2)

Unnamed: 0,WKT,region,province,city_code,city,barangays,geometry
0,MULTIPOLYGON (((120.254410793 15.8811169680001...,REGION I (ILOCOS REGION),PANGASINAN,PH015502000,AGUILAR,Pogomboa,MULTIPOLYGON (((120.254410793 15.8811169680001...
1,MULTIPOLYGON (((120.263724985 15.8172401300001...,REGION I (ILOCOS REGION),PANGASINAN,PH015502000,AGUILAR,Pogonsili,MULTIPOLYGON (((120.263724985 15.8172401300001...


In [6]:
df3.head(2)

Unnamed: 0,index,source_nam,source_lin,event_id,event_date,event_time,event_titl,event_desc,location_d,location_a,landslide_,landslid_1,landslid_2,landslid_3,fatality_c,injury_cou,storm_name,photo_link,notes,event_impo,event_im_1,country_na,country_co,admin_divi,admin_di_1,gazeteer_c,gazeteer_d,submitted_,created_da,last_edite,longitude,latitude,geometry
0,4,The Freeman,http://www.philstar.com/cebu-news/621414/lands...,2603,10/16/2010 12:00:00 PM,12:00,sitio Bakilid in barangay Lahug,Another landslide in sitio Bakilid in barangay...,sitio Bakilid in barangay Lahug,5km,landslide,tropical_cyclone,medium,unknown,0.0,,Supertyphoon Juan (Megi),,,glc,2603.0,Philippines,PH,Central Visayas,798634.0,Cebu City,2.02204,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,123.8978,10.3336,POINT (123.8978 10.3336)
1,5,BusinessWorld Online,http://www.bworldonline.com/content.php?sectio...,4203,02/16/2012 12:00:00 AM,,"Paguite, Abuyog, Leyte",Thursdayâs landslides were noted in Barangay...,"Paguite, Abuyog, Leyte",5km,landslide,downpour,medium,unknown,0.0,,,,,glc,4203.0,Philippines,PH,Eastern Visayas,2404.0,Balinsacayao,2.28967,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,124.9668,10.7004,POINT (124.9668 10.7004)


In [7]:
df4.head(2)

Unnamed: 0,system:index,city,closest_city,latitude,longitude,month,soil_moisture_am,year,.geo
0,0_0,Davao Del Sur,<Feature>,5.69981,125.436255,4.0,0.402025,2015.0,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,0_1,Saranggani,<Feature>,5.780659,125.274558,4.0,0.374269,2015.0,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [8]:
df4['.geo'][0]

'{"geodesic":false,"type":"Point","coordinates":[125.43625469802937,5.699810477738364]}'

In [9]:
df5.head(2)

Unnamed: 0,Report No.,Date,Time of Report,Region,Weather System,Effects
0,1,"February 17, 2021",05:09 pm,Region 1,Heavy Rainfall Warning,Heavy Rainfall Warning
1,2,"February 22, 2021",05:00 pm,"Region 5, Region 8",Low Pressure Area (LPA),Light to moderate with at times heavy rains af...


In [10]:
# Function to extract coordinates
def extract_coordinates(wkt):
    try:
        geom = loads(wkt)
        if geom.geom_type == 'Point':
            return geom.x, geom.y
        elif geom.geom_type in ['Polygon', 'MultiPolygon']:
            # Extract centroid or representative point
            point = geom.centroid if geom.geom_type == 'Polygon' else geom.representative_point()
            return point.x, point.y
    except Exception as e:
        print(f"Error parsing WKT: {e}")
        return None, None

In [11]:
# Apply the function
df1[['longitude', 'latitude']] = df1['WKT'].apply(lambda x: pd.Series(extract_coordinates(x)))
# Apply the function
df2[['longitude', 'latitude']] = df2['WKT'].apply(lambda x: pd.Series(extract_coordinates(x)))

In [12]:
df1.head(2)

Unnamed: 0,city,region,province,city_code,WKT,longitude,latitude
0,ABORLAN,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,MULTIPOLYGON (((118.511668551 9.33087966600004...,118.453062,9.500632
1,ABRA DE ILOG,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,"POLYGON ((120.633791242 13.2827529140001, 120....",120.723685,13.39744


In [13]:
df2.head(2)

Unnamed: 0,WKT,region,province,city_code,city,barangays,geometry,longitude,latitude
0,MULTIPOLYGON (((120.254410793 15.8811169680001...,REGION I (ILOCOS REGION),PANGASINAN,PH015502000,AGUILAR,Pogomboa,MULTIPOLYGON (((120.254410793 15.8811169680001...,120.243705,15.882583
1,MULTIPOLYGON (((120.263724985 15.8172401300001...,REGION I (ILOCOS REGION),PANGASINAN,PH015502000,AGUILAR,Pogonsili,MULTIPOLYGON (((120.263724985 15.8172401300001...,120.203479,15.783225


# Merge Using Nearest Neighbor Approach:

In [14]:
from scipy.spatial import cKDTree

# Define a function to find the nearest neighbors
def merge_nearest(df1, df2):
    # Create KDTree for efficient nearest neighbor search
    tree = cKDTree(df2[['latitude', 'longitude']].values)
    distances, indices = tree.query(df1[['latitude', 'longitude']].values, k=1)

    # Merge the data based on nearest points
    merged = df1.join(df2.iloc[indices].reset_index(drop=True), lsuffix='_df1', rsuffix='_df2')

    # Clean up and rename columns if necessary
    merged = merged.rename(columns={'latitude_df1': 'latitude', 'longitude_df1': 'longitude'})
    merged = merged.drop(columns=['latitude_df2', 'longitude_df2'], errors='ignore')

    return merged

In [15]:
# Perform nearest-neighbor merges sequentially
merge1 = merge_nearest(df1, df2)
merge2 = merge_nearest(merge1, df3)
df = merge_nearest(merge2, df4)

In [16]:
# Display the merged DataFrame
df.head(3)

Unnamed: 0,city_df1,region_df1,province_df1,city_code_df1,WKT_df1,longitude,latitude,WKT_df2,region_df2,province_df2,city_code_df2,city_df2,barangays,geometry_df1,index,source_nam,source_lin,event_id,event_date,event_time,event_titl,event_desc,location_d,location_a,landslide_,landslid_1,landslid_2,landslid_3,fatality_c,injury_cou,storm_name,photo_link,notes,event_impo,event_im_1,country_na,country_co,admin_divi,admin_di_1,gazeteer_c,gazeteer_d,submitted_,created_da,last_edite,geometry_df2,system:index,city,closest_city,month,soil_moisture_am,year,.geo
0,ABORLAN,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,MULTIPOLYGON (((118.511668551 9.33087966600004...,118.453062,9.500632,MULTIPOLYGON (((118.505045795 9.46650979600003...,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,ABORLAN,Cabigaan,MULTIPOLYGON (((118.505045795 9.46650979600003...,1767,GMA News,http://www.gmanetwork.com/news/story/516206/ne...,7074,07/04/2015 05:00:00 PM,17:00,Km 35 and 37 at Montible-Napsan Road in Puerto...,,Km 35 and 37 at Montible-Napsan Road in Puerto...,25km,landslide,tropical_cyclone,small,unknown,0.0,0.0,Tropical Storm Egay,,,glc,7074.0,Philippines,PH,Mimaropa,4141.0,Irahuan,21.05386,07/06/2015 03:08:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,POINT (118.5489 9.6811),70_1140,Palawan,<Feature>,2.0,0.388546,2021.0,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,ABRA DE ILOG,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,"POLYGON ((120.633791242 13.2827529140001, 120....",120.723685,13.39744,"MULTIPOLYGON (((120.861720004 13.380864967,120...",REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,ABRA DE ILOG,San Vicente,"MULTIPOLYGON (((120.861720004 13.380864967, 12...",3262,newsinfo.inquirer.net,http://newsinfo.inquirer.net/496657/woman-dies...,5558,09/27/2013 10:30:00 PM,22:00,"Inicbulan In Bauan, Batangas, Calabarzon",A woman died while her husband and 8-month-old...,"Inicbulan In Bauan, Batangas, Calabarzon",5km,landslide,continuous_rain,medium,unknown,1.0,1.0,,,,glc,5558.0,Philippines,PH,Calabarzon,2570.0,Inicbulan,0.1481,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,POINT (120.9839 13.8217),92_1808,Mindoro Occidental,<Feature>,12.0,0.583229,2022.0,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,ABUCAY,REGION III (CENTRAL LUZON),BATAAN,PH030801000,"POLYGON ((120.548766207 14.716234408, 120.5487...",120.487222,14.721644,"MULTIPOLYGON (((120.535795766 14.716939589,120...",REGION III (CENTRAL LUZON),BATAAN,PH030801000,ABUCAY,Salian,"MULTIPOLYGON (((120.535795766 14.716939589, 12...",2237,mb,http://www.mb.com.ph/articles/324320/dpwh-clea...,3655,06/26/2011 12:00:00 AM,,Blocked the zigzag road going to Mariveles tow...,District Engineer Medel Chua of the DPWH 2nd e...,Blocked the zigzag road going to Mariveles tow...,25km,landslide,tropical_cyclone,medium,unknown,0.0,,Tropical Storm Falcon,,,glc,3655.0,Philippines,PH,Central Luzon,72954.0,Balanga,0.8511,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,POINT (120.5333 14.6833),64_2056,Bataan,<Feature>,8.0,0.552908,2020.0,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [17]:
df.shape

(1444, 52)

In [18]:
df.isnull().sum()

city_df1               0
region_df1             0
province_df1           0
city_code_df1          0
WKT_df1                0
longitude              0
latitude               0
WKT_df2                0
region_df2             0
province_df2           0
city_code_df2          0
city_df2               0
barangays              0
geometry_df1           0
index                  0
source_nam             0
source_lin            21
event_id               0
event_date             0
event_time           928
event_titl             0
event_desc            30
location_d            22
location_a             0
landslide_             0
landslid_1             0
landslid_2             0
landslid_3             0
fatality_c           231
injury_cou          1055
storm_name           898
photo_link          1396
notes               1444
event_impo             0
event_im_1             0
country_na             0
country_co             0
admin_divi            20
admin_di_1             0
gazeteer_c             0


In [None]:
# Save the DataFrame to a CSV file
#df.to_csv('merge_csv_.csv', index=False) 

In [None]:
df.columns

In [None]:
# Modify the columns to drop as per your specific dataset
#df.drop(columns=['WKT_df1', 'WKT_df2', 'geometry_df1', 'geometry_df2', 'photo_link', 'notes', 'source_nam', 'source_lin'], inplace=True)

In [None]:
# Convert event date and time to datetime objects
#df['event_datetime'] = pd.to_datetime(df['event_date'] + ' ' + df['event_time'], errors='coerce')

In [None]:
df5.head()

In [None]:
df5.shape

In [None]:
df[['event_date','event_time']]

In [None]:
# Step 1: Convert 'event_date' and 'event_time' in df1, and 'Date' and 'Time of Report' in df5 to datetime
df['event_datetime'] = pd.to_datetime(df['event_date'], errors='coerce')
df5['report_datetime'] = pd.to_datetime(df5['Date'] + ' ' + df5['Time of Report'], errors='coerce')

In [None]:
df['event_datetime'].info()

In [None]:
df['event_datetime']

In [None]:
# Step 2: Sort both dataframes by datetime to facilitate the merge
df = df.sort_values(by='event_datetime')
df5 = df5.sort_values(by='report_datetime')

In [None]:
df.head()

In [None]:
# Step 3: Create a function to find the nearest report_datetime for each event_datetime
def merge_nearest_time(df1, df5):
    # Merge df1 and df5 based on the closest time match using pandas merge_asof
    merged = pd.merge_asof(df1, df5, left_on='event_datetime', right_on='report_datetime', direction='backward', tolerance=pd.Timedelta('24 hours'))

    # Clean up by dropping unnecessary columns from df5 after the merge
    merged = merged.drop(columns=['Date', 'Time of Report', 'report_datetime'])

    return merged

In [None]:
# Step 4: Merge the datasets based on the nearest time
merged_df = merge_nearest_time(df, df5)

In [None]:
merged_df.head()

In [None]:
merged_df.shape

In [None]:
merged_df.isnull().sum()

In [None]:
# Merging based on the closest matching datetime within 24 hours
merged_df1 = pd.merge_asof(
    df.sort_values('event_datetime'),
    df5.sort_values('report_datetime'),
    left_on='event_datetime',
    right_on='report_datetime',
    direction='backward',  # Report within 24 hours before the event
    tolerance=pd.Timedelta('24 hours')
)


In [None]:
merged_df1.head()

In [None]:
merged_df1.isnull().sum()

In [19]:
df['event_date']

0       07/04/2015 05:00:00 PM
1       09/27/2013 10:30:00 PM
2       06/26/2011 12:00:00 AM
3       10/08/2009 12:00:00 AM
4       03/17/2011 12:00:00 AM
                 ...          
1439    08/04/2013 03:00:00 PM
1440    05/14/2008 12:00:00 AM
1441    09/26/2009 02:30:00 PM
1442    07/30/2016 12:00:00 AM
1443    10/21/2011 03:30:00 AM
Name: event_date, Length: 1444, dtype: object

In [22]:
df['event_date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1444 entries, 0 to 1443
Series name: event_date
Non-Null Count  Dtype 
--------------  ----- 
1444 non-null   object
dtypes: object(1)
memory usage: 11.4+ KB


In [23]:
df['event_datetime'] = pd.to_datetime(df['event_date'], errors='coerce')

  df['event_datetime'] = pd.to_datetime(df['event_date'], errors='coerce')


In [24]:
df['event_datetime'][0]

Timestamp('2015-07-04 17:00:00')

In [25]:
# Extract relevant temporal features
df['event_year'] = df['event_datetime'].dt.year
df['event_month'] = df['event_datetime'].dt.month
df['event_day'] = df['event_datetime'].dt.day
df['event_hour'] = df['event_datetime'].dt.hour
df['event_weekday'] = df['event_datetime'].dt.weekday

In [27]:
df.columns

Index(['city_df1', 'region_df1', 'province_df1', 'city_code_df1', 'WKT_df1',
       'longitude', 'latitude', 'WKT_df2', 'region_df2', 'province_df2',
       'city_code_df2', 'city_df2', 'barangays', 'geometry_df1', 'index',
       'source_nam', 'source_lin', 'event_id', 'event_date', 'event_time',
       'event_titl', 'event_desc', 'location_d', 'location_a', 'landslide_',
       'landslid_1', 'landslid_2', 'landslid_3', 'fatality_c', 'injury_cou',
       'storm_name', 'photo_link', 'notes', 'event_impo', 'event_im_1',
       'country_na', 'country_co', 'admin_divi', 'admin_di_1', 'gazeteer_c',
       'gazeteer_d', 'submitted_', 'created_da', 'last_edite', 'geometry_df2',
       'system:index', 'city', 'closest_city', 'month', 'soil_moisture_am',
       'year', '.geo', 'event_datetime', 'event_year', 'event_month',
       'event_day', 'event_hour', 'event_weekday'],
      dtype='object')

In [28]:
df.head(2)

Unnamed: 0,city_df1,region_df1,province_df1,city_code_df1,WKT_df1,longitude,latitude,WKT_df2,region_df2,province_df2,city_code_df2,city_df2,barangays,geometry_df1,index,source_nam,source_lin,event_id,event_date,event_time,event_titl,event_desc,location_d,location_a,landslide_,landslid_1,landslid_2,landslid_3,fatality_c,injury_cou,storm_name,photo_link,notes,event_impo,event_im_1,country_na,country_co,admin_divi,admin_di_1,gazeteer_c,gazeteer_d,submitted_,created_da,last_edite,geometry_df2,system:index,city,closest_city,month,soil_moisture_am,year,.geo,event_datetime,event_year,event_month,event_day,event_hour,event_weekday
0,ABORLAN,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,MULTIPOLYGON (((118.511668551 9.33087966600004...,118.453062,9.500632,MULTIPOLYGON (((118.505045795 9.46650979600003...,REGION IV-B (MIMAROPA),PALAWAN,PH175301000,ABORLAN,Cabigaan,MULTIPOLYGON (((118.505045795 9.46650979600003...,1767,GMA News,http://www.gmanetwork.com/news/story/516206/ne...,7074,07/04/2015 05:00:00 PM,17:00,Km 35 and 37 at Montible-Napsan Road in Puerto...,,Km 35 and 37 at Montible-Napsan Road in Puerto...,25km,landslide,tropical_cyclone,small,unknown,0.0,0.0,Tropical Storm Egay,,,glc,7074.0,Philippines,PH,Mimaropa,4141.0,Irahuan,21.05386,07/06/2015 03:08:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,POINT (118.5489 9.6811),70_1140,Palawan,<Feature>,2.0,0.388546,2021.0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",2015-07-04 17:00:00,2015,7,4,17,5
1,ABRA DE ILOG,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,"POLYGON ((120.633791242 13.2827529140001, 120....",120.723685,13.39744,"MULTIPOLYGON (((120.861720004 13.380864967,120...",REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,PH175101000,ABRA DE ILOG,San Vicente,"MULTIPOLYGON (((120.861720004 13.380864967, 12...",3262,newsinfo.inquirer.net,http://newsinfo.inquirer.net/496657/woman-dies...,5558,09/27/2013 10:30:00 PM,22:00,"Inicbulan In Bauan, Batangas, Calabarzon",A woman died while her husband and 8-month-old...,"Inicbulan In Bauan, Batangas, Calabarzon",5km,landslide,continuous_rain,medium,unknown,1.0,1.0,,,,glc,5558.0,Philippines,PH,Calabarzon,2570.0,Inicbulan,0.1481,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,POINT (120.9839 13.8217),92_1808,Mindoro Occidental,<Feature>,12.0,0.583229,2022.0,"{""geodesic"":false,""type"":""Point"",""coordinates""...",2013-09-27 22:30:00,2013,9,27,22,4


### Columns to Drop (Irrelevant or Redundant for Prediction)

#### IDs and Codes:
- `city_code_df1`, `city_code_df2`, `event_id`, `system:index` – IDs or codes that have no predictive value.

#### URLs and Media Sources:
- `source_nam`, `source_lin`, `photo_link`, `notes` – URLs and source links that don’t contribute to the model.

#### Event Metadata (Non-Numerical and Not Predictive):
- `location_d`, `location_a`, `event_titl`, `event_impo`, `event_im_1`, `storm_name`, `event_time` – Descriptive metadata that can be ignored.

#### Repeated Geospatial Data:
- `gazeteer_c`, `gazeteer_d`, `admin_divi`, `admin_di_1`, `geometry_df1`, `geometry_df2`, `WKT_df2`, `submitted_`, `created_da`, `last_edite`, `closest_city`, `.geo` – Redundant geospatial or administrative data.

#### System Information and Miscellaneous:
- `index`, `country_na`, `country_co` – System-related information and country (since we're focusing on the Philippines, this is redundant).

#### Impact and Fatality Columns (Could be dropped if the goal is to predict landslides without post-event data):
- `fatality_c`, `injury_cou` – If these are only available after the event, they won’t be useful for prediction.


In [29]:
# List of columns to drop
columns_to_drop = [
    'WKT_df1', 'region_df2', 'province_df2', 'city_df2', 'event_date',
    'city_code_df1', 'city_code_df2', 'event_id', 'system:index',
    'source_nam', 'source_lin', 'photo_link', 'notes', 'location_d', 
    'location_a', 'event_titl', 'event_impo', 'event_im_1', 'storm_name', 
    'event_time', 'gazeteer_c', 'gazeteer_d', 'admin_divi', 'admin_di_1', 
    'geometry_df1', 'geometry_df2', 'WKT_df2', 'submitted_', 'created_da', 
    'last_edite', 'closest_city', '.geo', 'index', 'country_na', 'country_co',
    'fatality_c', 'injury_cou'
]

# Drop these columns
df_cleaned = df.drop(columns=columns_to_drop)

In [31]:
df_cleaned.head(2)

Unnamed: 0,city_df1,region_df1,province_df1,WKT_df1,longitude,latitude,region_df2,province_df2,city_df2,barangays,event_date,event_desc,landslide_,landslid_1,landslid_2,landslid_3,city,month,soil_moisture_am,year,event_datetime,event_year,event_month,event_day,event_hour,event_weekday
0,ABORLAN,REGION IV-B (MIMAROPA),PALAWAN,MULTIPOLYGON (((118.511668551 9.33087966600004...,118.453062,9.500632,REGION IV-B (MIMAROPA),PALAWAN,ABORLAN,Cabigaan,07/04/2015 05:00:00 PM,,landslide,tropical_cyclone,small,unknown,Palawan,2.0,0.388546,2021.0,2015-07-04 17:00:00,2015,7,4,17,5
1,ABRA DE ILOG,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,"POLYGON ((120.633791242 13.2827529140001, 120....",120.723685,13.39744,REGION IV-B (MIMAROPA),OCCIDENTAL MINDORO,ABRA DE ILOG,San Vicente,09/27/2013 10:30:00 PM,A woman died while her husband and 8-month-old...,landslide,continuous_rain,medium,unknown,Mindoro Occidental,12.0,0.583229,2022.0,2013-09-27 22:30:00,2013,9,27,22,4


In [35]:
df_cleaned.shape

(1444, 26)

In [33]:
df_cleaned.isnull().sum()

city_df1             0
region_df1           0
province_df1         0
WKT_df1              0
longitude            0
latitude             0
region_df2           0
province_df2         0
city_df2             0
barangays            0
event_date           0
event_desc          30
landslide_           0
landslid_1           0
landslid_2           0
landslid_3           0
city                 0
month                0
soil_moisture_am     0
year                 0
event_datetime       0
event_year           0
event_month          0
event_day            0
event_hour           0
event_weekday        0
dtype: int64

In [36]:
df_cleaned.dropna(inplace=True)

In [37]:
df_cleaned.isnull().sum()

city_df1            0
region_df1          0
province_df1        0
WKT_df1             0
longitude           0
latitude            0
region_df2          0
province_df2        0
city_df2            0
barangays           0
event_date          0
event_desc          0
landslide_          0
landslid_1          0
landslid_2          0
landslid_3          0
city                0
month               0
soil_moisture_am    0
year                0
event_datetime      0
event_year          0
event_month         0
event_day           0
event_hour          0
event_weekday       0
dtype: int64

In [43]:
df_cleaned.drop(columns=['event_datetime','event_date'], inplace=True)

In [42]:
df_cleaned.duplicated().sum()

0

In [44]:
df_cleaned['landslide_'].value_counts()

landslide_
landslide              1277
complex                  42
mudslide                 34
rock_fall                21
debris_flow              12
lahar                     9
translational_slide       7
other                     6
riverbank_collapse        6
Name: count, dtype: int64

In [None]:
# Handle missing values by filling or dropping - adjust based on domain knowledge
df.fillna(method='ffill', inplace=True)  # Forward fill as an example; you may choose other methods

In [None]:
df.isnull().sum()

In [None]:
df.head(4)

In [46]:
#Just Test
# Target Variable: Create binary target if landslide occurs within the next 24 hours
df_cleaned['target'] = np.where(df_cleaned['event_desc'].str.contains('landslide'), 1, 0)
df_cleaned['target'].value_counts()

target
1    1049
0     365
Name: count, dtype: int64

In [None]:
### Step 2: Feature Engineering ###

# Create lag features for soil moisture, precipitation, etc.
# Adjust based on the column names in your data
# Example: Creating 1-day lag for soil moisture and precipitation
df['soil_moisture_lag1'] = df['soil_moisture_am'].shift(1)
df['precipitation_lag1'] = df['landslid_2'].shift(1)  # Replace with actual precipitation column if available