In [1]:
!pip install geopy
!pip install geemap

import numpy as np
import datetime as dt
import re
from geopy.geocoders import Nominatim
import requests
import pandas as pd
from collections import defaultdict
import os
import glob

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 10.4 MB/s 
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# Load the raw Global Landslide Catalog export; the cells below read coordinates,
# dates and country names from `df`.
catalog_csv_path = '/work/data/landslide/Global_Landslide_Catalog_Export.csv'
df = pd.read_csv(catalog_csv_path)

# Preprocess before collecting data

### Preprocessing country_name

In [None]:
# Number of catalog rows that have no country name yet.
df['country_name'].isna().sum()

In [None]:
# Fill in missing country names by reverse-geocoding each row's coordinates with Nominatim.
m_c_index = df.index[df['country_name'].isnull()]

# NOTE(review): timeout=None disables geopy's request timeout, so a stalled request
# blocks this cell indefinitely -- confirm that this is intended.
geolocator = Nominatim(user_agent="geoapiExercises",timeout=None)

for i in m_c_index:
  Latitude = df.loc[i,'latitude']
  Longitude = df.loc[i,'longitude']
  location = geolocator.reverse(str(Latitude)+","+str(Longitude),language='en')
  # reverse() yields None when nothing is found at the coordinates; skip the row
  # instead of failing on `.raw`, so the remaining rows are still processed.
  if location is None:
    continue
  # Not every result carries a 'country' entry; such rows stay null and remain
  # visible in the null count of the next cell.
  country = location.raw.get('address', {}).get('country')
  if country is not None:
    df.at[i,'country_name']=country

In [None]:
# Rewrite two catalog spellings of country names
# (presumably so the later continent lookup recognises them -- TODO confirm).
country_aliases = {'Myanmar [Burma]':'Myanmar',
                   'U.S. Virgin Islands':'United States Virgin Islands'}
df = df.replace({'country_name': country_aliases})

# Re-check how many rows are still without a country name.
df['country_name'].isnull().sum()

### Preprocess date/time columns

In [None]:
# Parse the catalog's three timestamp columns into datetime64 values.
# The explicit format is month-first with a 12-hour clock; the former `dayfirst=True`
# contradicted that format and is dropped, because an explicit `format` alone
# determines how the strings are parsed.
CATALOG_DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
for date_column in ('event_date', 'created_date', 'last_edited_date'):
    df[date_column] = pd.to_datetime(df[date_column], format=CATALOG_DATE_FORMAT)

# Features Collection

### Weather

In [None]:
# Collect one day of Visual Crossing weather data per catalog row.
# `weather` maps each field of the returned day record to a list with one entry per row.
weather = defaultdict(list)
# The key is read from the environment instead of being committed with the notebook;
# the key that used to be hard-coded here should be rotated.
API = os.environ['VISUAL_CROSSING_API_KEY']

for i in range(df.shape[0]):
  print(i)
  # After the preprocessing cell `event_date` holds Timestamps, which cannot be sliced
  # like strings. pd.Timestamp() accepts both a Timestamp and a date string, so the
  # cell works whether or not the date columns have been parsed yet.
  date = pd.Timestamp(df['event_date'][i]).strftime('%Y-%m-%d')
  lat = df['latitude'][i]
  long_ = df['longitude'][i]

  response = requests.get("https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{},{}/{}?unitGroup=metric&key={}&include=obs".format(lat,long_,date,API), timeout=30)
  # Fail loudly on quota/authentication errors instead of on a confusing JSON/KeyError below.
  response.raise_for_status()
  res = response.json()
  # Only the first entry of 'days' is used (a single date is requested).
  weather_data = res['days'][0]
  for key, val in weather_data.items():
      weather[key].append(val)

In [None]:
# Drop the 'source' field, turn the collected lists into a table, put the catalog's
# event_id in the first column and save the result.
del weather['source']
weather_df = pd.DataFrame(weather)
weather_df.insert(0, 'event_id', df['event_id'])
weather_df.to_csv('/work/data/features/weather_full.csv', index=False)

### Elevation

In [None]:
def get_elevation2(lat, long_):
    """Look up the elevation of a single point through the AirMap elevation API.

    Args:
        lat: Latitude of the point.
        long_: Longitude of the point.

    Returns:
        The first entry of the ``data`` list in the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within 30 seconds.
    """
    query = ('https://api.airmap.com/elevation/v1/ele/?points={},{}'.format(lat,long_))
    # Without a timeout a single stalled request would hang the whole collection loop.
    response = requests.get(query, timeout=30)
    # Surface HTTP errors directly rather than as a KeyError on 'data'.
    response.raise_for_status()
    return response.json()['data'][0]

In [None]:
# Query the elevation of every catalog row, in row order.
# The printed row number serves as a progress log.
elevation = []
for row in range(len(df)):
    print(row)
    elevation.append(get_elevation2(df['latitude'][row], df['longitude'][row]))

In [None]:
# Save the collected elevations as a one-column table (row order = catalog order).
elevation_columns = {'elevation': elevation}
elevation_df = pd.DataFrame(elevation_columns)
elevation_df.to_csv('/work/data/features/elevation.csv', index=False)

### Continent

In [None]:
!pip install pycountry_convert
import pycountry_convert as pc

def country_to_continent(country_name):
    """Translate a country name into a continent name with pycountry_convert.

    The lookup goes country name -> alpha-2 code -> continent code -> continent name.

    Args:
        country_name: Country name as accepted by ``pc.country_name_to_country_alpha2``.

    Returns:
        The continent name produced by ``pc.convert_continent_code_to_continent_name``.
    """
    alpha2_code = pc.country_name_to_country_alpha2(country_name)
    continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
    return pc.convert_continent_code_to_continent_name(continent_code)

# Resolve the continent of every catalog row, in row order.
continents = []
for country in df['country_name']:
  print(country)
  # 'East Timor' is hard-wired to Asia instead of going through the lookup
  # (presumably because the lookup does not know that spelling -- TODO confirm).
  continent = 'Asia' if country == 'East Timor' else country_to_continent(country)
  continents.append(continent)

In [None]:
# Replace the list of continent names with a one-column table and save it.
continent_columns = {'continent': continents}
continents = pd.DataFrame(continent_columns)
continents.to_csv('/work/data/features/continents.csv', index=False)

### Season

In [None]:
def season_of_date(date,lat):
    """Return the season name for a date at a given latitude.

    Season boundaries (inclusive), for latitudes above zero:
        spring  21 March     - 20 June
        summer  21 June      - 22 September
        autumn  23 September - 20 December
        winter  every other day (21 December - 20 March)
    For every other latitude (southern hemisphere, and also 0) the opposite
    season is returned.

    The season is derived from a plain (month, day) comparison. This replaces
    building three ``pd.date_range`` indexes on every call from day-first date
    strings, which was slow for thousands of rows and depended on how pandas
    resolves ambiguous date strings.

    Args:
        date: Object exposing ``month`` and ``day`` (e.g. ``pd.Timestamp``,
            ``datetime.date``).
        lat: Latitude of the event.

    Returns:
        One of ``'spring'``, ``'summer'``, ``'autumn'``, ``'winter'``.
    """
    month_day = (date.month, date.day)

    # Season as seen from the northern hemisphere.
    if (3, 21) <= month_day <= (6, 20):
        northern_season = 'spring'
    elif (6, 21) <= month_day <= (9, 22):
        northern_season = 'summer'
    elif (9, 23) <= month_day <= (12, 20):
        northern_season = 'autumn'
    else:
        northern_season = 'winter'  # winter solstice: 21 December

    if lat > 0:  # northern hemisphere
        return northern_season

    # Southern hemisphere: seasons are reversed (summer starts on 21 December).
    opposite_season = {'spring': 'autumn',
                       'summer': 'winter',
                       'autumn': 'spring',
                       'winter': 'summer'}
    return opposite_season[northern_season]

# Assuming df has a date column of type `datetime`
# Compute the season of every catalog row from its event date and latitude.
season = [season_of_date(df['event_date'][i],df['latitude'][i]) for i in range(df.shape[0])]
season = pd.DataFrame({'season':season})
season.to_csv('/work/data/features/season.csv',index=False)
# Only the final expression of a cell is rendered, so the preview has to come last
# (it was a no-op while it sat in front of to_csv).
season

Unnamed: 0,season
0,summer
1,winter
2,summer
3,summer
4,autumn
...,...
11028,autumn
11029,spring
11030,autumn
11031,spring


### Forest

In [None]:
# Set up the Earth Engine client that the ee.* calls in this section rely on.
# NOTE(review): Authenticate() presumably needs an interactive sign-in, which would
# stop an unattended "Run All" -- confirm.
import ee
ee.Authenticate()
ee.Initialize()

In [None]:
from copy import deepcopy

# Band names to sample from the forest-change image; each gets one (initially empty)
# list of per-row values.
forest_bands = ['treecover2000', 'loss', 'gain', 'lossyear',
                'first_b30', 'first_b40', 'first_b50', 'first_b70',
                'last_b30', 'last_b40', 'last_b50', 'last_b70']
forest_data = {band: [] for band in forest_bands}

# Eight independent accumulators with the same keys, one per worker thread.
fr1, fr2, fr3, fr4, fr5, fr6, fr7, fr8 = (deepcopy(forest_data) for _ in range(8))

In [None]:
import sys

def get_fr(fr,idx):
    """Sample the forest-change image at a slice of catalog rows.

    For every row in ``range(idx[0], idx[1])`` and every key of ``fr``, the band
    named by that key is read at the row's (longitude, latitude) point and the
    value is appended to ``fr[key]``. Coordinates come from the global ``df``.

    Args:
        fr: Dict mapping band name -> list; mutated in place.
        idx: Pair ``(start, stop)`` of row labels, ``stop`` exclusive.
    """
    forest_image = ee.Image('UMD/hansen/global_forest_change_2020_v1_8')
    first_row, end_row = idx[0], idx[1]
    for row in range(first_row, end_row):
        print(row)  # progress log
        point = ee.Geometry.Point(df['longitude'][row], df['latitude'][row])
        for band in fr:
            # One server round trip per band: reduce the band at the point, then fetch the value.
            sampled = forest_image.select(band).reduceRegion(ee.Reducer.first(), point, 18).get(band)
            fr[band].append(ee.Number(sampled).getInfo())

In [None]:
import threading

# Each worker fills its own accumulator for one contiguous slice of catalog rows.
# NOTE(review): the slice bounds are hard-coded for 11033 rows -- adjust if the catalog changes.
forest_jobs = [(fr1,(0,1379)),
               (fr2,(1379,2758)),
               (fr3,(2758,4137)),
               (fr4,(4137,5516)),
               (fr5,(5516,6895)),
               (fr6,(6895,8274)),
               (fr7,(8274,9653)),
               (fr8,(9653,11033))]
forest_threads = [threading.Thread(target=get_fr, args=job) for job in forest_jobs]
# Keep the original handle names available for interactive inspection.
p1, p2, p3, p4, p5, p6, p7, p8 = forest_threads

for worker in forest_threads:
    worker.start()

# Wait for every slice to finish. Without the join, a "Run All" moves straight on to
# the merge cell while the workers are still appending, producing truncated results.
for worker in forest_threads:
    worker.join()

In [None]:
# Stitch the per-thread results back together in row order (fr1 ... fr8).
# The lists are assigned rather than extended with `+=`, so re-running this cell
# does not append a second copy of every row.
for key in forest_data:
    forest_data[key] = fr1[key]+fr2[key]+fr3[key]+fr4[key]+fr5[key]+fr6[key]+fr7[key]+fr8[key]

In [None]:
# Turn the merged lists into a table, keep only three of the sampled bands and save them.
forest_columns = ['treecover2000','loss','gain']
forest_data = pd.DataFrame(forest_data)[forest_columns]
forest_data.to_csv('/work/data/features/forest.csv', index=False)

### Soil Texture

In [None]:
# Set up the Earth Engine client for this section.
# NOTE(review): the same three lines already appear in the Forest section; presumably
# repeated so this section can be run on its own -- confirm.
import ee
ee.Authenticate()
ee.Initialize()

In [None]:
from copy import deepcopy

# Band names to sample from the soil-texture image; each gets one (initially empty)
# list of per-row values.
soil_bands = ['b0', 'b10', 'b30', 'b60', 'b100', 'b200']
soil_data = {band: [] for band in soil_bands}

# Eight independent accumulators with the same keys, one per worker thread.
s1, s2, s3, s4, s5, s6, s7, s8 = (deepcopy(soil_data) for _ in range(8))

In [None]:
def get_soil(s,idx):
    """Sample the soil-texture image at a slice of catalog rows.

    For every row in ``range(idx[0], idx[1])`` and each of the first six keys of
    ``s``, the band named by that key is read at the row's (longitude, latitude)
    point and the value is appended to ``s[key]``. Coordinates come from the
    global ``df``.

    Args:
        s: Dict mapping band name -> list; mutated in place.
        idx: Pair ``(start, stop)`` of row labels, ``stop`` exclusive.
    """
    im1 = ee.Image('OpenLandMap/SOL/SOL_TEXTURE-CLASS_USDA-TT_M/v02')
    # Take the band names from the accumulator that was passed in. The previous version
    # read them from the global `s1`, which another worker mutates concurrently and which
    # the Population Density section later rebinds to a dict with different keys.
    texture = list(s)[:6]
    for row in range(idx[0],idx[1]):
        lon, lat = df['longitude'][row],df['latitude'][row]
        p = ee.Geometry.Point(lon,lat)
        for band in texture:
            # One server round trip per band: reduce the band at the point, then fetch the value.
            data = im1.select(band).reduceRegion(ee.Reducer.first(),p,18).get(band)
            dataN = ee.Number(data).getInfo()
            s[band].append(dataN)

In [None]:
import threading

# Each worker fills its own accumulator for one contiguous slice of catalog rows.
# NOTE(review): the slice bounds are hard-coded for 11033 rows -- adjust if the catalog changes.
soil_jobs = [(s1,(0,1379)),
             (s2,(1379,2758)),
             (s3,(2758,4137)),
             (s4,(4137,5516)),
             (s5,(5516,6895)),
             (s6,(6895,8274)),
             (s7,(8274,9653)),
             (s8,(9653,11033))]
soil_threads = [threading.Thread(target=get_soil, args=job) for job in soil_jobs]
# Keep the original handle names available for interactive inspection.
p1, p2, p3, p4, p5, p6, p7, p8 = soil_threads

for worker in soil_threads:
    worker.start()

# Wait for every slice to finish. Without the join, a "Run All" moves straight on to
# the merge cell while the workers are still appending, producing truncated results.
for worker in soil_threads:
    worker.join()

In [None]:
# Stitch the per-thread results back together in row order (s1 ... s8).
# The lists are assigned rather than extended with `+=`, so re-running this cell
# does not append a second copy of every row.
# Run this before the Population Density cells, which rebind s1 ... s8.
for key in soil_data:
    soil_data[key] = s1[key]+s2[key]+s3[key]+s4[key]+s5[key]+s6[key]+s7[key]+s8[key]

In [None]:
# Give the band columns descriptive names.
soil_column_names = {'b0': 'soil_texture_0',
                     'b10': 'soil_texture_10',
                     'b30': 'soil_texture_30',
                     'b60': 'soil_texture_60',
                     'b100': 'soil_texture_100',
                     'b200': 'soil_texture_200'}
soil_df = pd.DataFrame(soil_data).rename(columns=soil_column_names)

# Numeric class code -> short label
# (presumably USDA texture-class abbreviations, e.g. Cl = clay -- TODO confirm).
replace_values = {
    1: 'Cl',
    2: 'SiCl',
    3: 'SaCl',
    4: 'ClLo',
    5: 'SiClLo',
    6: 'SaClLo',
    7: 'Lo',
    8: 'SiLo',
    9: 'SaLo',
    10: 'Si',
    11: 'LoSa',
    12: 'Sa',
}

# Apply the code -> label mapping to every column in a single call.
soil_df = soil_df.replace({column: replace_values for column in soil_df.columns})

soil_df.to_csv('/work/data/features/soil_texture.csv', index=False)

### Population Density

In [None]:
# Set up the Earth Engine client for this section.
# NOTE(review): the same three lines already appear in the Forest and Soil Texture
# sections; presumably repeated so this section can be run on its own -- confirm.
import ee
ee.Authenticate()
ee.Initialize()

In [None]:
import numpy as np
def get_ppd(s,idx):
    """Sample population density at a slice of catalog rows.

    The first five images of the population-density collection are read at each
    row's (longitude, latitude) point. The k-th value is appended to the list
    under the k-th key of ``s``. Coordinates come from the global ``df``.

    NOTE(review): this assumes the collection's first five images line up with
    the order of the keys in ``s`` -- confirm against the dataset.

    Args:
        s: Dict mapping column name -> list; mutated in place.
        idx: Pair ``(start, stop)`` of row labels, ``stop`` exclusive.
    """
    band = 'unwpp-adjusted_population_density'
    dataset = ee.ImageCollection("CIESIN/GPWv411/GPW_UNWPP-Adjusted_Population_Density")
    image_list = dataset.toList(dataset.size())
    images = [ee.Image(image_list.get(position)) for position in range(5)]

    for row in range(idx[0], idx[1]):
        point = ee.Geometry.Point(df['longitude'][row], df['latitude'][row])

        # One server round trip per image: reduce the band at the point, then fetch the value.
        densities = []
        for image in images:
            sampled = image.select(band).reduceRegion(ee.Reducer.first(), point, 18).get(band)
            densities.append(ee.Number(sampled).getInfo())

        # Route the k-th sampled value to the k-th key of the accumulator.
        # (An earlier variant, left commented out, averaged the non-null values instead.)
        for position, key in enumerate(s):
            s[key].append(densities[position])

In [None]:
from copy import deepcopy

# One (initially empty) list of per-row values for each output column.
population_columns = ['population_density_2000',
                      'population_density_2005',
                      'population_density_2010',
                      'population_density_2015',
                      'population_density_2020']
population_dense = {column: [] for column in population_columns}

# Eight independent accumulators with the same keys, one per worker thread.
# This rebinds s1 ... s8, which the Soil Texture section used before.
s1, s2, s3, s4, s5, s6, s7, s8 = (deepcopy(population_dense) for _ in range(8))

In [None]:
import threading

# Each worker fills its own accumulator for one contiguous slice of catalog rows.
# NOTE(review): the slice bounds are hard-coded for 11033 rows -- adjust if the catalog changes.
population_jobs = [(s1,(0,1379)),
                   (s2,(1379,2758)),
                   (s3,(2758,4137)),
                   (s4,(4137,5516)),
                   (s5,(5516,6895)),
                   (s6,(6895,8274)),
                   (s7,(8274,9653)),
                   (s8,(9653,11033))]
population_threads = [threading.Thread(target=get_ppd, args=job) for job in population_jobs]
# Keep the original handle names available for interactive inspection.
p1, p2, p3, p4, p5, p6, p7, p8 = population_threads

for worker in population_threads:
    worker.start()

# Wait for every slice to finish. Without the join, a "Run All" moves straight on to
# the merge cell while the workers are still appending, producing truncated results.
for worker in population_threads:
    worker.join()

In [None]:
# Stitch the per-thread results back together in row order (s1 ... s8).
# The lists are assigned rather than extended with `+=`, so re-running this cell
# does not append a second copy of every row.
for key in population_dense:
    population_dense[key] = s1[key]+s2[key]+s3[key]+s4[key]+s5[key]+s6[key]+s7[key]+s8[key]

In [None]:
# Save the merged population-density lists as a table (row order = catalog order).
population_density_csv = '/work/data/features/population_density.csv'
population_dense_df = pd.DataFrame(population_dense)
population_dense_df.to_csv(population_density_csv, index=False)

# Merge features into one dataset

In [5]:
# Reload the raw catalog and every feature table that the sections above saved.
glc = pd.read_csv('/work/data/landslide/Global_Landslide_Catalog_Export.csv')

# NOTE(review): the Weather section writes /work/data/features/weather_full.csv, but the
# table is read from /work/data/weather/ here -- confirm which copy is the intended one.
weather = pd.read_csv('/work/data/weather/weather_full.csv')
# Weather fields that are left out of the merged dataset (event_id is already in the catalog).
unused_weather_columns = [
    'solarenergy', 'preciptype', 'precipprob', 'solarradiation', 'snow', 'snowdepth', 'uvindex',
    'description', 'icon', 'datetime', 'sunrise', 'sunriseEpoch', 'sunset', 'sunsetEpoch',
    'datetimeEpoch', 'event_id',
]
weather_new = weather.drop(columns=unused_weather_columns)

elevation = pd.read_csv('/work/data/features/elevation.csv')
continents = pd.read_csv('/work/data/features/continents.csv')
season = pd.read_csv('/work/data/features/season.csv')
forest = pd.read_csv('/work/data/features/forest.csv')
soil = pd.read_csv('/work/data/features/soil_texture.csv')
ppd = pd.read_csv('/work/data/features/population_density.csv')

In [8]:
# Put the catalog and all feature tables side by side; rows are matched by index,
# so every table must be in catalog row order.
# NOTE(review): `glc` is the raw export re-read from disk, so the country-name and
# date preprocessing applied to `df` earlier is not part of this dataset -- confirm
# that this is intended.
feature_frames = [glc, weather_new, elevation, continents, season, forest, soil, ppd]
dataset = pd.concat(feature_frames, axis='columns')
dataset.to_csv('/work/data/landslide/GLC_features.csv', index=False)
print(dataset.shape)

(11033, 67)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a3127b5d-0424-48ff-88d6-e1f4925737c1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>