# Requesting Anophelines Data

The original data cover several species of Anophelines. Due to time constraints, we study only one species: __Anopheles funestus__.

In [43]:
import pandas as pd
# Source data set: https://www.kaggle.com/jboysen/malaria-mosquito
# Only surveys whose start year is strictly greater than 1990 are kept
# (older records are discarded because of the limit of available climate data).
original_data=(
  pd.read_csv('data/Anophelines.csv', engine='python')
  .loc[lambda frame: frame['YeStart']>1990]
)
print(original_data.shape)
original_data.head()

(5342, 41)


Unnamed: 0,Country,GAUL_Admin1,GAUL_Admin2,Full_Name,Lat,Long,LatLong_Source,YeStart,YeEnd,An gambiae_complex,...,An rufipes,An coustani s.l,An ziemanni,An paludis,Adults/Larvae,Sampling_Methods,Species_Identification,Other sib species names,Other Anopheline species,Source_Title
2,Angola,Bengo,Dande,Cabungo,-8.5667,13.5,Other,2002,2002,Y,...,,,,,Adults,"IR, Bednet traps","M, PCR",,,"Calzetta M, Santolamazza F, Carrara GC, Cani P..."
7,Angola,Bengo,Icolo E Bengo,Kilunda,-8.9268,13.5878,Other,2001,2002,,...,,,,,Adults,IR,"M, CBS",,,"Boccolini D, Carrara GC, Dia I, Fortes F, Cani..."
14,Angola,Benguela,Balombo,Barragem,-12.339,14.7892,Combination,2008,2009,Y,...,,Y,Y,,Adults,CDC Light trap,M,,"An maculipalpis, An tenebrosus","Brosseau L, Drame PM, Besnard P, Toto JC, Foum..."
15,Angola,Benguela,Balombo,Candiero,-12.3665,14.761,Combination,2008,2009,Y,...,,Y,Y,,Adults,CDC Light trap,M,,"An maculipalpis, An tenebrosus","Brosseau L, Drame PM, Besnard P, Toto JC, Foum..."
16,Angola,Benguela,Balombo,Canjala,-12.3547,14.7365,Combination,2008,2009,Y,...,,Y,Y,,Adults,CDC Light trap,M,,"An maculipalpis, An tenebrosus","Brosseau L, Drame PM, Besnard P, Toto JC, Foum..."


In [44]:
# Keep the site name, coordinates, survey years and the An. funestus column.
selected_columns=['Full_Name','Lat','Long','YeStart','YeEnd','An funestus  s.l']
df=original_data[selected_columns]

# The problem is framed as binary classification:
#   'Y' (recorded occurrence) -> 1, missing value -> 0.
# NOTE(review): fillna(0) is applied to every selected column, not only the
# label column - confirm that this is intended.
mapping={'Y':1}
data=(
  df.replace({'An funestus  s.l':mapping,'An gambiae ss':mapping})
  .fillna(0)
  .rename(columns={'An funestus  s.l':'Funestus'})
)
data.to_csv('data/Mosquitoes.csv',index=False)
data.head()

Unnamed: 0,Full_Name,Lat,Long,YeStart,YeEnd,Funestus
2,Cabungo,-8.5667,13.5,2002,2002,0.0
7,Kilunda,-8.9268,13.5878,2001,2002,1.0
14,Barragem,-12.339,14.7892,2008,2009,1.0
15,Candiero,-12.3665,14.761,2008,2009,1.0
16,Canjala,-12.3547,14.7365,2008,2009,1.0


Due to time constraints, our climate features consist of __[monthly minimum temperatures, monthly maximum temperatures]__. More climate features can be added in future work.

In [None]:
# Create the 24 placeholder columns min_1..min_12 followed by max_1..max_12.
# They start out as empty strings and are filled in by the climate request loop.
climate_columns=[
  f'{feature}_{month}'
  for feature in ['min','max']
  for month in range(1,13)
]
for column in climate_columns:
  data[column]=''

data.head()

# Requesting Climate Data

After the preparation, we now add the climate data. For each row of _data_, we request and compute the monthly minimum temperatures and the monthly maximum temperatures, which gives 24 features in total.

In [None]:
import json
import os

import numpy as np
from geopandas import GeoDataFrame
from ibmpairs import paw, authentication

def _pairs_api_key():
  """Return the PAIRS API key stored in the ``PAIRS_API_KEY`` environment variable.

  The key is read from the environment so that no secret lives in the notebook.

  Raises:
    RuntimeError: if the variable is unset or empty.
  """
  api_key = os.environ.get('PAIRS_API_KEY')
  if not api_key:
    raise RuntimeError(
      'Set the PAIRS_API_KEY environment variable to your IBM PAIRS API key.'
    )
  return api_key


def _monthly_temperature_extremes(layer_id, lat, long, end_year, reducer):
  """Query one PAIRS raster layer at a point and reduce its values per calendar month.

  Args:
    layer_id: id of the raster layer to query, as a string.
    lat, long: coordinates of the query point.
    end_year: the query interval runs from Jan 1st of ``end_year - 1`` to
      Jan 1st of ``end_year``.
    reducer: callable applied to the list of values of each month
      (``min`` or ``max``).

  Returns:
    A list of 12 values (January first). A list of 12 NaN is returned when the
    year is not a number, the query fails, no point data come back, or at
    least one month has no value.
  """
  try:
    # int() keeps the timestamp well formed even if the year arrives as a float.
    start_year = int(end_year) - 1
  except (TypeError, ValueError):
    return [np.nan]*12

  credentials = authentication.OAuth2(api_key=_pairs_api_key())
  query_json = {
    "layers" : [
      {"type" : "raster", "id" : layer_id}
    ],
    "spatial" : {"type" : "point", "coordinates" : [f"{lat}", f"{long}"]},
    "temporal" : {"intervals" : [
      {"start" : f"{start_year}-01-01T00:00:00Z", "end" : f"{start_year+1}-01-01T00:00:00Z"}
    ]}
  }
  query = paw.PAIRSQuery(query_json, 'https://pairs.res.ibm.com', credentials, authType='api-key')
  try:
    query.submit()
  except Exception:
    # Best effort: a failed request yields NaN features for this location.
    # KeyboardInterrupt is deliberately NOT caught so a long run can be stopped.
    return [np.nan]*12

  # NOTE(review): assumes the point-query result is exposed as `query.vdf`
  # and may be None/empty when no data are available - confirm with ibmpairs.
  df = getattr(query, 'vdf', None)
  if df is None or len(df) == 0:
    return [np.nan]*12

  by_month = [[] for _ in range(12)]
  for timestamp, value in zip(df['timestamp'], df['value']):
    # 273.15 is subtracted as in the original code (presumably Kelvin -> Celsius).
    by_month[timestamp.month-1].append(value-273.15)

  # Every month must have at least one value, otherwise the row is unusable.
  if not all(by_month):
    return [np.nan]*12
  return [reducer(values) for values in by_month]


def get_min_temp(lat,long,end_year):
  """Return the 12 monthly minima of raster layer 49429 at (lat, long).

  The values cover the year before ``end_year``; see
  ``_monthly_temperature_extremes`` for the NaN fallback rules.
  """
  return _monthly_temperature_extremes("49429", lat, long, end_year, min)


def get_max_temp(lat,long,end_year):
  """Return the 12 monthly maxima of raster layer 49430 at (lat, long).

  The values cover the year before ``end_year``; see
  ``_monthly_temperature_extremes`` for the NaN fallback rules.
  """
  return _monthly_temperature_extremes("49430", lat, long, end_year, max)

The following process takes about __10 HOURS__ to finish !!!

In [None]:
# ERA5 is a global reanalysis data set produced by ECMWF, the European Centre for Medium-Range Weather Forecasts.
# ERA5 is the direct successor to the ERA Interim reanalysis.
# It provides global, hourly data at a resolution of 0.25 by 0.25 degrees.

# The checkpoint directory must exist before the first intermediate save.
os.makedirs('data_ERA5', exist_ok=True)

n_rows=data.shape[0]
# use the following line if you want to continue from where you paused
# for i in range(503,n_rows):
for i in range(n_rows):
  element=data.iloc[i]
  # `i` is a position, not an index label: the index of `data` is not
  # guaranteed to be 0..n-1, so look up the label before writing with .at
  # (writing to data.at[i, ...] could hit the wrong row or append a new one).
  row_label=data.index[i]
  lat=element['Lat']
  long=element['Long']
  end_year=element['YeEnd']
  min_temp=get_min_temp(lat,long,end_year)
  for month in range(12):
    data.at[row_label,f'min_{month+1}']=min_temp[month]
  max_temp=get_max_temp(lat,long,end_year)
  for month in range(12):
    data.at[row_label,f'max_{month+1}']=max_temp[month]

  # store intermediate data, in case of interrupt
  if i%50==0:
    print(f'{i}/{n_rows}')
    print('-'*20)
    data.to_csv(f'data_ERA5/data_{i}.csv',index=False)

# final result
# Climate data are not available for some locations (latitude/longitude);
# their features are NaN, so every row containing NaN is dropped.
data.dropna(inplace=True)
data.to_csv('data_ERA5/data.csv',index=False)


In [40]:
SEED=42          # fixed seed so the shuffle, and therefore the split, is reproducible
TRAIN_SIZE=4000  # number of shuffled rows assigned to the training set

data=pd.read_csv('data_ERA5/data.csv')
# shuffle and split the data into training data and testing data
data=data.sample(frac=1,random_state=SEED).reset_index(drop=True)
# Slices (unlike iloc[range(...)]) do not fail when fewer than TRAIN_SIZE rows exist.
train_data=data.iloc[:TRAIN_SIZE]
test_data=data.iloc[TRAIN_SIZE:]
print(train_data.shape)
print(test_data.shape)
train_data.to_csv('data_ERA5/train_data.csv',index=False)
test_data.to_csv('data_ERA5/test_data.csv',index=False)

(4000, 31)
(772, 31)


In [41]:
# Preview the first rows of the shuffled data set (a single head() is enough).
data.head()

Unnamed: 0.1,Unnamed: 0,Full_Name,Lat,Long,YeStart,YeEnd,Funestus,min_1,min_2,min_3,...,max_3,max_4,max_5,max_6,max_7,max_8,max_9,max_10,max_11,max_12
0,4065,Umbada,15.6397,32.3753,1994,2001,0.0,8.847192,13.334955,12.643304,...,41.430383,41.816675,43.216119,41.544672,41.178217,41.478143,40.649072,39.915735,37.358789,33.019922
1,2346,Marigiza,-4.449,39.467,2015,2015,1.0,24.637994,24.677911,24.037805,...,30.652307,29.250909,27.572107,27.581537,26.489771,27.072778,26.845422,28.566949,30.331079,31.151025
2,4757,Save River,-21.2859,32.3766,1999,2002,0.0,16.903864,19.486322,18.505945,...,31.428522,32.387842,31.177209,28.936761,29.571222,35.055566,37.962,37.090417,37.380548,33.597375
3,3718,Dimat,16.5221,-15.1086,2008,2010,1.0,12.425928,13.719843,14.607233,...,37.488062,42.968866,44.366571,43.563257,40.637689,39.512964,42.764886,43.569025,37.512659,36.312006
4,2363,Dabaso,-3.3397,40.0007,1997,1998,0.0,22.952783,22.442499,23.197229,...,31.368646,30.92196,28.841669,27.905389,27.884729,27.539331,28.542352,28.274469,29.178125,29.556177


In [42]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Full_Name,Lat,Long,YeStart,YeEnd,Funestus,min_1,min_2,min_3,...,max_3,max_4,max_5,max_6,max_7,max_8,max_9,max_10,max_11,max_12
0,4065,Umbada,15.6397,32.3753,1994,2001,0.0,8.847192,13.334955,12.643304,...,41.430383,41.816675,43.216119,41.544672,41.178217,41.478143,40.649072,39.915735,37.358789,33.019922
1,2346,Marigiza,-4.449,39.467,2015,2015,1.0,24.637994,24.677911,24.037805,...,30.652307,29.250909,27.572107,27.581537,26.489771,27.072778,26.845422,28.566949,30.331079,31.151025
2,4757,Save River,-21.2859,32.3766,1999,2002,0.0,16.903864,19.486322,18.505945,...,31.428522,32.387842,31.177209,28.936761,29.571222,35.055566,37.962,37.090417,37.380548,33.597375
3,3718,Dimat,16.5221,-15.1086,2008,2010,1.0,12.425928,13.719843,14.607233,...,37.488062,42.968866,44.366571,43.563257,40.637689,39.512964,42.764886,43.569025,37.512659,36.312006
4,2363,Dabaso,-3.3397,40.0007,1997,1998,0.0,22.952783,22.442499,23.197229,...,31.368646,30.92196,28.841669,27.905389,27.884729,27.539331,28.542352,28.274469,29.178125,29.556177
