# Requesting Anophelines Data

The original data cover several species of Anophelines. Due to time constraints, we study only one species: __Anopheles funestus__.

In [None]:
import pandas as pd
# Dataset source: https://www.kaggle.com/jboysen/malaria-mosquito
# Only surveys whose start year is later than 1990 are kept, because the
# available climate data is limited for earlier years.
original_data = pd.read_csv('data/Anophelines.csv', engine='python')
original_data = original_data[original_data['YeStart'].gt(1990)]
print(original_data.shape)
# Last expression of the cell: shows a preview of the filtered records.
original_data.head()

In [None]:
# Keep only the site description, coordinates, survey years and the
# occurrence flag of the species under study.
df = original_data[['Full_Name', 'Lat', 'Long', 'YeStart', 'YeEnd', 'An funestus  s.l']]

# In the original data, 'Y' means the species was observed at the site.
# The task is formulated as binary classification: 'Y' -> 1, missing -> 0.
mapping = {'Y': 1}
data = df.replace({'An funestus  s.l': mapping})

# Rows without coordinates or an end year cannot be matched to climate data.
# Drop them *before* the blanket fillna(0) below; otherwise they would be
# turned into a bogus location (lat 0, long 0) or a bogus year 0.
data = data.dropna(subset=['Lat', 'Long', 'YeEnd'])

# Every remaining NaN (in particular the occurrence flag) becomes 0.
data = data.fillna(0)
data = data.rename(columns={'An funestus  s.l': 'Funestus'})

# The index inherited from the filtered frame has gaps. Later cells address
# rows by integer position, so give the frame a clean 0..n-1 index.
data = data.reset_index(drop=True)

data.to_csv('Mosquitoes.csv', index=False)
data.head()

Due to time constraints, our climate features consist of __[monthly minimum temperatures, monthly maximum temperatures]__. More climate features can be added in future work.

In [None]:
# Reserve one column per (statistic, month) pair: min_1..min_12, max_1..max_12.
# NaN (rather than an empty string) is used as the placeholder so that the
# columns stay numeric and any row that is never filled in is still treated
# as missing by dropna().
for feature in ['min', 'max']:
  for month in range(1, 13):
    data[f'{feature}_{month}'] = float('nan')

data.head()

# Requesting Climate Data

After the preparation above, we now add the climate data. For each row of _data_, we request the temperature series and compute the monthly minimum and monthly maximum temperatures, which gives 24 features in total.

In [None]:
import json
from ibmpairs import paw, authentication
from geopandas import GeoDataFrame
import numpy as np

_PAIRS_SERVER = 'https://pairs.res.ibm.com'
# NOTE(review): best practice is not to keep secrets in source code. Set the
# PAIRS_API_KEY environment variable to override this value; the literal is
# kept only so that existing runs keep working.
_PAIRS_FALLBACK_API_KEY = '1FkApXtW3DHJWYPRLIFLWkhmADLiKFhe2uNOclT1CoU'


def _monthly_temp(lat, long, end_year, layer_id, reducer):
  """Query one PAIRS raster layer at a point and reduce its values per month.

  The query covers the interval from January 1st of ``end_year - 1`` to
  January 1st of ``end_year``.

  Args:
    lat: Latitude of the query point.
    long: Longitude of the query point.
    end_year: Year that closes the query interval; coerced to ``int`` so a
      float such as ``2000.0`` does not produce a malformed timestamp.
    layer_id: PAIRS raster layer id, as a string.
    reducer: Callable applied to the list of values of each month
      (e.g. ``min`` or ``max``).

  Returns:
    A list of 12 values (January first). A list of 12 NaN is returned when
    ``end_year`` is not a usable number, the query fails, no data comes back,
    or at least one month has no value.
  """
  import os  # local import: the file's import cell does not import os

  unavailable = [np.nan] * 12

  try:
    end_year = int(end_year)
  except (TypeError, ValueError, OverflowError):
    return unavailable

  api_key = os.environ.get('PAIRS_API_KEY') or _PAIRS_FALLBACK_API_KEY
  credentials = authentication.OAuth2(api_key=api_key)
  query_json = {
    "layers" : [
      {"type" : "raster", "id" : layer_id}
    ],
    "spatial" : {"type" : "point", "coordinates" : [f"{lat}", f"{long}"]},
    "temporal" : {"intervals" : [
      {"start" : f"{end_year-1}-01-01T00:00:00Z", "end" : f"{end_year}-01-01T00:00:00Z"}
    ]}
  }
  query = paw.PAIRSQuery(query_json, _PAIRS_SERVER, credentials, authType='api-key')
  try:
    query.submit()
  except Exception:
    # A failed query means "no climate data for this row". Exception (not a
    # bare except) is caught so that Ctrl-C still interrupts a long run.
    return unavailable

  df = query.vdf
  if df is None or len(df) == 0:
    return unavailable

  # Bucket the values by calendar month (index 0 = January).
  # The 273.15 offset is presumably a Kelvin -> Celsius conversion;
  # confirm against the layer's units.
  by_month = [[] for _ in range(12)]
  for timestamp, value in zip(df['timestamp'], df['value']):
    by_month[timestamp.month - 1].append(value - 273.15)

  # Every month must have at least one value, otherwise the row is unusable.
  if not all(by_month):
    return unavailable
  return [reducer(values) for values in by_month]


def get_min_temp(lat,long,end_year):
  """Return the 12 monthly minima of PAIRS layer 49429 at (lat, long).

  The values cover the year before ``end_year``; a list of 12 NaN is
  returned when the data is unavailable.
  """
  return _monthly_temp(lat, long, end_year, "49429", min)


def get_max_temp(lat,long,end_year):
  """Return the 12 monthly maxima of PAIRS layer 49430 at (lat, long).

  The values cover the year before ``end_year``; a list of 12 NaN is
  returned when the data is unavailable.
  """
  return _monthly_temp(lat, long, end_year, "49430", max)

In [None]:
# ERA5 is a global reanalysis data set produced by ECMWF, the European Centre
# for Medium-Range Weather Forecasts. It is the direct successor to the
# ERA-Interim reanalysis and provides global, hourly data at a resolution of
# 0.25 by 0.25 degrees.
import os

# Checkpoints and the final result are written here; create it up front so
# the first to_csv() does not fail on a missing directory.
os.makedirs('data_ERA5', exist_ok=True)

# Column positions of the 24 feature columns, looked up once. Rows are read
# with iloc (positional), so they are also written positionally with iat:
# mixing iloc[i] with the label-based at[i, ...] targets the wrong row
# whenever the index is not exactly 0..n-1.
min_cols = [data.columns.get_loc(f'min_{month}') for month in range(1, 13)]
max_cols = [data.columns.get_loc(f'max_{month}') for month in range(1, 13)]
n_rows = data.shape[0]

# To resume after an interruption, start the range at the last checkpointed
# row instead of 0, e.g. range(503, n_rows).
for i in range(n_rows):
  element = data.iloc[i]
  lat = element['Lat']
  long = element['Long']
  # int() keeps the year from being rendered as e.g. "2000.0" in the query.
  end_year = int(element['YeEnd'])

  min_temp = get_min_temp(lat, long, end_year)
  max_temp = get_max_temp(lat, long, end_year)
  for col, value in zip(min_cols, min_temp):
    data.iat[i, col] = value
  for col, value in zip(max_cols, max_temp):
    data.iat[i, col] = value

  # Store intermediate data every 50 rows, in case of an interruption.
  if i % 50 == 0:
    print(f'{i}/{n_rows}')
    print('-'*20)
    data.to_csv(f'data_ERA5/data_{i}.csv', index=False)

# Final result.
# Climate data is not available for some locations (latitude/longitude);
# their features are NaN, so all rows containing NaN are dropped.
data.dropna(inplace=True)
data.to_csv('data_ERA5/data.csv', index=False)

# Split the data into train_data (first 4000 rows) and test_data (the rest).
# Slices are used so that fewer than 4000 rows does not raise an IndexError.
train_data = data.iloc[:4000]
test_data = data.iloc[4000:]
print(train_data.shape)
print(test_data.shape)