In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sn
%matplotlib notebook
from sklearn import preprocessing
import torch
from torch import nn

In [55]:
data = pd.read_csv('./data/Africa_Vectors_database_1898-2016.csv', sep = ',', encoding = "ISO-8859-1")
data.columns

Index(['Country', 'GAUL_Admin1', 'GAUL_Admin2', 'Full_Name', 'Lat', 'Long',
       'LatLong_Source', 'YeStart', 'YeEnd', 'An gambiae_complex',
       'An gambiae ss', 'SS M Form (An colluzzi or Mopti forms)',
       'SS S Form (savanah or Bamako forms)', 'An arabiensis', 'An. melas',
       'An. merus', 'An bwambae', 'An funestus  s.l',
       'An funestus s.s. (specified)', 'An rivulorum', 'An leesoni',
       'An parensis', 'An vaneedeni', 'An nili s.l', 'An moucheti s.l',
       'An pharoensis', 'An hancocki', 'An mascarensis', 'An marshalli',
       'An squamous', 'An wellcomei', 'An rufipes', 'An coustani s.l',
       'An ziemanni ', 'An paludis ', 'Adults/Larvae', 'Sampling_Methods',
       'Species_Identification', 'Other sib species names',
       'Other Anopheline species', 'Source_Title'],
      dtype='object')

In [105]:
def get_data():
    #Load the row data from the file 
    data = pd.read_csv('./data/Africa_Vectors_database_1898-2016.csv', sep = ',', encoding = "ISO-8859-1")
    
    # remove white spaces at the begining and end of column names and labels in the columns
    Region = 'GAUL_Admin2'
    data.columns = data.columns.str.strip()
    data['Country']= data['Country'].str.strip()
    data[Region]= data[Region].str.strip()
    data['Adults/Larvae']= data['Adults/Larvae'].str.strip()

    # convert the 3 columns to Upper case
    data['Country'] = data['Country'].str.upper()
    data[Region] = data[Region].str.upper()
    data['Adults/Larvae'] = data['Adults/Larvae'].str.upper()

    #change the column name of Full_Name to Region
    data = data.rename(columns={Region: 'Region'})

    #Taking the mean over the two years, round is to make sure we do not have decimals in years 
    data['Year'] = list(round(data[['YeStart', 'YeEnd']].mean(axis=1)))

    #Selecting the features to keep
    features =['Country','Region', 'Lat', 'Long','Year', 'An gambiae_complex', 'An gambiae ss', 'SS M Form (An colluzzi or Mopti forms)', 'SS S Form (savanah or Bamako forms)','An arabiensis','An. melas','An. merus','An bwambae','An funestus  s.l','An funestus s.s. (specified)','An rivulorum','An leesoni','An parensis','An vaneedeni','An nili s.l','An moucheti s.l','An pharoensis','An hancocki','An mascarensis','An marshalli','An squamous','An wellcomei','An rufipes','An coustani s.l','An ziemanni','An paludis','Adults/Larvae']

    #Returning a dataset with only the features kept
    featured_data= data[features]

    #remove records with Lat,Long missing values 
    featured_data = featured_data.dropna(axis=0, subset=['Lat'])

    #encoding the species classes 
    encoded_data = featured_data.replace(np.nan,0).replace('Y',1)

    # Reseting the  index
    encoded_data=encoded_data.reset_index(drop=True)

    #encoding the labels columns 
    # Label encoding for Country, Region, and  Adults/Larvae columns 
    le = preprocessing.LabelEncoder()
    encoded_data['Country'] = le.fit_transform(encoded_data['Country'])
    encoded_data['Adults/Larvae'] = le.fit_transform(encoded_data['Adults/Larvae'])
    encoded_data['Region'] = le.fit_transform(encoded_data['Region'].astype(str))
    
    #normalize the data
    #encoded_data=(encoded_data-encoded_data.mean())/encoded_data.std()

    
    #normalize the longitude and latitude 
#     encoded_data['Lat']=(encoded_data['Lat']-encoded_data['Lat'].mean())/encoded_data['Lat'].std()
#     encoded_data['Long']=(encoded_data['Long']-encoded_data['Long'].mean())/encoded_data['Long'].std()
#     encoded_data['Year']=(encoded_data['Year']-encoded_data['Year'].mean())/encoded_data['Year'].std()
      
    #feature scaling for year, longitude and latitude 
    encoded_data['Lat']=(encoded_data['Lat']-encoded_data['Lat'].min())/encoded_data['Lat'].max()
    encoded_data['Long']=(encoded_data['Long']-encoded_data['Long'].min())/encoded_data['Long'].max()
    encoded_data['Year']=(encoded_data['Year']-encoded_data['Year'].min())/encoded_data['Year'].max()

    #convert the year column from float to int 
    #data = data.astype({'Year':'int'})


    return encoded_data

In [107]:
# split the data into training and testing datasets 
inputs = get_data().values[:,4:] #species columns 
targets = get_data().values[:,2:4] #Lat & Long
train_inputs = inputs[0:9000]
train_targets = targets[0:9000]
test_inputs = inputs[9000:]
test_targets = targets[9000:]
#get_data().Region.unique()
# output = get_data().drop_duplicates()
# output.groupby('Region').size()
D = get_data().Region.value_counts()
T = pd.DataFrame({'index':D.index, 'Value':D.values})
T
#D.loc[D.Region.value_counts()]
# get_data().Region.value_counts()
# plt.hist(D)
T[T.Value == 1].sum()  

index    730804
Value       678
dtype: int64

In [80]:
N,D = train_inputs.shape
C = test_target.shape[1]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

NameError: name 'test_target' is not defined

In [54]:
model = nn.Sequential(
    nn.Linear(D, D*2),
    nn.ReLU(),
    nn.Linear(D*2, 100),
    nn.ReLU(),
    nn.Linear(100,50),
    nn.ReLU(),
    nn.Linear(50,C)
)
model.to(device)

Sequential(
  (0): Linear(in_features=28, out_features=56, bias=True)
  (1): ReLU()
  (2): Linear(in_features=56, out_features=100, bias=True)
  (3): ReLU()
  (4): Linear(in_features=100, out_features=50, bias=True)
  (5): ReLU()
  (6): Linear(in_features=50, out_features=2, bias=True)
)

In [41]:
#optimization paramaters
lr = 1e-3
lambda_l2 = 1e-5
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2

In [42]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")