In [4]:
import pandas as pd
import numpy as np
%matplotlib notebook
from sklearn import preprocessing

In [12]:
def get_data():
    #Load the row data from the file 
    data = pd.read_csv('./data/Africa_Vectors_database_1898-2016.csv', sep = ',', encoding = "ISO-8859-1")
    
    # remove white spaces at the begining and end of column names and labels in the columns
    Region = 'GAUL_Admin1'
    data.columns = data.columns.str.strip()
    data['Country']= data['Country'].str.strip()
    data[Region]= data[Region].str.strip()
    data['Adults/Larvae']= data['Adults/Larvae'].str.strip()

    # convert the 3 columns to Upper case
    data['Country'] = data['Country'].str.upper()
    data[Region] = data[Region].str.upper()
    data['Adults/Larvae'] = data['Adults/Larvae'].str.upper()

    #change the column name of Full_Name to Region
    data = data.rename(columns={Region: 'Region'})

    #Taking the mean over the two years, round is to make sure we do not have decimals in years 
    data['Year'] = list(round(data[['YeStart', 'YeEnd']].mean(axis=1)))

    #Selecting the features to keep
    features =['Country','Region', 'Lat', 'Long','Year', 'An gambiae_complex', 'An gambiae ss', 'SS M Form (An colluzzi or Mopti forms)', 'SS S Form (savanah or Bamako forms)','An arabiensis','An. melas','An. merus','An bwambae','An funestus  s.l','An funestus s.s. (specified)','An rivulorum','An leesoni','An parensis','An vaneedeni','An nili s.l','An moucheti s.l','An pharoensis','An hancocki','An mascarensis','An marshalli','An squamous','An wellcomei','An rufipes','An coustani s.l','An ziemanni','An paludis','Adults/Larvae']

    #Returning a dataset with only the features kept
    featured_data= data[features]

    #remove records with Lat,Long missing values 
    featured_data = featured_data.dropna(axis=0, subset=['Lat'])

    #encoding the species classes 
    encoded_data = featured_data.replace(np.nan,0).replace('Y',1)

    # Reseting the  index
    encoded_data=encoded_data.reset_index(drop=True)

    #encoding the labels columns 
    # Label encoding for Country, Region, and  Adults/Larvae columns 
    le = preprocessing.LabelEncoder()
    encoded_data['Country'] = le.fit_transform(encoded_data['Country'])
    encoded_data['Adults/Larvae'] = le.fit_transform(encoded_data['Adults/Larvae'])
    encoded_data['Region'] = le.fit_transform(encoded_data['Region'].astype(str))
    
    #normalize the data
    #encoded_data=(encoded_data-encoded_data.mean())/encoded_data.std()

    
    #normalize the longitude and latitude 
#     encoded_data['Lat']=(encoded_data['Lat']-encoded_data['Lat'].mean())/encoded_data['Lat'].std()
#     encoded_data['Long']=(encoded_data['Long']-encoded_data['Long'].mean())/encoded_data['Long'].std()
#     encoded_data['Year']=(encoded_data['Year']-encoded_data['Year'].mean())/encoded_data['Year'].std()
      
    #feature scaling for year, longitude and latitude 
    encoded_data['Lat']=(encoded_data['Lat']-encoded_data['Lat'].min())/encoded_data['Lat'].max()
    encoded_data['Long']=(encoded_data['Long']-encoded_data['Long'].min())/encoded_data['Long'].max()
    encoded_data['Year']=(encoded_data['Year']-encoded_data['Year'].min())/encoded_data['Year'].max()
    encoded_data['Region']=(encoded_data['Region']-encoded_data['Region'].min())/encoded_data['Region'].max()

    #convert the year column from float to int 
    #data = data.astype({'Year':'int'})


    return encoded_data

In [13]:
get_data()

Unnamed: 0,Country,Region,Lat,Long,Year,An gambiae_complex,An gambiae ss,SS M Form (An colluzzi or Mopti forms),SS S Form (savanah or Bamako forms),An arabiensis,...,An hancocki,An mascarensis,An marshalli,An squamous,An wellcomei,An rufipes,An coustani s.l,An ziemanni,An paludis,Adults/Larvae
0,0,0.104478,1.168497,0.751352,0.033234,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0.104478,1.162796,0.755336,0.043155,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.104478,1.164257,0.754047,0.050595,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.104478,1.171800,0.753051,0.034722,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
4,0,0.104478,1.166569,0.773947,0.019841,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
5,0,0.104478,1.137143,0.755336,0.026786,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
6,0,0.104478,1.133342,0.758604,0.019841,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
7,0,0.104478,1.147965,0.755762,0.050595,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0.104478,1.151426,0.755691,0.032242,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
9,0,0.104478,1.140038,0.756430,0.034722,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
