In this workbook we assemble the required features of the data set as per the observations from the exploratory data analysis.

In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('ticks')
%matplotlib inline

In [4]:
wells_features = pd.read_csv('data/well_features.csv')
wells_labels = pd.read_csv('data/well_labels.csv')

In [5]:
wells_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

In [6]:
wells_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
id              59400 non-null int64
status_group    59400 non-null object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [8]:
wells_labels.status_group.value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

### Encoding the labels

Encoding the labels into numeric values is simple since there are only three categories. Using a dictionary in such cases makes the intent also explicit

In [10]:
status_group_to_numeric = {'functional needs repair' : 0,
                           'functional' : 1,
                           'non functional' : 2}

In [11]:
wells_labels['status'] = wells_labels['status_group'].map(status_group_to_numeric)

In [12]:
wells_labels.status.value_counts()

1    32259
2    22824
0     4317
Name: status, dtype: int64

### Encoding the features

In [16]:
wells_features.shape

(59400, 40)

In [64]:
gps_ht_bins = pd.qcut(wells_features.gps_height, 4, labels=range(4))

In [65]:
gps_ht_bins.value_counts()

0    21934
3    14850
2    14843
1     7773
Name: gps_height, dtype: int64

In [69]:
wells_features.construction_year.value_counts(ascending=True)

1966       17
1965       19
1961       21
1962       30
1964       40
1969       59
1968       77
1963       85
1967       88
1960      102
1971      145
2013      176
1973      184
1979      192
1977      202
1981      238
1987      302
1989      316
1991      324
1970      411
1976      414
1986      434
1975      437
1983      488
1988      521
2001      540
1993      608
1992      640
1997      644
1974      676
1972      708
1994      738
1982      744
1984      779
1996      811
1980      811
1985      945
1990      954
1998      966
1999      979
2005     1011
1995     1014
1978     1037
2002     1075
2012     1084
2004     1123
2011     1256
2003     1286
2006     1471
2007     1587
2000     2091
2009     2533
2008     2613
2010     2645
0       20709
Name: construction_year, dtype: int64

In [71]:
def bin_construction_yr(c):
    if c >= 1960 and c < 1970:
        return 1
    elif c >= 1971 and c < 1980:
        return 2
    elif c >= 1981 and c < 1990:
        return 3
    elif c >= 1991 and c < 2000:
        return 4
    elif c >= 2001 and c < 2010:
        return 5
    elif c >= 2011 and c < 2020:
        return 6
    else:
        return 0

In [75]:
construct_yr_bins = wells_features.construction_year.apply(bin_construction_yr)

In [76]:
construct_yr_bins.value_counts()

0    27621
5    13239
4     6724
3     4767
2     3995
6     2516
1      538
Name: construction_year, dtype: int64

In [78]:
wells_features.amount_tsh.describe()

count     59400.000000
mean        317.650385
std        2997.574558
min           0.000000
25%           0.000000
50%           0.000000
75%          20.000000
max      350000.000000
Name: amount_tsh, dtype: float64

In [79]:
def is_tsh_zero(tsh):
    if tsh == 0:
        return 1
    else:
        return 0

In [84]:
def take_log(tsh):
    if tsh == 0:
        return 0
    else:
        return np.log(tsh)

In [86]:
tsh_zero = wells_features.amount_tsh.apply(is_tsh_zero)

In [88]:
def group_funded(funder):
    if funder == 'Government Of Tanzania': return 'Govt'
    elif funder == 'Danida': return 'F1'
    elif funder == 'Hesawa': return 'F2'
    elif funder == 'Rwssp': return 'F3'
    elif funder == 'World Bank': return 'F4'
    elif funder == 'Kkkt': return 'F5'
    elif funder == 'World Vision': return 'F6'
    elif funder == 'Unicef': return 'F7'
    elif funder == 'Tasaf': return 'F8'
    elif funder == 'District Council': return 'F9'
    else: 
        return 'Oth'

In [89]:
funded_by = wells_features.funder.apply(group_funded)

In [90]:
funded_by.value_counts()

Oth     36967
Govt     9084
F1       3114
F2       2202
F3       1374
F4       1349
F5       1287
F6       1246
F7       1057
F8        877
F9        843
Name: funder, dtype: int64

In [94]:
wells_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 45 columns):
id                       59400 non-null int64
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null int64
district_code            59400 non-null int64
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
r

In [93]:
wells_features = wells_features.assign(gps_ht_bin = pd.qcut(wells_features.gps_height, 4, labels=range(4)),
                                       construct_yr_bin = wells_features.construction_year.apply(bin_construction_yr),
                                       tsh = wells_features.amount_tsh.apply(take_log),
                                       tsh_zero = wells_features.amount_tsh.apply(is_tsh_zero),
                                       funded_by = wells_features.funder.apply(group_funded))