Tanzania Water Pumps

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline

In [2]:
# Read the waterpoint feature table from the project-relative Data/ folder
# and preview its first rows.
df1 = pd.read_csv('Data/water_columns.csv')
df1.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [3]:
# Read the label table (one status_group per waterpoint id) and preview it.
df2 = pd.read_csv('Data/water_status_group.csv')
df2.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [4]:
# Attach the target label (status_group) to the feature table.
# The join key is named explicitly instead of relying on merge()'s default of
# "every column the two frames share", and validate= makes pandas raise a
# MergeError if an id is duplicated on either side rather than silently
# multiplying rows. how='right' keeps one row per labelled waterpoint.
water_pump = df1.merge(df2, on='id', how='right', validate='one_to_one')
water_pump.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [5]:
# Column dtypes and non-null counts for the merged frame.
water_pump.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [6]:
# Class balance of the target, as fractions of all rows (normalize=True).
water_pump['status_group'].value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [7]:
# Number of missing (NaN) entries in each column.
water_pump.isnull().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [8]:
# **Looking at the data, we can drop a few columns outright; others come in groups that
# will have high multicollinearity, so we can drop the redundant members of each group.**
# drop wpt_name,   num_private,    region_code/region,    subvillage/district_code/lga/ward,
# recorded_by,     scheme_management/scheme_name,      management/management_group,
# extraction_type/extraction_type_group/extraction_type_class,
# payment/payment_type,    water_quality/quality_group,     quantity/quantity_group,
# source/source_type/source_class,       waterpoint_type/waterpoint_type_group


In [9]:
# List the column labels of the merged frame.
water_pump.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [10]:
# amount_tsh: distinct-value count, two blank spacer lines, then the frequency table.
# Observation: the large majority of rows are 0.0.
#
# Background (quoted): "The vertical distance between the SSH and the SDH is called the
# Total Static Head (TSH). This is the work the pump must do in order to lift the water
# to the desired elevation. Often times pumps are needed to draw water from a source
# such as a well."
#
# TODO: maybe drop zero values
print(water_pump['amount_tsh'].nunique(), '', '', water_pump['amount_tsh'].value_counts(), sep='\n')

98


0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
8500.0          1
6300.0          1
220.0           1
138000.0        1
12.0            1
Name: amount_tsh, Length: 98, dtype: int64


In [11]:
# date_recorded: distinct-value count, two blank spacer lines, then the frequency table.
# Idea: maybe use this to calculate age?
print(water_pump['date_recorded'].nunique(), '', '', water_pump['date_recorded'].value_counts(), sep='\n')

356


2011-03-15    572
2011-03-17    558
2013-02-03    546
2011-03-14    520
2011-03-16    513
             ... 
2004-04-05      1
2011-09-20      1
2011-09-27      1
2011-09-18      1
2004-07-01      1
Name: date_recorded, Length: 356, dtype: int64


In [30]:
# funder: distinct-value count, two blank spacer lines, then the frequency table.
# Very high cardinality with a long tail of one-off names; lots of unknown/zero values.
print(water_pump['funder'].nunique(), '', '', water_pump['funder'].value_counts(), sep='\n')

1897


Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Rotary Club Australia        1
Kwa Ditriki Cho              1
Mwanaisha Mwidadi            1
Sema S                       1
Misheni                      1
Name: funder, Length: 1897, dtype: int64


In [31]:
# gps_height: distinct-value count, two blank spacer lines, then the frequency table.
# Lots of unknown/zero values (0 is by far the most frequent entry).
print(water_pump['gps_height'].nunique(), '', '', water_pump['gps_height'].value_counts(), sep='\n')

2428


 0       20438
-15         60
-16         55
-13         55
-20         52
         ...  
 2285        1
 2424        1
 2552        1
 2413        1
 2385        1
Name: gps_height, Length: 2428, dtype: int64


In [14]:
# installer: distinct-value count, two blank spacer lines, then the frequency table.
# Lots of unknown/zero values.
# Plan: impute missing installers as "unknown".
print(water_pump['installer'].nunique(), '', '', water_pump['installer'].value_counts(), sep='\n')

2145


DWE                      17402
Government                1825
RWE                       1206
Commu                     1060
DANIDA                    1050
                         ...  
Rotary club Australia        1
ambwene mwaikeke             1
Enyuati                      1
Pentecost church             1
Wizara  ya maji              1
Name: installer, Length: 2145, dtype: int64


In [15]:
# longitude: distinct-value count, two blank spacer lines, then the frequency table.
# Lots of unknown/zero values (0.0 appears far more often than any real coordinate).
print(water_pump['longitude'].nunique(), '', '', water_pump['longitude'].value_counts(), sep='\n')

57516


0.000000     1812
37.540901       2
33.010510       2
39.093484       2
32.972719       2
             ... 
37.579803       1
33.196490       1
34.017119       1
33.788326       1
30.163579       1
Name: longitude, Length: 57516, dtype: int64


In [16]:
# latitude: distinct-value count, two blank spacer lines, then the frequency table.
print(water_pump['latitude'].nunique(), '', '', water_pump['latitude'].value_counts(), sep='\n')

57517


-2.000000e-08    1812
-6.985842e+00       2
-3.797579e+00       2
-6.981884e+00       2
-7.104625e+00       2
                 ... 
-5.726001e+00       1
-9.646831e+00       1
-8.124530e+00       1
-2.535985e+00       1
-2.598965e+00       1
Name: latitude, Length: 57517, dtype: int64


In [17]:
# basin: distinct-value count, two blank spacer lines, then the frequency table.
print(water_pump['basin'].nunique(), '', '', water_pump['basin'].value_counts(), sep='\n')

9


Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64


In [18]:
# subvillage: distinct-value count, two blank spacer lines, then the frequency table.
# Has almost 20,000 distinct responses - some numbers and some words.
# Decision: drop
print(water_pump['subvillage'].nunique(), '', '', water_pump['subvillage'].value_counts(), sep='\n')

19287


Madukani       508
Shuleni        506
Majengo        502
Kati           373
Mtakuja        262
              ... 
Chimen           1
Binja            1
Nduwi B          1
Kigina Kati      1
Mheme            1
Name: subvillage, Length: 19287, dtype: int64


In [19]:
# region: distinct-value count, two blank spacer lines, then the frequency table.
print(water_pump['region'].nunique(), '', '', water_pump['region'].value_counts(), sep='\n')

21


Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64


In [20]:
# region_code: distinct-value count, two blank spacer lines, then the frequency table.
print(water_pump['region_code'].nunique(), '', '', water_pump['region_code'].value_counts(), sep='\n')

27


11    5300
17    5011
12    4639
3     4379
5     4040
18    3324
19    3047
2     3024
16    2816
10    2640
4     2513
1     2201
13    2093
14    1979
20    1969
15    1808
6     1609
21    1583
80    1238
60    1025
90     917
7      805
99     423
9      390
24     326
8      300
40       1
Name: region_code, dtype: int64


In [21]:
# district_code: distinct-value count, two blank spacer lines, then the frequency table.
# Decision: drop
print(water_pump['district_code'].nunique(), '', '', water_pump['district_code'].value_counts(), sep='\n')

20


1     12203
2     11173
3      9998
4      8999
5      4356
6      4074
7      3343
8      1043
30      995
33      874
53      745
43      505
13      391
23      293
63      195
62      109
60       63
0        23
80       12
67        6
Name: district_code, dtype: int64


In [22]:
# lga: distinct-value count, two blank spacer lines, then the frequency table.
# Decision: drop
print(water_pump['lga'].nunique(), '', '', water_pump['lga'].value_counts(), sep='\n')

125


Njombe          2503
Arusha Rural    1252
Moshi Rural     1251
Bariadi         1177
Rungwe          1106
                ... 
Moshi Urban       79
Kigoma Urban      71
Arusha Urban      63
Lindi Urban       21
Nyamagana          1
Name: lga, Length: 125, dtype: int64


In [23]:
# ward: distinct-value count, two blank spacer lines, then the frequency table.
# Decision: drop
print(water_pump['ward'].nunique(), '', '', water_pump['ward'].value_counts(), sep='\n')

2092


Igosi        307
Imalinyi     252
Siha Kati    232
Mdandu       231
Nduruma      217
            ... 
Korongoni      1
Ukata          1
Burungura      1
Uchindile      1
Matarawe       1
Name: ward, Length: 2092, dtype: int64


In [24]:
# population: distinct-value count, two blank spacer lines, then the frequency table.
# Lots of unknown/zero values.
#
# Options: maybe drop the column, or
# convert values in the 0-50 range to "unknown".
print(water_pump['population'].nunique(), '', '', water_pump['population'].value_counts(), sep='\n')

1049


0       21381
1        7025
200      1940
150      1892
250      1681
        ...  
3241        1
1960        1
1685        1
2248        1
1439        1
Name: population, Length: 1049, dtype: int64


In [25]:
# public_meeting: distinct-value count, two blank spacer lines, then the frequency table.
# Doesn't seem like it would be relevant.
print(water_pump['public_meeting'].nunique(), '', '', water_pump['public_meeting'].value_counts(), sep='\n')

2


True     51011
False     5055
Name: public_meeting, dtype: int64


In [26]:
# scheme_management: distinct-value count, two blank spacer lines, then the frequency table.
# Plan: drop the rare 'None' category.
print(water_pump['scheme_management'].nunique(), '', '', water_pump['scheme_management'].value_counts(), sep='\n')

12


VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64


In [27]:
# scheme_name: distinct-value count, two blank spacer lines, then the frequency table.
# Decision: drop the column - roughly 28,000 values are missing
# (28,166 in the isnull() summary).
print(water_pump['scheme_name'].nunique(), '', '', water_pump['scheme_name'].value_counts(), sep='\n')

2696


K                682
None             644
Borehole         546
Chalinze wate    405
M                400
                ... 
Sola               1
Nkwe               1
BL Nsherehehe      1
hekule             1
BL Kyongwa         1
Name: scheme_name, Length: 2696, dtype: int64


In [28]:
# permit: distinct-value count, two blank spacer lines, then the frequency table.
# Has 3,056 empty values (per the isnull() summary; the True/False counts sum to
# 56,344 of 59,400 rows) - maybe drop the NaNs?
print(water_pump['permit'].nunique(), '', '', water_pump['permit'].value_counts(), sep='\n')


2


True     38852
False    17492
Name: permit, dtype: int64


In [34]:
# Share of permitted vs. unpermitted waterpoints within each status group.
water_pump.groupby('status_group')['permit'].value_counts(normalize=True)

status_group             permit
functional               True      0.704276
                         False     0.295724
functional needs repair  True      0.671397
                         False     0.328603
non functional           True      0.672186
                         False     0.327814
Name: permit, dtype: float64

In [29]:
# construction_year: distinct-value count, two blank spacer lines, then the frequency table.
# Lots of unknown/zero values - 20709 rows have year 0, i.e. the year is unknown.
print(water_pump['construction_year'].nunique(), '', '', water_pump['construction_year'].value_counts(), sep='\n')

55


0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64


In [None]:
# Remove columns ruled out during exploration: the row identifier, the
# waterpoint name, num_private, recorded_by, and scheme_name (28,166 missing
# values in the isnull() summary).
# The result is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run after the columns are already gone without a KeyError.
drop_cols = ['id', 'wpt_name', 'num_private', 'recorded_by', 'scheme_name']
water_pump = water_pump.drop(columns=drop_cols, errors='ignore')
water_pump.info()

In [None]:
# Print the frequency table of every remaining column, one after another.
for _label, column_values in water_pump.items():
    print(column_values.value_counts())

In [None]:
# Frequency of each construction_year value.
water_pump['construction_year'].value_counts()

In [None]:
# Add a constant reference-year column to the frame and display the result.
# NOTE(review): 2022 is hard-coded. The date_recorded values printed earlier
# include 2004, 2011 and 2013, so consider whether age should be measured at
# recording time rather than against a fixed year - confirm.
water_pump['current_yr'] = 2022
water_pump

In [None]:
# Re-check dtypes and non-null counts after the column changes.
water_pump.info()

In [None]:
# Pump age in years relative to the reference year.
# construction_year uses 0 as a "year unknown" placeholder (20,709 rows in its
# frequency table), so subtracting it directly would label those pumps as
# 2022 years old. Series.where() masks the placeholder to NaN, which makes the
# age of those rows NaN instead of a bogus number.
water_pump['age'] = water_pump['current_yr'] - water_pump['construction_year'].where(
    water_pump['construction_year'] > 0
)
water_pump.head()

In [None]:
# Population divided by pump age, element-wise.
water_pump['pop/year'] = water_pump['population'].div(water_pump['age'])

In [None]:
# Correlation of every numeric feature with the target, strongest positive first.
# status_group holds string labels, so it never appears in DataFrame.corr() and
# `.corr().status_group` cannot work. The three labels are mapped to an ordinal
# rank (0 = functional ... 2 = non functional) and correlated against the numeric
# columns with corrwith(). Pearson correlation on a 3-level rank is only a
# rough screening signal.
status_rank = water_pump['status_group'].map(
    {'functional': 0, 'functional needs repair': 1, 'non functional': 2}
)
(
    water_pump
    .select_dtypes(include='number')
    .corrwith(status_rank)
    .sort_values(ascending=False)
)


In [None]:
# Frequency table for num_private.
# The column is removed from water_pump by the earlier drop cell, so indexing
# water_pump here raises KeyError; df1 (the unmodified feature table) still has it.
df1['num_private'].value_counts()

In [None]:
# Scatter every numeric feature against the pump status, one panel per feature.
# This cell previously referenced X_train_numeric, y_train and "sqft_living",
# none of which exist in this notebook, so it failed with NameError. It now
# works from water_pump directly and sizes the grid from the number of
# numeric columns instead of assuming a fixed 5x3 layout.
scatterplot_data = water_pump.select_dtypes(include='number')

n_panels = scatterplot_data.shape[1]
n_cols = 3
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division, at least one row

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(ncols=n_cols, nrows=n_rows,
                         figsize=(12, 2.5 * n_rows), squeeze=False)
fig.set_tight_layout(True)

panels = axes.ravel()
for ax, col in zip(panels, scatterplot_data.columns):
    ax.scatter(scatterplot_data[col], water_pump['status_group'], alpha=0.2)
    ax.set_xlabel(col)
    ax.set_ylabel("status_group")

# Remove the unused panels left over in the last row.
for ax in panels[n_panels:]:
    fig.delaxes(ax)

In [None]:
# Frequency of each funder name.
water_pump['funder'].value_counts()

In [None]:
# Number of distinct funder names (NaN is not counted by nunique's default).
water_pump['funder'].nunique()

In [None]:
# Histogram of how many waterpoints each funder accounts for
# (x axis: waterpoints per funder, y axis: number of funders).
water_pump['funder'].value_counts().plot.hist();

In [None]:
# Histogram of the amount_tsh values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# The y axis is logarithmic because 0.0 dominates the column (41,639 rows in
# its frequency table) and would otherwise flatten every other bar.
ax = water_pump['amount_tsh'].plot(kind='hist', bins=50, logy=True)
ax.set(title='Distribution of amount_tsh', xlabel='amount_tsh',
       ylabel='Number of waterpoints (log scale)');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');

In [None]:
# Histogram of the gps_height values themselves. Grouping by the column and
# calling .count() would instead histogram group sizes once for every other
# column, overlaid in a single unreadable figure.
# NOTE(review): this notebook draws the gps_height histogram in several
# consecutive cells - presumably copy-paste placeholders; retarget them to
# other columns or remove the repeats.
ax = water_pump['gps_height'].plot(kind='hist', bins=50)
ax.set(title='Distribution of gps_height', xlabel='gps_height',
       ylabel='Number of waterpoints');