In [963]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

df = pd.read_csv("Training_set_values.csv")
labels = pd.read_csv("Training_set_labels.csv")

df = pd.merge(df, labels, on='id')
del labels

In [964]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [965]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [966]:
missing_values_count = df.isnull().sum()
print(missing_values_count)

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [967]:
#percentage of missing data
total_cells = np.product(df.shape)
total_missing = missing_values_count.sum()

percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

1.8926665024226


In [968]:
df.funder.value_counts()

Government Of Tanzania          9084
Danida                          3114
Hesawa                          2202
Rwssp                           1374
World Bank                      1349
                                ... 
Meko Balo                          1
People From Egypt                  1
Egypt Technical Co Operation       1
Nazaleti                           1
Soko La Magomeni                   1
Name: funder, Length: 1897, dtype: int64

In [969]:
df.funder.head()

0           Roman
1         Grumeti
2    Lottery Club
3          Unicef
4     Action In A
Name: funder, dtype: object

In [970]:
#to avoid inconsistent data
df.funder = df.funder.str.lower()

In [971]:
# Keep top 5 values and set the rest to 'other'

def funder_keep_top(row):  

    if row['funder']=='government of tanzania':
        return 'gov'
    elif row['funder']=='danida':
        return 'danida'
    elif row['funder']=='hesawa':
        return 'hesawa'
    elif row['funder']=='rwssp':
        return 'rwssp'
    elif row['funder']=='world bank':
        return 'world_bank'    
    else:
        return 'other'
    
df['funder'] = df.apply(lambda row: funder_keep_top(row), axis=1)

In [972]:
# inspect 'installer'.

df.installer.value_counts()

DWE              17402
Government        1825
RWE               1206
Commu             1060
DANIDA            1050
                 ...  
WASHIMA              1
Mr Kas               1
CGI                  1
Wedeco               1
Private owned        1
Name: installer, Length: 2145, dtype: int64

In [973]:
#to avoid inconsistent data
df.installer = df.installer.str.lower()

In [974]:
df.installer.head(50)

0                    roman
1                  grumeti
2             world vision
3                   unicef
4                  artisan
5                      dwe
6                     dwsp
7                      dwe
8                water aid
9                  artisan
10                 private
11                  danida
12            world vision
13    lawatefuka water sup
14                  wedeco
15                     dwe
16                     dwe
17                     dwe
18                     dwe
19                   danid
20                     twe
21                     dwe
22                     dwe
23                     isf
24                     dwe
25                     dwe
26             kilolo star
27                     dwe
28        district council
29    lawatefuka water sup
30                   water
31                      wu
32                     dwe
33                     dwe
34                     NaN
35               not known
36      central government
3

In [975]:
# Keep top 5 values and set the rest to 'other'

def installer_keep_top(row):

    if row['installer']=='DWE'.lower():
        return 'dwe'
    elif row['installer']=='Government'.lower():
        return 'gov'
    elif row['installer']=='RWE'.lower():
        return 'rwe'
    elif row['installer']=='Commu'.lower():
        return 'commu'
    elif row['installer']=='DANIDA'.lower():
        return 'danida'
    else:
        return 'other'  

df['installer'] = df.apply(lambda row: installer_keep_top(row), axis=1)

In [976]:
# 'subvillage'.

print(df.subvillage.value_counts())

Madukani     508
Shuleni      506
Majengo      502
Kati         373
Mtakuja      262
            ... 
Kukegere       1
Mwamaduhi      1
Marti          1
Kiumbo         1
Kumyange       1
Name: subvillage, Length: 19287, dtype: int64


In [977]:
print('Number of villages: ', len(df.subvillage.value_counts()))


Number of villages:  19287


In [978]:
# since there are a lot of unique values and top valued villages doesn't cover a majority of them, let's drop this

df = df.drop('subvillage', axis=1)

In [979]:
# 'public_meeting'.

df.public_meeting.value_counts()

True     51011
False     5055
Name: public_meeting, dtype: int64

In [980]:
# fill the unknown data with a constant

df.public_meeting = df.public_meeting.fillna('Unknown')

In [981]:
# 'scheme_management' 

df.scheme_management.value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [982]:
#to avoid inconsistent data
df.scheme_management = df.scheme_management.str.lower()

In [983]:
# keep only top 5

def scheme_keep_top(row):

    if row['scheme_management']=='VWC'.lower():
        return 'vwc'
    elif row['scheme_management']=='WUG'.lower():
        return 'wug'
    elif row['scheme_management']=='Water authority'.lower():
        return 'wtr_auth'
    elif row['scheme_management']=='WUA'.lower():
        return 'wua'
    elif row['scheme_management']=='Water Board'.lower():
        return 'wtr_brd'
    else:
        return 'other'

df['scheme_management'] = df.apply(lambda row: scheme_keep_top(row), axis=1)

In [984]:
df.scheme_name.value_counts()

K                            682
None                         644
Borehole                     546
Chalinze wate                405
M                            400
                            ... 
Kashangu water supply          1
Mradi wa maji wa sed farm      1
Pefa water supply              1
Fufu                           1
Iton                           1
Name: scheme_name, Length: 2696, dtype: int64

In [985]:
len(df.scheme_name.unique())

2697

In [986]:
# there are a lots of unique factors and top 5 doesn't represent a majority of them

df = df.drop('scheme_name', axis=1)

In [987]:
# The last column containing null values is 'permit'.

df.permit.value_counts()

True     38852
False    17492
Name: permit, dtype: int64

In [988]:
# replace unknown data with a constant

df.permit = df.permit.fillna('Unknown')

In [989]:
df.apply(lambda x: sum(x.isnull()))

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
w

All the columns with null values are handled

In [990]:
# columns with string values (object type)

str_cols = df.select_dtypes(include = ['object'])
str_cols.apply(lambda x: len(x.unique()))

date_recorded              356
funder                       6
installer                    6
wpt_name                 37400
basin                        9
region                      21
lga                        125
ward                      2092
public_meeting               3
recorded_by                  1
scheme_management            6
permit                       3
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
status_group                 3
dtype: int64

In [991]:
# 'Date recorded'

df.date_recorded.describe()

count          59400
unique           356
top       2011-03-15
freq             572
Name: date_recorded, dtype: object

In [992]:
# Change the column to represent the number of days since the most recently recorded datetime

df.date_recorded = pd.to_datetime(df.date_recorded)
df.date_recorded.describe()

  after removing the cwd from sys.path.


count                   59400
unique                    356
top       2011-03-15 00:00:00
freq                      572
first     2002-10-14 00:00:00
last      2013-12-03 00:00:00
Name: date_recorded, dtype: object

In [993]:
# obtain a 'days_since_recorded' column.

df.date_recorded = pd.datetime(2013, 12, 3) - pd.to_datetime(df.date_recorded)
df.columns = ['days_since_recorded' if x=='date_recorded' else x for x in df.columns]
df.days_since_recorded = df.days_since_recorded.astype('timedelta64[D]').astype(int)
df.days_since_recorded.describe()

  This is separate from the ipykernel package so we can avoid doing imports until


count    59400.000000
mean       613.616970
std        334.216374
min          0.000000
25%        297.000000
50%        419.000000
75%        977.000000
max       4068.000000
Name: days_since_recorded, dtype: float64

In [994]:
# 'wpt_name' 

df.wpt_name.value_counts()

none                   3563
Shuleni                1748
Zahanati                830
Msikitini               535
Kanisani                323
                       ... 
Mtenga                    1
Kwa Chiku Ramadhan        1
Kwa Mkomba                1
Kwa Mzee Adrea Baru       1
Ahakasheshe Sta           1
Name: wpt_name, Length: 37400, dtype: int64

In [995]:
# large number of values and top categories doesn't have a larger factor
df = df.drop('wpt_name', axis=1)

In [996]:
df.basin.value_counts()

Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64

In [997]:
df.region.value_counts()

Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

In [998]:
# since all basin, lga, ward and region contain geographical information
df = df.drop(['region', 'lga', 'ward'], axis=1)

In [999]:
df.recorded_by.value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

In [1000]:
# All the rows have the same value 
df = df.drop('recorded_by', axis=1)

In [1001]:
# extraction_type, extraction_type_group and extraction_type_class contains similar information
df = df.drop(['extraction_type', 'extraction_type_group'], axis=1)

In [1002]:
df.management.value_counts()

vwc                 40507
wug                  6515
water board          2933
wua                  2535
private operator     1971
parastatal           1768
water authority       904
other                 844
company               685
unknown               561
other - school         99
trust                  78
Name: management, dtype: int64

In [1003]:
# This is almost identical to 'scheme_management'

df = df.drop('management', axis=1)

In [1004]:
df.management_group.value_counts()

user-group    52490
commercial     3638
parastatal     1768
other           943
unknown         561
Name: management_group, dtype: int64

In [1005]:
# this also similar to 'scheme_management'.

df = df.drop('management_group', axis=1)

In [1006]:
df.payment.value_counts()

never pay                25348
pay per bucket            8985
pay monthly               8300
unknown                   8157
pay when scheme fails     3914
pay annually              3642
other                     1054
Name: payment, dtype: int64

In [1007]:
df.payment_type.value_counts()

never pay     25348
per bucket     8985
monthly        8300
unknown        8157
on failure     3914
annually       3642
other          1054
Name: payment_type, dtype: int64

In [1008]:
# Payment and payment_type contain identical data

df = df.drop('payment', 1)

In [1009]:
df.water_quality.value_counts()

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

In [1010]:
df.quality_group.value_counts()

good        50818
salty        5195
unknown      1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

In [1011]:
# Water_quality and quality_group contain identical data
df = df.drop('quality_group', 1)

In [1012]:
df.quantity.value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity, dtype: int64

In [1013]:
df.quantity_group.value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
unknown           789
Name: quantity_group, dtype: int64

In [1014]:
# Quantity and quantity_group contain identical data.

df = df.drop('quantity_group', 1)

In [1015]:
df.source.value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     212
unknown                    66
Name: source, dtype: int64

In [1016]:
df.source_class.value_counts()

groundwater    45794
surface        13328
unknown          278
Name: source_class, dtype: int64

In [1017]:
df.source_type.value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

In [1018]:
# Source and source_type contain very similar information

df = df.drop('source', 1)

In [1019]:
# gps_height, longitude, latitude, region_code and district_code are all geographic info
# 'num_private' hasn't been given a discription

df = df.drop(['longitude', 'latitude', 'region_code', 'district_code',
             'num_private', 'id'], axis=1)

In [1020]:
str_cols.apply(lambda x: len(x.unique()))

date_recorded              356
funder                       6
installer                    6
wpt_name                 37400
basin                        9
region                      21
lga                        125
ward                      2092
public_meeting               3
recorded_by                  1
scheme_management            6
permit                       3
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
status_group                 3
dtype: int64

In [1022]:
df.construction_year.value_counts()

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64

In [1023]:
# make construction_year a categorical column

def construction_make_categorical(row):
    if int(row['construction_year']) >= 1960 and int(row['construction_year']) < 1970:
        return '60s'
    elif row['construction_year'] >= 1970 and row['construction_year'] < 1980:
        return '70s'
    elif row['construction_year'] >= 1980 and row['construction_year'] < 1990:
        return '80s'
    elif row['construction_year'] >= 1990 and row['construction_year'] < 2000:
        return '90s'
    elif row['construction_year'] >= 2000 and row['construction_year'] < 2010:
        return '00s'
    elif row['construction_year'] >= 2010:
        return '10s'
    else:
        return 'unknown'
    
df['construction_year'] = df.apply(lambda row: construction_make_categorical(row), axis=1)

In [1024]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   days_since_recorded    59400 non-null  int64  
 2   funder                 59400 non-null  object 
 3   gps_height             59400 non-null  int64  
 4   installer              59400 non-null  object 
 5   basin                  59400 non-null  object 
 6   population             59400 non-null  int64  
 7   public_meeting         59400 non-null  object 
 8   scheme_management      59400 non-null  object 
 9   permit                 59400 non-null  object 
 10  construction_year      59400 non-null  object 
 11  extraction_type_class  59400 non-null  object 
 12  payment_type           59400 non-null  object 
 13  water_quality          59400 non-null  object 
 14  quantity               59400 non-null  object 
 15  so

In [1025]:
#amount of people served by wells

df.population.describe()

count    59400.000000
mean       179.909983
std        471.482176
min          0.000000
25%          0.000000
50%         25.000000
75%        215.000000
max      30500.000000
Name: population, dtype: float64

In [1026]:
df.amount_tsh.describe()

count     59400.000000
mean        317.650385
std        2997.574558
min           0.000000
25%           0.000000
50%           0.000000
75%          20.000000
max      350000.000000
Name: amount_tsh, dtype: float64

In [1027]:
df.gps_height.describe()

count    59400.000000
mean       668.297239
std        693.116350
min        -90.000000
25%          0.000000
50%        369.000000
75%       1319.250000
max       2770.000000
Name: gps_height, dtype: float64

In [1028]:
#from sklearn.preprocessing import MinMaxScaler

features_to_normalize = ['gps_height', 'amount_tsh', 'population']
scaler = StandardScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])
df[features_to_normalize].head()

Unnamed: 0,gps_height,amount_tsh,population
0,1.041252,1.895665,-0.150399
1,1.054237,-0.10597,0.21229
2,0.025541,-0.09763,0.14866
3,-0.584751,-0.10597,-0.25857
4,-0.9642,-0.10597,-0.381587


In [1029]:
df.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,payment_type,water_quality,quantity,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,1.895665,995,other,1.041252,other,Lake Nyasa,-0.150399,True,vwc,False,90s,gravity,annually,soft,enough,spring,groundwater,communal standpipe,communal standpipe,functional
1,-0.10597,272,other,1.054237,other,Lake Victoria,0.21229,Unknown,other,True,10s,gravity,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,-0.09763,281,other,0.025541,other,Pangani,0.14866,True,vwc,True,00s,gravity,per bucket,soft,enough,dam,surface,communal standpipe multiple,communal standpipe,functional
3,-0.10597,309,other,-0.584751,other,Ruvuma / Southern Coast,-0.25857,True,vwc,True,80s,submersible,never pay,soft,dry,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,-0.10597,874,other,-0.9642,other,Lake Victoria,-0.381587,True,other,True,unknown,gravity,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [1030]:
#df = df.drop('status_group', 1)
df.to_csv('pump_train_for_models.csv', index=False) 

In [1031]:
test = pd.read_csv(r"Test_set_values.csv")

In [1032]:
test['funder'] = test['funder'].str.lower()
test['installer'] = test['installer'].str.lower()
test['scheme_management'] = test['scheme_management'].str.lower()

In [1033]:
test.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,dmdd,1996,dmdd,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,government of tanzania,1569,dwe,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,vwc,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,vwc,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,finn water,267,finn water,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,vwc,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,bruder,1260,bruder,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,water board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [1034]:
# perform the same modifications to the test set

testid = test['id']

test = test.drop(['longitude', 'latitude', 'region_code', 'district_code',
                  'num_private', 'id', 'payment', 'management_group', 'management', 
                  'extraction_type', 'extraction_type_group', 'recorded_by','region', 'lga',
                  'ward', 'wpt_name', 'scheme_name', 'subvillage', 'quantity_group',
                 'quality_group', 'source'], axis=1)

test.date_recorded = pd.datetime(2013, 12, 3) - pd.to_datetime(test.date_recorded)
test.columns = ['days_since_recorded' if x=='date_recorded' else x for x in test.columns]
test.days_since_recorded = test.days_since_recorded.astype('timedelta64[D]').astype(int)

test.permit = test.permit.fillna('Unknown')
test.public_meeting = test.public_meeting.fillna('Unknown')

test['scheme_management'] = test.apply(lambda row: scheme_keep_top(row), axis=1)
test['construction_year'] = test.apply(lambda row: construction_make_categorical(row), axis=1)
test['installer'] = test.apply(lambda row: installer_keep_top(row), axis=1)
test['funder'] = test.apply(lambda row: funder_keep_top(row), axis=1)

  # This is added back by InteractiveShellApp.init_path()


In [1035]:
test[features_to_normalize] = scaler.fit_transform(test[features_to_normalize])
test[features_to_normalize].head()

Unnamed: 0,gps_height,amount_tsh,population
0,1.939784,-0.128571,0.291567
1,1.322052,-0.128571,0.246837
2,1.319159,-0.128571,0.672837
3,-0.561525,-0.128571,0.140337
4,0.875028,0.070562,-0.264363


In [1036]:
test.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,gps_height,installer,basin,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,payment_type,water_quality,quantity,source_type,source_class,waterpoint_type,waterpoint_type_group
0,-0.128571,302,other,1.939784,other,Internal,0.291567,True,other,True,10s,other,never pay,soft,seasonal,rainwater harvesting,surface,other,other
1,-0.128571,302,gov,1.322052,dwe,Pangani,0.246837,True,vwc,True,00s,gravity,never pay,soft,insufficient,spring,groundwater,communal standpipe,communal standpipe
2,-0.128571,305,other,1.319159,other,Internal,0.672837,True,vwc,Unknown,10s,other,never pay,soft,insufficient,rainwater harvesting,surface,other,other
3,-0.128571,315,other,-0.561525,other,Ruvuma / Southern Coast,0.140337,Unknown,vwc,True,80s,other,unknown,soft,dry,shallow well,groundwater,other,other
4,0.070562,251,other,0.875028,other,Ruvuma / Southern Coast,-0.264363,Unknown,wtr_brd,True,00s,gravity,monthly,soft,enough,spring,groundwater,communal standpipe,communal standpipe


In [1037]:
testid.head()

0    50785
1    51630
2    17168
3    45559
4    49871
Name: id, dtype: int64

In [1038]:
df.shape

(59400, 20)

In [1039]:
test.shape

(14850, 19)

In [1040]:
# The train and test sets match up. We can save the test set now.

test.to_csv('pump_test_for_models.csv', index=False)

**Run the model**

In [1041]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

train = r"pump_train_for_models.csv"
test = r"pump_test_for_models.csv"

train = pd.read_csv(train)
test = pd.read_csv(test)

In [1042]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   days_since_recorded    59400 non-null  int64  
 2   funder                 59400 non-null  object 
 3   gps_height             59400 non-null  float64
 4   installer              59400 non-null  object 
 5   basin                  59400 non-null  object 
 6   population             59400 non-null  float64
 7   public_meeting         59400 non-null  object 
 8   scheme_management      59400 non-null  object 
 9   permit                 59400 non-null  object 
 10  construction_year      59400 non-null  object 
 11  extraction_type_class  59400 non-null  object 
 12  payment_type           59400 non-null  object 
 13  water_quality          59400 non-null  object 
 14  quantity               59400 non-null  object 
 15  so

In [1043]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             14850 non-null  float64
 1   days_since_recorded    14850 non-null  int64  
 2   funder                 14850 non-null  object 
 3   gps_height             14850 non-null  float64
 4   installer              14850 non-null  object 
 5   basin                  14850 non-null  object 
 6   population             14850 non-null  float64
 7   public_meeting         14850 non-null  object 
 8   scheme_management      14850 non-null  object 
 9   permit                 14850 non-null  object 
 10  construction_year      14850 non-null  object 
 11  extraction_type_class  14850 non-null  object 
 12  payment_type           14850 non-null  object 
 13  water_quality          14850 non-null  object 
 14  quantity               14850 non-null  object 
 15  so

In [1044]:
#get one hot encodings of string columns

dummy_cols = ['funder', 'installer', 'basin', 'public_meeting', 'scheme_management', 'permit',
              'construction_year', 'extraction_type_class', 'payment_type', 'water_quality',
              'quantity', 'source_type', 'source_class', 'waterpoint_type',
             'waterpoint_type_group']

train = pd.get_dummies(train, columns = dummy_cols)

train = train.sample(frac=1).reset_index(drop=True)

In [1045]:
train.shape

(59400, 95)

In [1046]:
test = pd.get_dummies(test, columns = dummy_cols)

In [1047]:
test.shape
test.head()

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_gov,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,installer_commu,installer_danida,installer_dwe,installer_gov,installer_other,installer_rwe,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,public_meeting_False,public_meeting_True,public_meeting_Unknown,scheme_management_other,scheme_management_vwc,scheme_management_wtr_auth,scheme_management_wtr_brd,scheme_management_wua,scheme_management_wug,permit_False,permit_True,permit_Unknown,construction_year_00s,construction_year_10s,construction_year_60s,...,payment_type_on failure,payment_type_other,payment_type_per bucket,payment_type_unknown,water_quality_coloured,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_dry,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_type_borehole,source_type_dam,source_type_other,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.128571,302,1.939784,0.291567,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,-0.128571,302,1.322052,0.246837,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,-0.128571,305,1.319159,0.672837,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,-0.128571,315,-0.561525,0.140337,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,0.070562,251,0.875028,-0.264363,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [1048]:
target = train.status_group
features = train.drop('status_group', axis=1)

X_train, X_val, y_train, y_val = train_test_split(features, target, train_size=0.8)

In [1049]:
X_train.head()

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,funder_danida,funder_gov,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,installer_commu,installer_danida,installer_dwe,installer_gov,installer_other,installer_rwe,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,public_meeting_False,public_meeting_True,public_meeting_Unknown,scheme_management_other,scheme_management_vwc,scheme_management_wtr_auth,scheme_management_wtr_brd,scheme_management_wua,scheme_management_wug,permit_False,permit_True,permit_Unknown,construction_year_00s,construction_year_10s,construction_year_60s,...,payment_type_on failure,payment_type_other,payment_type_per bucket,payment_type_unknown,water_quality_coloured,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_dry,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_type_borehole,source_type_dam,source_type_other,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
54088,-0.10597,278,1.267767,-0.379466,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3582,-0.10597,994,-0.9642,-0.169488,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
26457,-0.022569,287,0.810409,1.209155,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
50869,-0.10597,873,-0.9642,-0.381587,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
44722,-0.08929,276,1.194186,0.14866,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [1050]:
display(train.head())

Unnamed: 0,amount_tsh,days_since_recorded,gps_height,population,status_group,funder_danida,funder_gov,funder_hesawa,funder_other,funder_rwssp,funder_world_bank,installer_commu,installer_danida,installer_dwe,installer_gov,installer_other,installer_rwe,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,public_meeting_False,public_meeting_True,public_meeting_Unknown,scheme_management_other,scheme_management_vwc,scheme_management_wtr_auth,scheme_management_wtr_brd,scheme_management_wua,scheme_management_wug,permit_False,permit_True,permit_Unknown,construction_year_00s,construction_year_10s,...,payment_type_on failure,payment_type_other,payment_type_per bucket,payment_type_unknown,water_quality_coloured,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_dry,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_type_borehole,source_type_dam,source_type_other,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.10597,865,-0.9642,-0.381587,non functional,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,-0.10597,277,-0.9642,-0.381587,functional,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
2,-0.10597,298,0.933044,0.466808,non functional,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,-0.072609,283,-0.117293,0.14866,functional,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
4,-0.10597,309,1.375975,-0.379466,non functional,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [951]:
'''from sklearn.decomposition import PCA

pca = PCA(n_components=50)
X_train = pca.fit_transform(X_train)'''

In [1051]:
X_train.shape

(47520, 94)

TEST model----------------------

In [1052]:
from sklearn.ensemble import RandomForestClassifier
modelRFC = RandomForestClassifier(n_estimators=1000)

In [1053]:
from xgboost import XGBClassifier
modelXGB = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree', nrounds = 'min.error.idx', 
                      num_class = 4, maximize = False, eval_metric = 'merror', eta = .2,
                      max_depth = 14, colsample_bytree = .4)

In [1054]:
modelRFC.fit(X_train, y_train)
X_train.info()
importances = modelRFC.feature_importances_
importances

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 54088 to 12486
Data columns (total 94 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   amount_tsh                                   47520 non-null  float64
 1   days_since_recorded                          47520 non-null  int64  
 2   gps_height                                   47520 non-null  float64
 3   population                                   47520 non-null  float64
 4   funder_danida                                47520 non-null  uint8  
 5   funder_gov                                   47520 non-null  uint8  
 6   funder_hesawa                                47520 non-null  uint8  
 7   funder_other                                 47520 non-null  uint8  
 8   funder_rwssp                                 47520 non-null  uint8  
 9   funder_world_bank                            47520 non-null  uint8  

array([2.93728044e-02, 1.43171512e-01, 1.26252918e-01, 7.64400131e-02,
       2.68178660e-03, 8.88857352e-03, 2.87796501e-03, 1.11152487e-02,
       1.79159633e-03, 2.85109630e-03, 1.11294651e-03, 8.50891384e-04,
       1.11470988e-02, 3.75083034e-03, 1.06578425e-02, 3.36844122e-03,
       7.68206806e-03, 4.64421510e-03, 4.39560151e-03, 7.43340486e-03,
       8.22670507e-03, 6.16994129e-03, 5.44398923e-03, 4.89843698e-03,
       5.41106723e-03, 6.55708182e-03, 7.72791637e-03, 3.85135235e-03,
       8.71037211e-03, 1.09495871e-02, 4.42703785e-03, 3.20632188e-03,
       2.88662093e-03, 5.53770812e-03, 9.81821968e-03, 1.00313278e-02,
       3.28837605e-03, 1.09213321e-02, 8.66597567e-03, 1.41310266e-03,
       8.42034885e-03, 7.66623268e-03, 7.32640325e-03, 6.03409875e-03,
       9.31803151e-03, 6.71646522e-03, 3.60798345e-03, 2.00229288e-02,
       1.03666158e-03, 5.76060144e-03, 3.17593223e-04, 3.66229393e-03,
       6.82572514e-03, 1.49249728e-02, 4.12790535e-03, 1.60996082e-03,
      

In [1055]:
predictionsRFC = modelRFC.predict(test)

output = pd.DataFrame({'id':testid, 'status_group':predictionsRFC})
output.to_csv('my_submission_rfc1.csv', index=False)
print("Submission was successfully saved!")

Submission was successfully saved!


In [1056]:
from sklearn.model_selection import cross_val_score
cross_val_score(modelRFC, X_train, y_train, cv=3)

array([0.78516414, 0.7822601 , 0.77948232])

In [958]:
modelXGB.fit(X_train, y_train)
importances = model.feature_importances_
importances
cross_val_score(modelXGB, X_train, y_train, cv=3)

array([0.78289141, 0.78137626, 0.78055556])

In [None]:
importances = model.feature_importances_
importances

In [961]:
'''pcaTest = PCA(n_components=50)
test = pcaTest.fit_transform(test)'''

In [962]:
predictionsXGB = modelXGB.predict(test)

output = pd.DataFrame({'id':testid, 'status_group':predictionsXGB})
output.to_csv('my_submission_xgb2.csv', index=False)
print("Submission was successfully saved!")

Submission was successfully saved!


--------------------------------------------------

In [None]:
from sklearn.ensemble import RandomForestClassifier

modelrfc = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)
modelrfc.fit(X_train,y_train)
#model.evaluate(X_val, y_val, batch_size=32, verbose=2)

predictions = modelrfc.predict(test)

output = pd.DataFrame({'id':testid, 'status_group':predictions})
output.to_csv('my_submission_rfc.csv', index=False)
print("Submission was successfully saved!")

In [None]:

test.shape

In [None]:
X_train.shape

In [None]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(modelrfc, X_val,y_val, cv=3))

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier, plot_tree

model2 = XGBClassifier(objective = 'multi:softmax',
                      max_depth = 14, colsample_bytree = .4)

model2.fit(X_train,y_train)
preds = model2.predict(test)

output = pd.DataFrame({'id':testid, 'status_group':predictions})
output.to_csv('XGBoost_result.csv', index=False)
print("Submission was successfully saved!")

In [None]:
test.shape

In [None]:
y_train.shape

In [None]:
print(cross_val_score(model2, X_val,y_val, cv=3))