In [43]:
import pandas as pd

In [44]:
# Locations of the raw feature table and the raw label table.
feature_filename = "data/Features.csv"
label_filename = "data/Labels.csv"

# Read both tables into memory.
df_feat = pd.read_csv(feature_filename)
df_label = pd.read_csv(label_filename)

# Inner join on the shared 'id' column: only ids present in both tables are kept.
df = df_feat.merge(df_label, how='inner', on='id')

# Data Cleaning

## Drop columns
For more details on why these columns are dropped, please refer to the `data_understanding.ipynb` file.

In [45]:
# Remove the columns that are not carried over into the cleaned data set.
df = df.drop(
    columns=[
        "date_recorded",
        "scheme_name",
        "num_private",
        "funder",
        "wpt_name",
        "subvillage",
        "lga",
        "ward",
        "public_meeting",
        "recorded_by",
        "permit",
        "extraction_type_group",
        "extraction_type_class",
        "scheme_management",
        "management",
        "management_group",
        "payment",
        "water_quality",
        "quality_group",
        "quantity_group",
        "source_type",
        "source_class",
        "waterpoint_type_group",
    ]
)

## Remove outliers and drop null values
For more details on why I chose to remove these outliers and how I came up with the thresholds, please refer to the `data_understanding.ipynb` file.

In [46]:
df = (
    df
    # keep rows whose amount_tsh does not exceed the outlier threshold
    .loc[lambda frame: frame["amount_tsh"] <= 120]
    # keep rows whose population does not exceed the outlier threshold
    .loc[lambda frame: frame["population"] <= 10000]
    # discard every remaining row that has a missing value in any column
    .dropna()
)

# Print a summary (row count, columns, non-null counts, dtypes) of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45539 entries, 1 to 59399
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 45539 non-null  int64  
 1   amount_tsh         45539 non-null  float64
 2   gps_height         45539 non-null  int64  
 3   installer          45539 non-null  object 
 4   longitude          45539 non-null  float64
 5   latitude           45539 non-null  float64
 6   basin              45539 non-null  object 
 7   region             45539 non-null  object 
 8   region_code        45539 non-null  int64  
 9   district_code      45539 non-null  int64  
 10  population         45539 non-null  int64  
 11  construction_year  45539 non-null  int64  
 12  extraction_type    45539 non-null  object 
 13  payment_type       45539 non-null  object 
 14  quantity           45539 non-null  object 
 15  source             45539 non-null  object 
 16  waterpoint_type    455

## Reduce Categories of Categorical Features

In [47]:
df = (
    df
    # remove the "unknown" category of the quantity feature
    .query("quantity != 'unknown'")
    # these three waterpoint_type values occur in only very few instances, so their rows are removed
    .loc[lambda frame: ~frame["waterpoint_type"].isin(['improved spring', 'cattle trough', 'dam'])]
    # keep only extraction_type categories that have more than 100 instances
    .groupby("extraction_type")
    .filter(lambda group: len(group) > 100)
)

# Feature Engineering

In [48]:
def create_target_var(x):
    """Numerically encode a target label.

    Parameters
    ----------
    x : str
        One of "functional", "functional needs repair" or "non functional".

    Returns
    -------
    int or float
        0 for "functional", 0.5 for "functional needs repair",
        1 for "non functional".

    Raises
    ------
    ValueError
        If ``x`` is not one of the three known labels.
    """
    encoding = {
        "functional": 0,
        "functional needs repair": 0.5,
        "non functional": 1,
    }
    try:
        return encoding[x]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which cannot be a known label either.
        # A real exception class is required here: raising a bare string is itself
        # a TypeError in Python 3 and would hide the offending value.
        raise ValueError(f"wrong input! unexpected target label: {x!r}") from None

# Add the numeric "status" column by encoding every "status_group" label element-wise.
df["status"] = df["status_group"].map(create_target_var)

In [49]:
# Persist the cleaned frame as CSV; the row index is not written to the file.
df.to_csv(path_or_buf="data/cleaned_data.csv", index=False)