# Package and Raw Data Imports

In [140]:
import numpy as np
import pandas as pd
from functools import reduce
import re
from collections import defaultdict
import copy
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
# Load the raw train/test feature tables and the training labels from the
# local data directory, each indexed by its `id` column.
y_train_path = '/data/training_set_labels.csv'
X_train_path = '/data/training_set_values.csv'
X_test_path = '/data/test_set_values.csv'

# 'date_recorded' is removed from BOTH feature tables so that train and test
# expose the same feature columns (previously only the training table dropped it).
X_train = pd.read_csv(X_train_path, index_col='id').drop(columns='date_recorded')
y_train = pd.read_csv(y_train_path, index_col='id')
# errors='ignore': the test file's header is not shown in this notebook, so do
# not fail if the column is absent there.
X_test = pd.read_csv(X_test_path, index_col='id').drop(columns='date_recorded', errors='ignore')

Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region',
       'region_code', 'district_code', 'lga', 'ward', 'population',
       'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name',
       'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')

# Training Data Insights

Massive class imbalance for the 'functional needs repair' group

In [None]:
# Share of each status_group label among all training labels.
y_train['status_group'].value_counts().div(len(y_train))

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

Only categorical columns have missing values

In [None]:
# Names of the X_train columns holding at least one NaN, then their dtypes.
cols_with_nulls = X_train.columns[X_train.isnull().any(axis=0)]
X_train.loc[:, cols_with_nulls].dtypes

funder               object
installer            object
subvillage           object
public_meeting       object
scheme_management    object
scheme_name          object
permit               object
dtype: object

# Categorical Data Cleaning

In [None]:
def handle_missing_data(df):
    """Return a copy of `df` in which missing text values read 'missing'.

    Cells equal to 'unknown' are replaced with 'missing' in every column, and
    NaNs in object-dtype columns are filled with 'missing'. Non-object columns
    keep their NaNs. The input frame is not modified.
    """
    relabelled = df.replace(['unknown'], 'missing')
    text_cols = relabelled.select_dtypes(include='object').columns
    filled_text = relabelled[text_cols].fillna('missing')
    relabelled[text_cols] = filled_text
    return relabelled
    
def standardize_strs(df):
    """Lower-case every object-dtype column of `df` in place and return `df`."""
    for name in df.select_dtypes(include='object').columns:
        df[name] = df[name].str.lower()
    return df

def bools_to_int(df):
    """Convert object-dtype boolean columns to numbers, flagging NaNs as -1.

    A column is treated as boolean when it has object dtype and contains at
    least one True/False value. Each such column is passed through
    `pd.to_numeric`; if it cannot be converted (for example because it also
    holds text) it is left as it was. Remaining NaNs are then filled with the
    flag value -1. The resulting dtype is whatever `pd.to_numeric` infers.

    The frame is modified in place and also returned.
    """
    def _to_numeric_or_keep(col):
        # Explicit replacement for the deprecated `errors='ignore'` option:
        # keep the original column when conversion is not possible.
        try:
            return pd.to_numeric(col)
        except (ValueError, TypeError):
            return col

    # Check the dtype first so `isin` only runs on object columns.
    bool_cols = [name for name in df.columns
                 if df[name].dtype == 'object'
                 and df[name].isin([True, False]).any()]

    for name in bool_cols:
        df[name] = _to_numeric_or_keep(df[name]).fillna(-1)
    return df

def dupl_cleaning(df):
    """Return a new frame without the 'payment' and 'waterpoint_type' columns.

    These two columns are treated by this notebook as duplicates of data held
    in other columns.
    """
    redundant_cols = ['payment', 'waterpoint_type']
    return df.drop(columns=redundant_cols)

def clean_categorical_cols(df):
    """Run the categorical cleaning steps on `df`, in order, and return the result.

    Order: drop duplicate columns -> booleans to numbers -> lower-case strings
    -> label missing values. Each step receives the previous step's output.
    """
    cleaned_df = df
    for step in (dupl_cleaning, bools_to_int, standardize_strs, handle_missing_data):
        cleaned_df = step(cleaned_df)
    return cleaned_df

# Apply the categorical cleaning pipeline to the test frame, then the train frame.
cleaned_X_test, cleaned_X_train = (
    clean_categorical_cols(frame) for frame in (X_test, X_train)
)

# Oversample

Oversample X train before imputing and handling outliers

In [None]:
def get_categorical_cols(X):
    """List the column names treated as categorical.

    Returns the four fixed numerically-coded names followed by every
    object-dtype column of `X`.
    """
    coded_cats = ['region_code', 'district_code', 'public_meeting', 'permit']
    text_cats = list(X.select_dtypes(include='object').columns)
    return coded_cats + text_cats

def oversample(X, y):
    """Oversample `X`/`y` with SMOTENC and return `(X_res, y_res)`.

    Both inputs are sorted by index first so that rows and labels are paired
    positionally (this assumes X and y share the same index values -- TODO
    confirm). Columns named by `get_categorical_cols` are flagged as
    categorical for SMOTENC. The results are whatever `fit_resample` returns
    for the raw `.values` arrays.
    """
    features = X.sort_index(level='id')
    labels = y.sort_index(level='id')

    # Boolean mask over the feature columns: True where the column is categorical.
    categorical_mask = features.columns.isin(get_categorical_cols(X))

    sampler = SMOTENC(categorical_features=categorical_mask, random_state=666)
    X_res, y_res = sampler.fit_resample(features.values, labels.values.ravel())
    return X_res, y_res

X_train_resamp, y_train_resamp = oversample(cleaned_X_train, y_train)

# Wrap the resampled arrays in DataFrames that share a fresh 0..n-1 index.
index = np.arange(len(X_train_resamp))
X_train_resamp = pd.DataFrame(data=X_train_resamp, index=index, columns=cleaned_X_train.columns)
y_train_resamp = pd.DataFrame(data=y_train_resamp, index=index, columns=['functional_group'])

# Persist both frames; the index is written as the first (unnamed) CSV column.
resampled_paths = ['/data/X_train_resamp.csv', '/data/y_train_resamp.csv']
for frame, path in zip((X_train_resamp, y_train_resamp), resampled_paths):
    frame.to_csv(path)



In [166]:
# Reload the resampled training labels and features from disk.
y_train = pd.read_csv('/data/y_train_resamp.csv')
# errors='ignore': 'date_recorded' is already dropped when the raw training
# table is first loaded, so the resampled file does not contain it and an
# unconditional drop would raise KeyError on a fresh run.
X_train = pd.read_csv('/data/X_train_resamp.csv').drop(columns='date_recorded', errors='ignore')
X_train.select_dtypes(include='object').columns

Index(['Unnamed: 0', 'functional_group'], dtype='object')


# Data Cleaning Continued

## Cleaning string values

In [126]:
def get_word_attributes(df, df_unique, word):
  """Describe how `word` relates to the other distinct values of a column.

  Args:
    df: Series of values; used for the frequency of `word`.
    df_unique: iterable of the distinct values to compare against.
    word: the value being described.

  Returns:
    dict with
      'freq'       -- fraction of entries in `df` equal to `word`,
      'substrings' -- other values that are contained in `word`,
      'parents'    -- other values that contain `word`.
  """
  others = [value for value in df_unique if value != word]
  contained = [value for value in others if value in word]
  containing = [value for value in others if value not in word and word in value]

  occurrences = int((df == word).sum())
  return {'freq': occurrences / len(df),
          'substrings': contained,
          'parents': containing}


def create_word_trees(df, cols):
  """Build, per column, a map from each value to its substring/parent info.

  Each column in `cols` is first normalised IN PLACE on `df`: NaN becomes
  'missing', text is lower-cased and every '-' is replaced by '_'. Values of
  length 2 or less are then left out of the comparison.

  Returns:
    list of `(column_name, {value: get_word_attributes(...)})` tuples, in the
    order of `cols`.
  """
  trees_by_col = []
  for name in cols:
    lowered = df[name].fillna('missing').str.lower()
    df[name] = lowered.map(lambda text: re.sub(r'()-', '_', text))

    long_enough = df[name].map(len) > 2
    values = df.loc[long_enough, name]
    distinct = values.unique()

    attributes = {word: get_word_attributes(values, distinct, word)
                  for word in distinct}
    trees_by_col.append((name, attributes))
  return trees_by_col


# Free-text columns whose values are compared against each other.
target_cols = 'funder installer wpt_name subvillage ward '.split()
# Note: create_word_trees also normalises these columns of X_train in place.
trees = create_word_trees(df=X_train, cols=target_cols)

In [160]:
def get_base_words(group):
  """Return the keys of `group` that have parents but no substrings.

  `group` maps a word to a dict with 'parents' and 'substrings' lists; the
  result keeps the iteration order of `group`.
  """
  base_words = []
  for word, attrs in group.items():
    if attrs['parents'] and not attrs['substrings']:
      base_words.append(word)
  return base_words

def get_replacements(base_words, tree):
  """Decide, for every base word, which spelling should replace which.

  For each base word the most frequent of its parents is selected (the first
  one listed wins ties). If that parent is strictly more frequent than the
  base word, the base word maps to the parent; otherwise the parent maps to
  the base word.

  Returns:
    dict of `{value_to_replace: replacement}`.
  """
  mapping = {}
  for base in base_words:
    candidates = [(name, tree[name]['freq']) for name in tree[base]['parents']]
    # Stable descending sort keeps the earliest parent first among equals.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_parent, top_freq = candidates[0]

    if top_freq > tree[base]['freq']:
      source, target = base, top_parent
    else:
      source, target = top_parent, base
    mapping[source] = target
  return mapping

def update_tree(replacements, bases, tree):
  """Remove processed base words from `tree` and apply key renames in place.

  Steps:
    1. delete every key listed in `bases` (KeyError if one is absent);
    2. strip those base words from each remaining entry's 'substrings';
    3. for each `replaced -> replaced_with` pair whose `replaced` key is still
       present, store a shallow copy of its entry under `replaced_with` and
       delete the `replaced` key.

  Args:
    replacements: dict of `{replaced: replaced_with}`.
    bases: list of base words that have just been processed.
    tree: dict of `{word: {'freq', 'substrings', 'parents'}}`; mutated.

  Returns:
    The same `tree` object.
  """
  # Built once instead of once per key.
  base_set = set(bases)

  for key in bases:
    del tree[key]

  for attrs in tree.values():
    # dict.fromkeys de-duplicates while keeping the original order, so the
    # result is deterministic (a plain set difference is not ordered).
    attrs['substrings'] = [name for name in dict.fromkeys(attrs['substrings'])
                           if name not in base_set]

  for replaced, replaced_with in replacements.items():
    if replaced in tree:
      tree[replaced_with] = tree[replaced].copy()
      del tree[replaced]
  return tree

def dispatch(df, replacement_groups, nlevels=1):
  """Collapse near-duplicate text values in `df` using precomputed word trees.

  Args:
    df: frame to clean; the listed columns are overwritten in place.
    replacement_groups: list of `(column, tree)` tuples as produced by
      `create_word_trees`. The inner 'substrings' lists of the trees are
      modified by `update_tree`, so pass a deep copy to keep the originals.
    nlevels: how many rounds of base-word replacement to run per column.

  Returns:
    The same `df` object.
  """
  trees = replacement_groups.copy()

  for col, tree in trees:
    # Same normalisation as used when the trees were built:
    # NaN -> 'missing', lower-case, '-' -> '_'.
    df[col] = df[col].fillna('missing').str.lower().map(lambda x: re.sub(r'()-', '_', x))
    updated_tree = tree.copy()
    for _ in range(nlevels):
      base_words = get_base_words(updated_tree)
      if not base_words:
        # The tree no longer changes once there are no base words left.
        break

      # Frequencies/parents are looked up in the unmodified `tree`, which
      # still holds every key.
      replacements = get_replacements(base_words, tree)

      # Values without a replacement must be kept: `Series.map(dict)` would
      # turn every value that is not a key of the dict into NaN.
      df[col] = df[col].map(lambda value: replacements.get(value, value))
      updated_tree = update_tree(replacements, base_words, updated_tree)
  return df

# Collapse near-duplicate labels in both frames using the trees built from the
# training data. X_train was reloaded from the categorically-cleaned, resampled
# file, so the raw X_test is passed through clean_categorical_cols first to
# receive the same categorical cleaning (clean_categorical_cols returns a new
# frame, so X_test itself is left untouched).
cleaned_X_train = dispatch(X_train.copy(), copy.deepcopy(trees))
cleaned_X_test = dispatch(clean_categorical_cols(X_test), copy.deepcopy(trees))

In [212]:
  def get_train_info(_train):
    """Summarise each column of `_train` for outlier handling.

    Returns a DataFrame indexed by the column names of `_train` with:
      'std' (population std, ddof=0), 'mean', 'median',
      'z_score_thresh' (5 for latitude/longitude/gps_height, 3.5 otherwise),
      'lower_thresh' (mean - z_score_thresh * std).
    """
    train_info = pd.DataFrame().assign(
        std=_train.std(ddof=0),
        mean=_train.mean(),
        median=_train.median(),
    )

    # Default cut-off, widened for the location/height columns.
    train_info['z_score_thresh'] = 3.5
    is_dist_col = train_info.index.isin(('latitude', 'longitude', 'gps_height'))
    if is_dist_col.any():
      train_info.loc[is_dist_col, 'z_score_thresh'] = 5

    train_info['lower_thresh'] = train_info['mean'] - train_info['z_score_thresh'] * train_info['std']
    return train_info

def replace_outliers(dfs):
    """Pull extreme values of the non-object columns towards the centre.

    Thresholds come from the TRAINING frame only (`get_train_info`) and are
    applied to both frames. A value above `mean + z * std` is lowered by
    `2 * std`; a value below `mean - z * std` is raised by `2 * std`.

    Args:
      dfs: `[train, test]` pair of DataFrames; both are modified in place.

    Returns:
      `[train, test]` (the same objects).
    """
    train, test = dfs
    target_cols = train.select_dtypes(exclude='object').columns
    train_info = get_train_info(train[target_cols])

    def shift_outliers(col, train_info):
      stats = train_info.loc[col.name]
      shift = 2 * stats['std']
      lower = stats['lower_thresh']
      # The upper bound mirrors the lower one around the MEAN. Negating the
      # lower bound is only correct for a zero-mean column.
      upper = stats['mean'] + stats['z_score_thresh'] * stats['std']

      too_high = col > upper
      too_low = col < lower
      # `mask` returns a new Series (upcasting integer columns when needed)
      # instead of writing into the Series handed over by `apply`.
      return col.mask(too_high, col - shift).mask(too_low, col + shift)

    train[target_cols] = train[target_cols].apply(shift_outliers, args=(train_info,), axis=0)
    test[target_cols] = test[target_cols].apply(shift_outliers, args=(train_info,), axis=0)

    return [train, test]

def scale_cols(dfs):
    """Standardise the non-object columns of both frames with train statistics.

    A StandardScaler is fitted on the training frame's non-object columns and
    applied to the same columns of the train and test frames, in place.

    Args:
      dfs: `[train_df, test_df]`.

    Returns:
      `[train_df, test_df]` (the same objects).
    """
    train_df, test_df = dfs
    numeric_cols = train_df.select_dtypes(exclude='object').columns

    scaler = StandardScaler().fit(train_df[numeric_cols])
    for frame in (train_df, test_df):
        frame[numeric_cols] = scaler.transform(frame[numeric_cols])
    return [train_df, test_df]


def imputer(dfs):
    """Replace sentinel 'missing' values with the training median.

    For every `(column, sentinel)` pair in `fill_values`, the median of the
    TRAINING values that are not the sentinel is computed and written over the
    sentinel entries of both frames. The sentinel rows are excluded from the
    median so the placeholder value does not bias the imputed value.

    Args:
      dfs: `[train, test]` pair of DataFrames; both are modified in place.

    Returns:
      `[train, test]` (the same objects).
    """
    train, test = dfs
    fill_values = [('gps_height', 0)]

    for col, missing_value in fill_values:
      observed = train.loc[train[col] != missing_value, col]
      train_median = observed.median()
      if pd.isna(train_median):
        # No observed value to impute from; leave the column untouched.
        continue

      for frame in (train, test):
        # `mask` builds a new column, so a float median can be placed into an
        # integer column without an upcasting item assignment.
        frame[col] = frame[col].mask(frame[col] == missing_value, train_median)

    return [train, test]

def clean_numeric_cols(train_df, test_df):
    """Run the numeric cleaning stages on the train/test pair, in order.

    Order: impute -> handle outliers -> scale. Each stage takes and returns a
    `[train, test]` list; the final list is returned.
    """
    frames = [train_df, test_df]
    for stage in (imputer, replace_outliers, scale_cols):
        frames = stage(frames)
    return frames

# Work on deep copies so the string-cleaned frames are left untouched; the
# 'Unnamed: 0' column (the index saved with the resampled CSV) is dropped first.
cleaned_train, cleaned_test = clean_numeric_cols(
    cleaned_X_train.drop(columns='Unnamed: 0').copy(deep=True),
    cleaned_X_test.copy(deep=True),
)

In [210]:
# Write the fully cleaned train and test feature tables to disk.
train_height_imp = '/data/cleaned_X_train_dist_imputed.csv'
test_height_imp = '/data/cleaned_X_test_dist_imputed.csv'

for frame, destination in ((cleaned_train, train_height_imp),
                           (cleaned_test, test_height_imp)):
    frame.to_csv(destination)