In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from itertools import chain
from functools import reduce




train_height_imp = pd.read_csv('/data/cleaned_X_train_dist_imputed.csv')
test_height_imp = pd.read_csv('/data/cleaned_X_test_dist_imputed.csv')
y = pd.read_csv('/data/y_train_resamp.csv')
train_height_imp['y'] = y['functional_group']
train_height_imp['id'] = np.arange(len(train_height_imp))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Feature Creation

## Bin Construction Year
Bin the construction years into intervals which span 3 years

In [None]:
def bin_construct_year(df):
  # Bins the construction year into 3 year intervals
  
  # Hardocde 1963 because some rows have a construction year of '0' 
  year_bins = pd.interval_range(1963,df['construction_year'].max(),freq=3)
  df['construct_year_bin'] = pd.cut(df['construction_year'], year_bins)

  # Creates labels indicating the construction year to replace the raw interval dtype
  labels = [f'{str(bin.left)}-{str(bin.right)}' for bin in year_bins]
  df['construct_year_bin']  = df['construct_year_bin'].map(dict(zip(year_bins,labels))).astype(str, errors='ignore')
  # Fills any rows that weren't binned (like those equal to '0') as missing
  df.loc[df['construct_year_bin'].isna(), 'construct_year_bin'] = 'missing'
  return df

## Create KNN columns

For each point in the training data:
 

1.   Find the nearest neighbors using 3-dim Euclidean distance
2.  Determine percentage of neighbors that have a certain condition for each point






In [3]:
def cross_join_dfs(df_1, df_2, suffixes=['_test', '_cond_probs']):
    # Pair every row in df_1 with every row in df_2, like a cross join in SQL or cartesian product
    df_1['merge_key'], df_2['merge_key'] = 1,1

    cross_joined = df_1.merge(df_2,on='merge_key', suffixes=suffixes).drop(columns='merge_key')
    return cross_joined

def filter_dfs(dfs, filter_for):
  filtered = [df[df.columns[df.columns.isin(filter_for)]].copy() 
                          for df in dfs]
  return filtered                          

def merge_X_neighbor_cols(X_neighbor, X):
  if not 'id' in X.columns:
    X['id'] = np.arange(len(X))
  combined = X.iloc[:750].merge(X_neighbor.reset_index()[['func_neighbor_perc'
                                                ,'non_func_neighbor_perc'
                                                ,'needs_repair_neighbor_perc'
                                                ,'id']]
                                                  ,on='id')
  return combined

In [None]:
def calc_y_prop(group,k):
  # Find the K nearest points and counts the number of times our target values appear
  
  y_prop = group.nlargest(k, 'distance')['y'].value_counts()/k

  return y_prop.reset_index()  
  
  

def calc_distance(df):
  # Compute the distances between the selected point and all points in X Train
  df['distance'] = np.sqrt((df['latitude_1']-df['latitude'])**2  
                                    +(df['longitude_1']-df['longitude'])**2  
                                    +(df['gps_height_1']-df['gps_height'])**2)
  
  # Remove the points that are in the same exact location of our isolated point
  df = df[df['distance']!=0]
  return df

def reformat(dfs, X):
  df = pd.concat(dfs)
  df.columns = ['y_pred', 'proportion'] 
  df = df.reset_index(level='id')
  df['neighbor_cols'] = df['y_pred'].map({'functional': 'func_neighbor_perc'
                                                ,'non functional': 'non_func_neighbor_perc'
                                                ,'functional needs repair': 'needs_repair_neighbor_perc'})
  res = df.pivot(values='proportion',index='id', columns='neighbor_cols').fillna(0)
  return merge_X_neighbor_cols(res,X)

def get_knn_conditions(X_train,k):
    X_train['id'] = np.arange(len(X_train))
    # Filters X_train_neighbors and X_test for essential columns to lighten the RAM load
    filter_for = ['func_neighbor_perc'
                  ,'non_func_neighbor_perc'
                  ,'needs_repair_neighbor_perc'
                ,'latitude'
                ,'longitude'
                ,'id'
                ,'y'
                ,'gps_height']
    filtered_train = filter_dfs([X_train], filter_for)[0]
    
    # Separates X_test into batches and dispatches them, along with all X_neighbor points 
    processed = []
    # Filter out duplicate values created with oversampling
    filt_no_dupl = filtered_train.drop_duplicates(subset=['latitude', 'longitude', 'gps_height']).copy()
    batch_indexes = pd.interval_range(0,len(X_train), freq = 250)
    for interval in batch_indexes:
        train_batch = filtered_train.iloc[interval.left:interval.right,:].copy()
        cross_joined = cross_join_dfs(train_batch, filt_no_dupl, suffixes=['', '_1'])
        with_distance = calc_distance(cross_joined)
        
        res = with_distance.groupby('id').apply(calc_y_prop, k)
        processed.append(res)

    res = reformat(processed, X_train)
    return res

k=5
X_train_with_neighbor = get_knn_conditions(train_height_imp,k)

In [14]:
def calc_precision(df,value):
  num = len(df[(df['y'] == 'functional') & (df['pred_y'] == 'functional')])
  denom = len(df[df['pred_y'] == 'functional'])
  return  num/denom  
def get_pred(df):
  df.loc[(df['func_neighbor_perc'] > df['non_func_neighbor_perc']) & (df['func_neighbor_perc'] > df['needs_repair_neighbor_perc']), 'pred_y'] = 'functional'
  df.loc[(df['func_neighbor_perc'] < df['non_func_neighbor_perc']) & (df['non_func_neighbor_perc'] > df['needs_repair_neighbor_perc']), 'pred_y'] = 'non functional'
  df.loc[(df['func_neighbor_perc'] < df['needs_repair_neighbor_perc']) & (df['non_func_neighbor_perc'] < df['needs_repair_neighbor_perc']), 'pred_y'] = 'functional needs repair'
  print(f"Precision: functional {calc_precision(df,'functional')} non functional {calc_precision(df,'non functional')} needs_repair {calc_precision(df,'functional needs repair')}")

get_pred(neighbor_data)

Precision: functional 1.0 non functional 1.0 needs_repair 1.0


In [None]:
def find_closest_train_perc(test_data, cond_probs):
    joined = cross_join_dfs(test_data, cond_probs)
    
    # Compute the euclidean distance between the X_test_point and X_train_point pairs
    joined['distance'] = np.sqrt((joined['latitude_test']-joined['latitude_cond_probs'])**2  
                                      +(joined['longitude_test']-joined['longitude_cond_probs'])**2  
                                      +(joined['gps_height_test']-joined['gps_height_cond_probs'])**2)
    
    # For each test id, return the pairing with the closest distance
    joined_filt = joined.loc[joined.groupby('id')['distance'].idxmin()]
    return joined_filt

def get_test_neighbor_perc(X_neighbor, X_test):
    # Decided on a vectorized batch processing approach because get_knn_cond was EXTREMELY slow
    
    # Filters X_train_neighbors and X_test for essential columns to lighten the RAM load
    filter_for = ['func_neighbor_perc'
                  ,'non_func_neighbor_perc'
                  ,'needs_repair_neighbor_perc'
                ,'latitude'
                ,'longitude'
                ,'id'
                ,'gps_height']
    X_neighbor_filt, X_test_filt = filter_dfs(X_neighbor, X_test, filter_for)
    
    # Filter out duplicate values created with oversampling
    X_neighbor_filt = X_neighbor_filt.drop_duplicates()
    
    # Separates X_test into batches and dispatches them, along with all X_neighbor points 
    processed = []
    batch_indexes = pd.interval_range(0,len(X_test_filt), freq = 350)
    for interval in batch_indexes:
        X_test_batch = X_test_filt.iloc[interval.left:interval.right,:].copy()
        processed.append(find_closest_train_perc(X_test_batch, X_neighbor_filt))
        
    # Need to drop the NaN rows that overindexing with iloc creates 
    full_df = pd.concat(processed).dropna()
    return full_df

test_neighbor_data = get_test_neighbor_perc(X_train_with_neighbor, X_test_raw)

In [None]:
X_test_with_neighbor = merge_X_neighbor_cols(X_test_with_neighbor, test_height_imp)
#X_test = bin_construct_year(X_test)

# OHE and Dropping Features

In [None]:
cols_to_keep = [
              ,'water_quality'
              ,'waterpoint_type_group'
              ,'source'
              ,'quantity'
              ,'quality_group'
              ,'water_quality'
              ,'payment_type'
              ,'management'
              ,'extraction_type'
              ,'amount_tsh'
              ,'permit'
              ,'public_meeting'
              ,'func_neighbor_perc'
              ,'non_func_neighbor_perc'
              ,'needs_repair_neighbor_perc'
              ,'id'
              ,'y']
train_filt, X_test_filt = filter_dfs([X_train_with_neighbor, X_test_with_neighbor], cols_to_keep)

In [None]:
# Using this to ensure that any one hot encoded 'missing' values are properly differentiated 

def add_suffix(col, value):
  name = col.name
  new_value = value + '_'+ name
  col[col == value] = new_value
  return col

In [None]:
def separate_train_data(df):
    df['id'] = np.arange(len(df))
    y_actual = df[['y', 'id']]
    X = df.drop(columns='y')
    return y_actual, X

y_train, X_train = separate_train_data(train_filt)

In [None]:
def one_hot_encode(fit_df, transform_df, target_cols):
  target_fit, target_transform = [df[target_cols].apply(add_suffix, args=('missing',)) for df in [fit_df, transform_df]]
  ohe = OneHotEncoder(sparse=False).fit(target_fit)
  
  transformed = ohe.transform(target_transform)
  cols = list(chain(*ohe.categories_))
  res = pd.DataFrame(transformed, columns =cols)
  return res



In [None]:
def ohe_dispatch(train, test):
    cat_cols = train.select_dtypes(include='object').columns

    test_ohe = one_hot_encode(train, test, cat_cols)
    train_ohe = one_hot_encode(train, train, cat_cols)

    full_train = pd.concat([train, train_ohe], axis='columns').drop(columns=cat_cols)
    full_test = pd.concat([test, test_ohe], axis='columns').drop(columns=cat_cols)
    return full_train, full_test
    
full_train, full_test  = ohe_dispatch(X_train, X_test_filt)

In [None]:
full_train.to_csv('/data/modeling_data_train.csv')
full_test.to_csv('/data/modeling_data_test.csv')