## Import libraries and load files

In [0]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``.

    The devices are enumerated with ``device_lib.list_local_devices()``;
    an empty list is returned when none of them is a GPU.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [2]:
# Display the names of the GPU devices found on this machine.
get_available_gpus()

['/device:GPU:0']

In [0]:
import tensorflow as tf

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import SCORERS

from sklearn.model_selection import KFold
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from warnings import warn

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, adjusted_rand_score

In [0]:
# Load the training set; the CSV's unnamed first column (probably a row id)
# is used as the DataFrame index.
url = 'https://s3.amazonaws.com/drivendata/data/4/public/81e8f2de-9915-4934-b9ae-9705685c9d50.csv'
# Local alternative:
#url = '../src/data/raw/training.csv'
training = pd.read_csv(url, index_col='Unnamed: 0')

# Target (label) columns.
labels = [
    'Function',
    'Object_Type',
    'Operating_Status',
    'Position_Type',
    'Pre_K',
    'Reporting',
    'Sharing',
    'Student_Type',
    'Use',
]

# Numeric feature columns.
numeric = [
    'FTE',
    'Total',
]

# Free-text feature columns.
categoric = [
    'Facility_or_Department',
    'Function_Description',
    'Fund_Description',
    'Job_Title_Description',
    'Location_Description',
    'Object_Description',
    'Position_Extra',
    'Program_Description',
    'SubFund_Description',
    'Sub_Object_Description',
    'Text_1',
    'Text_2',
    'Text_3',
    'Text_4',
]

### FunctionTransformers

In [0]:
# Define combine_text_columns()
def combine_text_columns(data_frame):
    """Merge the text columns of each row into one lower-cased string.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing at least the columns listed in the module-level
        ``categoric`` list. It is not modified.

    Returns
    -------
    pandas.Series
        One string per row of ``data_frame`` (same index): the values of
        the ``categoric`` columns, lower-cased and joined with a single
        space. Missing values contribute an empty string.
    """
    # Keep only the text columns; missing values become empty strings and
    # everything is coerced to str so the join below cannot fail on a
    # non-string cell.
    text_data = data_frame[categoric].fillna("").astype(str)

    # Lower-case the local copy only. The previous version lower-cased the
    # global `training` frame instead, which left the returned text
    # untouched and mutated shared state on every call.
    text_data = text_data.apply(lambda column: column.str.lower())

    # Join all text items of a row with a space in between
    return text_data.apply(lambda row: " ".join(row), axis=1)

In [0]:
# Group the raw training rows by their 'Object_Type' label; the groups supply
# the per-class medians used for imputation below.
groupped_FTE = training[['FTE', 'Object_Type']].groupby(by='Object_Type')
groupped_total = training[['Total', 'Object_Type']].groupby(by='Object_Type')
# Define combine_numeric_columns()
def combine_numeric_columns(data_frame, groupped_FTE=groupped_FTE, groupped_total=groupped_total):
    """Select and clean the numeric columns, imputing per-class medians.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the columns listed in the module-level ``numeric``
        list ('FTE' and 'Total'). It is not modified.
    groupped_FTE, groupped_total : pandas GroupBy
        'FTE' / 'Total' grouped by 'Object_Type'. Rows are matched to a
        group through their index labels, so ``data_frame`` must share its
        index with the frame the groups were built from.

    Returns
    -------
    pandas.DataFrame
        Copy of the numeric columns where 'FTE' values outside [0, 1] and
        negative 'Total' values are treated as missing, and missing values
        are replaced by the median of the row's 'Object_Type' group. Rows
        whose index belongs to no group keep their NaN.
    """
    # Drop non-numeric columns that are in the df
    data = data_frame[numeric].copy()

    # Remove inconsistent data
    data.loc[(data[numeric[0]] < 0) | (data[numeric[0]] > 1), numeric[0]] = np.nan
    data.loc[(data[numeric[1]] < 0), numeric[1]] = np.nan

    # The medians do not depend on the loop variable: compute them once
    # instead of on every iteration.
    # NOTE(review): they are computed from the raw grouped values, i.e.
    # including the values treated as inconsistent above - confirm intended.
    medians_FTE = groupped_FTE.median()
    medians_total = groupped_total.median()
    row_labels = data.index.values

    # Impute the missing data with the median from each class
    for group in medians_FTE.index:
        in_group_FTE = np.isin(row_labels, groupped_FTE.get_group(group).index.values)
        in_group_total = np.isin(row_labels, groupped_total.get_group(group).index.values)
        data.loc[data.FTE.isnull() & in_group_FTE, 'FTE'] = medians_FTE.loc[group, "FTE"]
        data.loc[data.Total.isnull() & in_group_total, 'Total'] = medians_total.loc[group, "Total"]

    return data

In [0]:
# Wrap the two column-combining helpers as scikit-learn transformers so they
# can be used as pipeline steps. validate=False passes the DataFrame through
# without input validation.

# Text branch: one combined string per row.
get_text_data = FunctionTransformer(func=combine_text_columns, validate=False)

# Numeric branch: cleaned and imputed numeric columns.
get_numeric_data = FunctionTransformer(func=combine_numeric_columns, validate=False)

In [0]:
# Features: every column of the training frame except the label columns.
X = training.drop(labels, axis=1)

# Targets: one indicator column per 'Object_Type' class.
y = pd.get_dummies(training.loc[:, 'Object_Type'])

# Under-sampling on the raw features was tried and left disabled:
# rus = RandomUnderSampler(random_state=0)
# X_resampled, y_resampled = rus.fit_resample(X, y)

### Pipeline

Apply the transformations to the numeric and categorical data. No standard scaling is applied; the combined features are then reduced to 100 components with `TruncatedSVD`.

In [0]:
# Preprocessing pipeline:
#   1. 'union' builds the numeric and the hashed-text features side by side;
#   2. 'reduce_dim' projects the combined features onto 100 components.
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    # Fills whatever is still missing after the per-class
                    # median imputation done by the selector.
                    ('imp', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    # NOTE(review): the look-ahead `(?=\s+)` only matches
                    # tokens followed by whitespace, so the last token of a
                    # document is skipped - confirm this is intended.
                    ('vectorizer',HashingVectorizer(token_pattern="[A-Za-z0-9]+(?=\\s+)", 
                                                    norm=None, 
                                                    binary=False,
                                                    ngram_range=(1,2)) 
                    )
                ]))
             ]
        )),
        # Fixed seed: TruncatedSVD uses a randomized solver by default, so
        # without it the reduced features differ from run to run.
        ('reduce_dim', TruncatedSVD(n_components = 100, random_state = 0)),
        # Clustering is not a pipeline step; it is fitted separately on the
        # transformed data.
    ])

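After the feature union, the data is a sparse matrix with 1048578 features (the 2 numeric columns plus the hashed text features); `TruncatedSVD` then reduces it to 100 dense components.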

In [0]:
# Fit the preprocessing pipeline on the features and keep the transformed data.
data_X= pl.fit_transform(X, y)

In [0]:
# Balance the classes by random under-sampling of the transformed features.
# The seed is fixed so the sample, and every score computed from it, can be
# reproduced on a fresh run.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(data_X, y.values)

In [13]:
# (rows, components) of the under-sampled feature matrix.
X_resampled.shape

(35420, 100)

In [14]:
# (rows, indicator columns) of the under-sampled targets.
y_resampled.shape

(35420, 11)

## Training
The model is fitted and evaluated with the number of clusters varying from 2 to 20. Because agglomerative clustering is deterministic, the model is fitted only once for each number of clusters.

In [0]:
# Collects one dict of scores per number of clusters tried.
results = []

In [0]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the scores collected by an earlier run.
results = []

for k in range(2, 21):
  # scikit-learn runs on the CPU, so the former `tf.device('/gpu:0')` wrapper
  # had no effect on this fit and was removed.
  # `memory` caches the computed tree on disk; with compute_full_tree=True
  # later values of k should be able to reuse it (see the scikit-learn docs).
  agg = AgglomerativeClustering(memory='mycachedir', 
                             compute_full_tree=True, n_clusters=k)
  # Fit the model to the data and get one cluster label per sample
  cluster_labels = agg.fit_predict(X_resampled)
    
  # The silhouette_score gives the average value for all the samples.
  # This gives a perspective into the density and separation of the formed
  # clusters
  silhouette_avg = silhouette_score(X_resampled, cluster_labels)
  print("For n_clusters =", k,
        "The average silhouette_score is :", silhouette_avg)
  
  # Davies-Bouldin score of the same partition
  db_avg = davies_bouldin_score(X_resampled, cluster_labels)
  print("For n_clusters =", k,
        "The average db_score is :", db_avg)
  
  # Append the results
  results.append({'k':k, 'silhouette': silhouette_avg,
                 'db': db_avg})

For n_clusters = 2 The average silhouette_score is : 0.9985316325554371
For n_clusters = 2 The average db_score is : 0.10570938565578004


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 3 The average silhouette_score is : 0.9892538125423193
For n_clusters = 3 The average db_score is : 0.47954795284588975


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 4 The average silhouette_score is : 0.987351984210137
For n_clusters = 4 The average db_score is : 0.402396162915278


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 5 The average silhouette_score is : 0.9821865245924213
For n_clusters = 5 The average db_score is : 0.36551950315862924


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 6 The average silhouette_score is : 0.9465753249478333
For n_clusters = 6 The average db_score is : 0.40052222202320226


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 7 The average silhouette_score is : 0.9465603429622959
For n_clusters = 7 The average db_score is : 0.44509160921938956


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 8 The average silhouette_score is : 0.8922431116937046
For n_clusters = 8 The average db_score is : 0.4655814409064645


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 9 The average silhouette_score is : 0.892207370368799
For n_clusters = 9 The average db_score is : 0.4036978909153579


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 10 The average silhouette_score is : 0.8921634205486239
For n_clusters = 10 The average db_score is : 0.41466608304339714


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 11 The average silhouette_score is : 0.8921679358518021
For n_clusters = 11 The average db_score is : 0.3539080269586909


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


For n_clusters = 12 The average silhouette_score is : 0.8922819657298096
For n_clusters = 12 The average db_score is : 0.3589793761030178


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [0]:
# Display the scores collected for each number of clusters.
results