# Clustering Model using K-Modes Algorithm

In [223]:
import kmodes
from kmodes import kmodes
from datetime import datetime
from sklearn import preprocessing
from kmodes.kmodes import KModes

import pandas as pd
import numpy as np
import sqlite3

import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px

import missingno as mn

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Preprocess Data (OneHeatEncoding, Join Tables)

### load in data

In [79]:
conn = sqlite3.connect('covid_trials.db')
trial_info=pd.read_sql("select * from trial_info", con = conn)
study_designs=pd.read_sql("select * from study_designs", con = conn)
interventions=pd.read_sql("select * from interventions", con = conn)
outcome_measures=pd.read_sql("select * from outcome_measures", con = conn)
sponsor_collaborators=pd.read_sql("select * from sponsor_collaborators", con = conn)
funded_bys=pd.read_sql("select * from funded_bys", con = conn)
study_type=pd.read_sql("select * from study_type", con = conn)
conn.close()

### `trial_info` as `df`

In [80]:
# Preprocessing data

df = trial_info.copy()

## age
df['Age'] = df.Age.str.extract(r'[(](.*?)[)]')

## date
date_columns = ['Start Date',                       
                'Completion Date',
                'First Posted',
                'Last Update Posted' ]

def rep_m(m):
    months = ["January", "February", "March", "April", "May", "June", "July", 
              "August", "September", "October", "November", "December"]
    months = [x.upper() for x in months]
    for i in months:
        if m == i:
            m = months.index(i) + 1
    return str(m)

def to_date(date_str):
    if date_str == "NAN NAN":
        return np.nan
    else:
        date_str = date_str.split()
        Y = date_str[1]
        m = rep_m(date_str[0])

        date = datetime.strptime(Y + "-" + m, "%Y-%m")

        return date


df[date_columns] = df[date_columns].applymap(to_date)

## trial duration
def get_interval_day(arrLike, start, end):   
    start_date = arrLike[start]
    end_date = arrLike[end]

    return (end_date - start_date).days

# https://blog.csdn.net/u010339879/article/details/79505570
def month_delta(start_date, end_date):
    flag = True
    if start_date > end_date:
        start_date, end_date = end_date, start_date
        flag = False
    year_diff = end_date.year - start_date.year
    end_month = year_diff * 12 + end_date.month
    delta = end_month - start_date.month
    return -delta if flag is False else delta


def get_interval_month(arrLike, start, end):   
    start_date = arrLike[start]
    end_date = arrLike[end]

    return month_delta(start_date, end_date)

df['Trial_Duration_Days'] = df.apply(
    get_interval_day, axis=1, args=('Start Date', 'Completion Date'))

df['Trial_Duration_Months'] = df.apply(
    get_interval_month, axis=1, args=('Start Date', 'Completion Date'))

In [81]:
# ## categorize duration
# def cate_duration(duration):
#     if duration < 1:
#         return 'less then 1 month'
#     elif duration <= 3:
#         return '1 - 3 months'
#     elif duration <= 6:
#         return '4 - 6 months'
#     elif duration <= 12:
#         return '7 - 12 months'
#     elif duration <= 24:
#         return '1 - 2 years'
#     elif duration <= 60:
#         return '2 - 5 years'
#     elif duration <= 120:
#         return '5 - 10 years'
#     else:
#         return 'over 10 years'

# df["Trial_Duration_Category"] = df.Trial_Duration_Months.apply(cate_duration)

# ## categorize enrollment
# def cate_eroll(enroll):
#     if enroll < 10:
#         return 'less then 10'
#     elif enroll <= 50:
#         return '11 - 50'
#     elif enroll <= 100:
#         return '51 - 100'
#     elif enroll <= 200:
#         return '101 - 200'
#     elif enroll <= 500:
#         return '201 - 500'
#     elif enroll <= 1000:
#         return '501 - 1000'
#     elif enroll <= 5000:
#         return '1001 - 5000'
#     elif enroll <= 10000:
#         return '5001 - 10000'
#     else:
#         return 'over 10000'  

# df["Enrollment_Category"] = df.Enrollment.apply(cate_eroll)

In [82]:
df["Trial_Duration_Category"] = pd.cut(df.Trial_Duration_Months,
                                      [-float('inf'),0,3,6,12,24,60,120,float('inf')],
                                      labels=['less then 1 month','1 - 3 months','4 - 6 months','7 - 12 months',
                                              '1 - 2 years','2 - 5 years','5 - 10 years','over 10 years'])
df["Enrollment_Category"] = pd.cut(df.Enrollment,
                                  [-float('inf'),9,50,100,200,500,1000,5000,10000,float('inf')],
                                  labels=['less then 10','11 - 50','51 - 100','101 - 200',
                                          '201 - 500','501 - 1000','1001 - 5000','5001 - 10000','over 10000'])

In [89]:
df["Study Type"] = (
    df["Study Type"].
    replace({"TREATMENT IND/PROTOCOL":"EXPANDED ACCESS",
             "INTERMEDIATE-SIZE POPULATION":"EXPANDED ACCESS",})
    .replace(regex={r'EXPANDED ACCESS:.*':"EXPANDED ACCESS"})
)

In [105]:
df["Study Type"].value_counts()

INTERVENTIONAL     2187
OBSERVATIONAL      1635
EXPANDED ACCESS      27
Name: Study Type, dtype: int64

In [90]:
mask_cols = ["PARTICIPANT", "CARE PROVIDER", "INVESTIGATOR", "OUTCOMES ASSESSOR"]
mask = study_designs[["NCT Number", "MASKING"]].replace({np.nan:""})
for col in mask_cols:
    mask[col] = [col in mask.MASKING[i] for i in range(mask.MASKING.size)]
mask.drop(columns="MASKING", inplace=True)
mask

Unnamed: 0,NCT Number,PARTICIPANT,CARE PROVIDER,INVESTIGATOR,OUTCOMES ASSESSOR
0,NCT04372602,True,True,True,False
1,NCT04364698,False,False,False,False
2,NCT04482621,True,False,True,False
3,NCT04459637,False,False,False,False
4,NCT04425538,False,False,False,False
...,...,...,...,...,...
3844,NCT04589923,False,False,False,False
3845,NCT03871491,True,True,True,False
3846,NCT04386876,False,False,False,False
3847,NCT04276987,False,False,False,False


### `study_designs` without "MASKING"

In [55]:
study_designs_ = study_designs.drop(columns="MASKING")
study_designs_

Unnamed: 0,NCT Number,ALLOCATION,INTERVENTION MODEL,PRIMARY PURPOSE,OBSERVATIONAL MODEL,TIME PERSPECTIVE
0,NCT04372602,RANDOMIZED,SINGLE GROUP ASSIGNMENT,TREATMENT,,
1,NCT04364698,,,,COHORT,PROSPECTIVE
2,NCT04482621,RANDOMIZED,PARALLEL ASSIGNMENT,TREATMENT,,
3,NCT04459637,,,,COHORT,PROSPECTIVE
4,NCT04425538,,SINGLE GROUP ASSIGNMENT,TREATMENT,,
...,...,...,...,...,...,...
3844,NCT04589923,,,,COHORT,PROSPECTIVE
3845,NCT03871491,RANDOMIZED,PARALLEL ASSIGNMENT,PREVENTION,,
3846,NCT04386876,RANDOMIZED,CROSSOVER ASSIGNMENT,OTHER,,
3847,NCT04276987,,SINGLE GROUP ASSIGNMENT,TREATMENT,,


### `interventions`

In [54]:
interventions_type = interventions
interventions_type.iloc[:,1:] = (interventions.replace({np.nan:0}) == 0).iloc[:,1:]
interventions_type

Unnamed: 0,NCT Number,DRUG,PROCEDURE,OTHER,DEVICE,BIOLOGICAL,DIAGNOSTIC TEST,DIETARY SUPPLEMENT,GENETIC,COMBINATION PRODUCT,BEHAVIORAL,RADIATION
0,NCT04372602,False,False,True,True,True,True,True,True,True,True,True
1,NCT04364698,True,True,True,True,True,True,True,True,True,True,True
2,NCT04482621,False,True,False,True,True,True,True,True,True,True,True
3,NCT04459637,True,True,True,True,True,True,True,True,True,True,True
4,NCT04425538,False,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
3844,NCT04589923,True,True,True,False,True,True,True,True,True,True,True
3845,NCT03871491,False,True,True,True,True,True,True,True,True,True,True
3846,NCT04386876,False,True,True,True,True,True,True,True,True,True,True
3847,NCT04276987,True,True,True,True,False,True,True,True,True,True,True


### `study_type`

In [70]:
study_type["Study Type"] = (
    study_type["Study Type"].
    replace({"TREATMENT IND/PROTOCOL":"EXPANDED ACCESS",
             "INTERMEDIATE-SIZE POPULATION":"EXPANDED ACCESS",})
    .replace(regex={r'EXPANDED ACCESS:.*':"EXPANDED ACCESS"})
)
study_type_ = (
    study_type.drop(columns = "index").
    assign(value=True).
    drop_duplicates().
    pivot(index='NCT Number', columns='Study Type', values='value').
    replace({np.nan:False})
)
study_type_

Study Type,EXPANDED ACCESS,INTERVENTIONAL,OBSERVATIONAL
NCT Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00571389,False,False,True
NCT01306084,False,False,True
NCT02735707,False,True,False
NCT02765191,False,True,False
NCT02788903,False,False,True
...,...,...,...
NCT04619628,False,True,False
NCT04619680,False,True,False
NCT04619693,False,False,True
NCT04619706,False,True,False


### `funded_bys`

In [37]:
funded_bys_ = (
    funded_bys.drop(columns = "index").
    assign(value=True).
    drop_duplicates().
    pivot(index='NCT Number', columns='Funded Bys', values='value').
    replace({np.nan:False})
)
funded_bys_

Funded Bys,INDUSTRY,NIH,OTHER,U.S. FED
NCT Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NCT00571389,True,False,False,False
NCT01306084,False,True,False,False
NCT02735707,False,False,True,False
NCT02765191,False,False,True,False
NCT02788903,False,False,True,False
...,...,...,...,...
NCT04619628,True,False,False,False
NCT04619680,True,False,True,False
NCT04619693,False,False,True,False
NCT04619706,True,False,False,False


### Merge data

In [72]:
df.columns

Index(['NCT Number', 'Title', 'Locations', 'Status', 'Study Results',
       'Conditions', 'Gender', 'Age', 'Phases', 'Enrollment', 'URL',
       'Location_Country', 'Location_City_or_State', 'Location_Institution',
       'Start Date', 'Completion Date', 'First Posted', 'Last Update Posted',
       'Trial_Duration_Days', 'Trial_Duration_Months',
       'Trial_Duration_Category', 'Enrollment_Category'],
      dtype='object')

In [139]:
df_= (
    df.drop(columns = ['Title','Locations','Conditions','Enrollment','URL',
                        'Location_City_or_State','Location_Institution','Start Date',
                        'Completion Date','First Posted','Last Update Posted',
                        'Trial_Duration_Days','Trial_Duration_Months']).
    set_index('NCT Number')
)

In [140]:
df_ = (
    df_.merge(study_designs.set_index("NCT Number"),left_index=True, right_index=True).
    replace({np.nan:"NO RECORD", "NAN":"NO RECORD", "N/A":"NO RECORD"})
      )

## Fit Model

In [252]:
km = KModes(n_clusters=3, init = "Huang", n_init = 3,verbose=0)
fitClusters = km.fit_predict(df_)

In [247]:
fitClusters

array([0, 1, 2, ..., 0, 0, 0], dtype=uint16)

In [248]:
clusterCentroidsDf = pd.DataFrame(km.cluster_centroids_)
clusterCentroidsDf.columns = df_.columns
clusterCentroidsDf

Unnamed: 0,Status,Study Results,Gender,Age,Phases,Location_Country,Funded Bys,Study Type,Trial_Duration_Category,Enrollment_Category,ALLOCATION,INTERVENTION MODEL,MASKING,PRIMARY PURPOSE,OBSERVATIONAL MODEL,TIME PERSPECTIVE
0,RECRUITING,NO RESULTS AVAILABLE,ALL,"ADULT, OLDER ADULT",NOT APPLICABLE,UNITED STATES,OTHER,INTERVENTIONAL,7 - 12 months,11 - 50,NO RECORD,SINGLE GROUP ASSIGNMENT,NONE (OPEN LABEL),TREATMENT,NO RECORD,NO RECORD
1,RECRUITING,NO RESULTS AVAILABLE,ALL,"ADULT, OLDER ADULT",NO RECORD,FRANCE,OTHER,OBSERVATIONAL,1 - 3 months,201 - 500,NO RECORD,NO RECORD,NO RECORD,NO RECORD,COHORT,PROSPECTIVE
2,RECRUITING,NO RESULTS AVAILABLE,ALL,"ADULT, OLDER ADULT",PHASE 2,UNITED STATES,OTHER,INTERVENTIONAL,7 - 12 months,51 - 100,RANDOMIZED,PARALLEL ASSIGNMENT,NONE (OPEN LABEL),TREATMENT,NO RECORD,NO RECORD


## Hyperparameter Tunning

### Elbow Method

As K increase, cost will strictly decrease but the speed of decrease may slow down. So when it is not decrease that fast, we may stop and choose that K.
https://datascience.stackexchange.com/questions/64455/how-to-evaluate-the-k-modes-clusters

In [363]:
hyperparams = {
    "n_clusters":range(2,11),
    "init":["Huang","Cao"]
}

para_cost = {}

for init in hyperparams["init"]:
    cost = []
    for n in hyperparams["n_clusters"]:
        km = KModes(n_clusters=n, init = init, n_init = 1, verbose=0)
        km.fit_predict(df_)
        cost.append(km.cost_)
    cost_decrease_ratio = [(cost[n-1] - cost[n])/cost[n-1] if n > 0 else 1 for n, k in enumerate(cost)]
    if_decrease_slow = [1 if cost_decrease_ratio[n] < 0.02 else 0 for n, k in enumerate(cost_decrease_ratio)]
    if 1 in if_decrease_slow:
        idx = np.argwhere(np.array(if_decrease_slow)==1).min() - 1
    else:
        idx = len(if_decrease_slow) - 1
    k = list(hyperparams["n_clusters"])[idx]
    para_cost[(init, k)] = cost[idx]

In [364]:
para_cost

{('Huang', 5): 16047.0, ('Cao', 4): 16973.0}

## Visualize Outcome

In [232]:
clustersDf = pd.DataFrame(fitClusters_cao)
clustersDf.columns = ['Cluster Predicted']
combine = df_.reset_index().merge(clustersDf, left_index=True, right_index=True).set_index("NCT Number")

In [233]:
col = "Trial_Duration_Category"
dfm = combine.assign(count=1).groupby(['Cluster Predicted',col]).agg({"count":'count'}).reset_index()
dfm

Unnamed: 0,Cluster Predicted,Trial_Duration_Category,count
0,0,1 - 2 years,244
1,0,1 - 3 months,424
2,0,2 - 5 years,110
3,0,4 - 6 months,195
4,0,5 - 10 years,13
5,0,7 - 12 months,101
6,0,NO RECORD,27
7,0,less then 1 month,46
8,0,over 10 years,9
9,1,1 - 2 years,304


In [234]:
px.bar(dfm , x="Cluster Predicted", y="count", color=col,barmode='group',height=400)

### maybe a heatmap?