In [1]:
import pandas as pd

# Path to the dataset CSV.
DATA = '/kaggle/input/huntington-disease-dataset/Huntington_Disease_Dataset.csv'

# Read the CSV, discard the Patient_ID column, then preview the first rows.
df = pd.read_csv(DATA)
df = df.drop('Patient_ID', axis='columns')
df.head()

Unnamed: 0,Age,Sex,Family_History,HTT_CAG_Repeat_Length,Motor_Symptoms,Cognitive_Decline,Chorea_Score,Brain_Volume_Loss,Functional_Capacity,Gene_Mutation_Type,HTT_Gene_Expression_Level,Protein_Aggregation_Level,Random_Protein_Sequence,Random_Gene_Sequence,Disease_Stage,Gene/Factor,Chromosome_Location,Function,Effect,Category
0,31,Male,No,67,Moderate,Severe,8.8,3.2,94,Deletion,1.67,0.58,DAHKIRSPMRVGPHYYAQCDNNDTGSDKEHWLKTEAAPMTMDRTVE...,GCCAGCAGCGCCCGAGCGTATGAGGTATATGGATTGGACATTGGGC...,Middle,HTT,4p16.3,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause
1,33,Female,Yes,38,Severe,Moderate,3.24,5.98,50,Point Mutation,0.18,0.3,PANGFWYHNCLRFWNIPPYVMEGFPLADITEVHKWRVSGFMCWETQ...,AGTTTTCAGTGAGACTCTTCCCCAAAAGCCTCCACTACGACAGTGT...,Pre-Symptomatic,HTT,4p16.3,CAG Trinucleotide Repeat Expansion,Neurodegeneration,Primary Cause
2,69,Male,Yes,37,Severe,Moderate,1.01,2.82,69,Duplication,0.9,1.04,NWHEGHGASTWKATMVAWCLMVQHAVTWKEGNTRCREMSCMNFTQL...,TATACCACCAGTGGGAAGAGTAACGATTTTGGAGCGCCCCGAGTCC...,Early,MSH3,5q14.1,Mismatch Repair,CAG Repeat Expansion,Trans-acting Modifier
3,66,Male,Yes,50,Mild,Severe,3.21,6.77,76,Deletion,1.16,1.87,KCVQYIQATQMLVQSWGQRNPIMQSSEPDRAHDYESGTPKTYTYML...,GCGCGACCGACCAAAGGACCCATGGTGGTGATCTGTCATTGGATTC...,Pre-Symptomatic,MSH3,5q14.1,Mismatch Repair,CAG Repeat Expansion,Trans-acting Modifier
4,43,Female,Yes,48,Moderate,Mild,2.31,7.53,70,Insertion,1.85,2.94,DQPGNMTRQNKNHCMWRAKRPTKHPGHKPGEIDKEKSEQNDADSSA...,GGGACCGCGGTTCTAGAAGAGAGGTTCTCTGACCGCCGAAGGATTC...,Late,HTT (Somatic Expansion),4p16.3,CAG Repeat Instability,Faster Disease Onset,Cis-acting Modifier


How much data do we have?

In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(48536, 20)

We have a lot of data. How complete is our dataset?

In [3]:
# Summarize each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48536 entries, 0 to 48535
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        48536 non-null  int64  
 1   Sex                        48536 non-null  object 
 2   Family_History             48536 non-null  object 
 3   HTT_CAG_Repeat_Length      48536 non-null  int64  
 4   Motor_Symptoms             48536 non-null  object 
 5   Cognitive_Decline          36417 non-null  object 
 6   Chorea_Score               48536 non-null  float64
 7   Brain_Volume_Loss          48536 non-null  float64
 8   Functional_Capacity        48536 non-null  int64  
 9   Gene_Mutation_Type         48536 non-null  object 
 10  HTT_Gene_Expression_Level  48536 non-null  float64
 11  Protein_Aggregation_Level  48536 non-null  float64
 12  Random_Protein_Sequence    48536 non-null  object 
 13  Random_Gene_Sequence       48536 non-null  obj

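Most of our data is non-null: of the columns listed above, only Cognitive_Decline has missing values (36,417 of 48,536 rows are non-null, so roughly a quarter are missing). Unfortunately, a lot of our data is non-numerical, and it's not clear what we would do with the protein sequence or gene sequence data. Let's take a sample and build a scatter plot using dimensionality reduction with t-SNE.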

In [4]:
from plotly import express
from plotly import io

# Make 'iframe' the default Plotly renderer for the figures displayed in this notebook.
io.renderers.default = 'iframe'

Let's make a list of candidate target variables.

In [5]:
# Categorical columns we treat as candidate target variables.
TARGETS = [
    'Family_History',
    'Motor_Symptoms',
    'Cognitive_Decline',
    'Gene_Mutation_Type',
    'Disease_Stage',
    'Gene/Factor',
    'Chromosome_Location',
    'Function',
    'Effect',
    'Category',
]

Are our target variables balanced or unbalanced?

In [6]:
# Print the class frequencies of every candidate target as a plain dict.
# Note: value_counts() leaves missing values out of the tally by default.
for name in TARGETS:
    frequencies = df[name].value_counts()
    print(frequencies.to_dict())

{'Yes': 24329, 'No': 24207}
{'Moderate': 16261, 'Severe': 16157, 'Mild': 16118}
{'Moderate': 12243, 'Severe': 12132, 'Mild': 12042}
{'Insertion': 12239, 'Duplication': 12220, 'Point Mutation': 12133, 'Deletion': 11944}
{'Middle': 12289, 'Pre-Symptomatic': 12191, 'Early': 12032, 'Late': 12024}
{'HTT': 12229, 'MLH1': 12223, 'MSH3': 12056, 'HTT (Somatic Expansion)': 12028}
{'4p16.3': 24257, '3p22.2': 12223, '5q14.1': 12056}
{'Mismatch Repair': 24279, 'CAG Trinucleotide Repeat Expansion': 12229, 'CAG Repeat Instability': 12028}
{'CAG Repeat Expansion': 24279, 'Neurodegeneration': 12229, 'Faster Disease Onset': 12028}
{'Trans-acting Modifier': 24279, 'Primary Cause': 12229, 'Cis-acting Modifier': 12028}


Let's build our scatter plot. For performance reasons we'll use a smallish sample of our data.

In [7]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Numeric feature columns used as input to the embedding.
COLUMNS = df.select_dtypes(include='number').columns.to_list()
N = 1000  # rows sampled for the embedding; kept small for performance
RANDOM_STATE = 2025  # shared seed for the sample and for t-SNE

reducer = TSNE(random_state=RANDOM_STATE)
# Never ask for more rows than the frame holds (df.sample would raise).
sample_df = df.sample(n=min(N, len(df)), random_state=RANDOM_STATE)

# t-SNE is driven by pairwise distances, so features with large ranges
# (e.g. Age, Functional_Capacity) would otherwise swamp small-range ones
# (e.g. HTT_Gene_Expression_Level). Standardize every feature first.
scaled_features = StandardScaler().fit_transform(sample_df[COLUMNS])

# Two-dimensional embedding, one row per sampled record.
plot_df = pd.DataFrame(columns=['x', 'y'], data=reducer.fit_transform(X=scaled_features))

# Attach the candidate targets positionally; plot_df has a fresh RangeIndex,
# so go through plain lists instead of index-aligned assignment.
for target in TARGETS:
    plot_df[target] = sample_df[target].to_list()

Now let's color our scatter plot by each of the target variables in turn and check whether any local clustering corresponds to that target variable.

In [8]:
# Draw the same embedding once per candidate target, colored by that target.
# Looping over TARGETS keeps the plots in sync with the list instead of relying
# on hard-coded positions. Figures created inside a loop are not the cell's
# last expression, so each one is rendered explicitly with .show().
for target in TARGETS:
    express.scatter(
        data_frame=plot_df,
        x='x',
        y='y',
        color=target,
        title=f't-SNE embedding colored by {target}',
    ).show()

Unfortunately we don't see any local clustering that corresponds to our target variables.