In [1]:
import os
import pandas as pd

# Locate the dataset under the Kaggle input tree. Each directory that contains
# files overwrites the previous choice, so the first file of the last such
# directory visited is the one that gets loaded.
input_file = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        input_file = os.path.join(dirname, filenames[0])

# Without this guard an empty/missing input tree surfaces as an opaque
# NameError on the read_csv line below.
if input_file is None:
    raise FileNotFoundError('no input files found under /kaggle/input')

df = pd.read_csv(filepath_or_buffer=input_file, index_col=['Case_No'])
# we have several booleans we want to transform
# NOTE: the class column name ends with a trailing space in the data.
df['target'] = df['Class/ASD Traits '] == 'Yes'
df['male'] = df['Sex'] == 'm'
df['jaundice'] = df['Jaundice'] == 'yes'
df['family'] = df['Family_mem_with_ASD'] == 'yes'
df.head()

Unnamed: 0_level_0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Who completed the test,Class/ASD Traits,target,male,jaundice,family
Case_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,1,1,0,1,...,f,middle eastern,yes,no,family member,No,False,False,True,False
2,1,1,0,0,0,1,1,0,0,0,...,m,White European,yes,no,family member,Yes,True,True,True,False
3,1,0,0,0,0,0,1,1,0,1,...,m,middle eastern,yes,no,family member,Yes,True,True,True,False
4,1,1,1,1,1,1,1,1,1,1,...,m,Hispanic,no,no,family member,Yes,True,True,False,False
5,1,1,0,1,1,1,1,1,1,1,...,f,White European,no,yes,family member,Yes,True,False,False,True


In [2]:
import warnings
from plotly import express
# Suppress FutureWarning messages for the remainder of the session.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Histogram of the 'Qchat-10-Score' column, colored by the boolean target.
x = 'Qchat-10-Score'
score_figure = express.histogram(data_frame=df, x=x, color='target')
score_figure.show()

If we want to build a classifier we need to leave this variable (`Qchat-10-Score`) out.

In [3]:
import warnings
from plotly import express
# Suppress FutureWarning messages before plotting.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# One histogram per listed column, each colored by the boolean target.
columns = [
    'Age_Mons',
    'Sex',
    'Ethnicity',
    'Jaundice',
    'Family_mem_with_ASD',
    'Who completed the test',
]
for x in columns:
    feature_figure = express.histogram(data_frame=df, x=x, color='target')
    feature_figure.show()

We don't have an enormous amount of data, so let's use dimensionality reduction to see whether the data clusters.

In [4]:
# Display the frame's column labels (the raw fields plus the derived boolean columns).
df.columns

Index(['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',
       'Qchat-10-Score', 'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD',
       'Who completed the test', 'Class/ASD Traits ', 'target', 'male',
       'jaundice', 'family'],
      dtype='object')

In [5]:
from umap import UMAP
from arrow import now

target = 'target'
columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons',  'male', 'jaundice', 'family']

time_start = now()
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)
embedding = reducer.fit_transform(X=df[columns])
# The embedding carries no index of its own. pandas aligns on index labels when
# a DataFrame is assigned into columns, and df is indexed by Case_No (starting
# at 1), so a default 0-based index would shift every point by one case and
# leave the last case NaN. Reusing df.index keeps each point on its own row.
df[['x', 'y']] = pd.DataFrame(data=embedding, index=df.index, columns=['x', 'y'])
express.scatter(data_frame=df, x='x', y='y', color='target', height=800, ).show()
print('UMAP done in {}'.format(now() - time_start))

2024-03-16 12:55:56.993005: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-16 12:55:56.993108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-16 12:55:57.142522: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Sat Mar 16 12:56:07 2024 Construct fuzzy simplicial set
Sat Mar 16 12:56:08 2024 Finding Nearest Neighbors
Sat Mar 16 12:56:11 2024 Finished Nearest Neighbor Search
Sat Mar 16 12:56:13 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Sat Mar 16 12:56:14 2024 Finished embedding


UMAP done in 0:00:07.140450


Dimension reduction induces clustering, but the target variable is not strongly evident in the induced clusters.

Before we proceed let's look at our correlations with the target variable.

In [6]:
# Correlation of each feature with the target. The target was appended last,
# so keeping the first len(columns) rows drops its self-correlation.
target_correlations = (
    df[columns + [target]]
    .corr()['target']
    .to_frame()
    .reset_index()
    .head(n=len(columns))
)
express.histogram(data_frame=target_correlations, x='index', y='target')

All of our A variables are highly correlated with the target except one. This is encouraging. Let's build a model.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from arrow import now

time_start = now()

# Stratified hold-out split: 34% of the rows are reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df[target],
    test_size=0.34,
    random_state=2024,
    stratify=df[target],
)

regression = LogisticRegression(max_iter=1000, tol=1e-12)
regression.fit(X=X_train, y=y_train)

iterations_used = regression.n_iter_[0]
print('fit complete after {} iterations.'.format(iterations_used))

test_accuracy = regression.score(X=X_test, y=y_test)
print('accuracy: {:5.4f}'.format(test_accuracy))

# One bar per feature showing its fitted coefficient.
coefficient_figure = express.histogram(x=columns, y=regression.coef_[0])
coefficient_figure.show()
print('done in {}'.format(now() - time_start))

fit complete after 92 iterations.
accuracy: 1.0000


done in 0:00:00.089079
