In [357]:
import pandas as pd

# Load the clinical and exposure tables (tab-separated exports).
df1 = pd.read_csv('clinical.cart.2024-02-01/clinical.tsv', sep='\t')
df2 = pd.read_csv('clinical.cart.2024-02-01/exposure.tsv', sep='\t')

# Keep every second clinical row: per the original note, alternate rows
# are duplicates of each other.
df1 = df1.iloc[::2]

# Drop the patient with the most missing data. Label 904 in the clinical
# table and label 452 in the exposure table are removed together;
# presumably they are the same patient (452 == 904 // 2) -- TODO confirm.
df1 = df1.drop(904).reset_index(drop=True)
df2 = df2.drop(452).reset_index(drop=True)

# Columns with a single distinct value cannot help the model.
df1 = df1.drop(columns=df1.columns[df1.nunique() == 1])

# Remove identifiers, redundant age columns, and columns that would leak
# the target (dead/alive) into the features.
df1 = df1.drop(columns=[
    'case_id',
    'case_submitter_id',
    'age_at_index',            # same info as days_to_birth
    'days_to_death',           # too many nulls, gives data on dead/alive
    'year_of_death',           # gives data on dead/alive
    'year_of_birth',           # gives data on dead/alive
    'days_to_last_follow_up',  # gives data on dead/alive
    'year_of_diagnosis',       # gives data on dead/alive
    'age_at_diagnosis',        # same data as days_to_birth
])

# Both frames were re-indexed above, so this assignment aligns row by row.
df1['alcohol_history'] = df2['alcohol_history']

# One-hot encode every column except the ones handled explicitly below.
# A single get_dummies call with `columns=` prefixes each indicator with its
# source column name ("<column>_<value>"). Encoding each column separately
# without a prefix named indicators only after the raw value, so values
# shared by several columns (e.g. the "'--" missing token) produced
# duplicate, untraceable column names.
passthrough_columns = ['days_to_birth', 'gender', 'vital_status',
                       'prior_malignancy', 'prior_treatment']
categorical_columns = [name for name in df1.columns
                       if name not in passthrough_columns]
df1 = pd.get_dummies(df1, columns=categorical_columns)

# Binary columns -> booleans. Any value missing from a mapping becomes NaN.
df1['gender'] = df1['gender'].map({'male': True, 'female': False})
df1['prior_malignancy'] = df1['prior_malignancy'].map({'yes': True, 'no': False})
df1['prior_treatment'] = df1['prior_treatment'].map({'Yes': True, 'No': False})
df1['vital_status'] = df1['vital_status'].map({'Alive': True, 'Dead': False})

# Store days_to_birth as a positive number. abs() replaces slicing off the
# first character, which silently corrupts any value without a leading sign
# and fails outright when the column was parsed as numeric.
df1['days_to_birth'] = pd.to_numeric(df1['days_to_birth']).abs()

# Smoking exposure: keep an explicit "was missing" flag next to each numeric
# column, and use -1 as the placeholder for values that could not be parsed.
for smoking_column in ['cigarettes_per_day', 'pack_years_smoked', 'years_smoked']:
    df1[smoking_column + '_is_null'] = df2[smoking_column] == "'--"
    df1[smoking_column] = pd.to_numeric(
        df2[smoking_column], errors='coerce', downcast='integer'
    ).fillna(-1)

In [375]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fixed seed so the split, the forest, and therefore the reported accuracy
# are identical on every Restart & Run All.
SEED = 42

# Features are every column except the target; the target is vital_status.
X = df1.drop('vital_status', axis=1)
y = df1['vital_status']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

classifier = RandomForestClassifier(random_state=SEED)
classifier.fit(X_train, y_train)

# Score the held-out rows.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.660377358490566


In [2]:
import pandas as pd

# Reload both raw tables; df1 stays untouched in this cell.
df1 = pd.read_csv('clinical.cart.2024-02-01/clinical.tsv', sep='\t')
df2 = pd.read_csv('clinical.cart.2024-02-01/exposure.tsv', sep='\t')

# Even-positioned rows form the base table. The treatment columns of the
# odd-positioned rows are attached positionally as "_2" columns, so they are
# kept rather than discarded with the alternate rows.
df3 = df1.iloc[::2].assign(
    treatment_or_therapy_2=df1['treatment_or_therapy'].iloc[1::2].values,
    treatment_type_2=df1['treatment_type'].iloc[1::2].values,
)

# Remove the patient with the most missing data from both tables, then
# renumber the rows so the two frames line up again.
df3 = df3.drop(index=904).reset_index(drop=True)
df2 = df2.drop(index=452).reset_index(drop=True)

# Keep only the columns that have more than one distinct value.
df3 = df3.loc[:, df3.nunique() != 1]
df2 = df2.loc[:, df2.nunique() != 1]

# Append the last four remaining exposure columns, selected by position.
# NOTE(review): presumably alcohol_history and the three smoking columns --
# confirm, since the selection depends on which columns survived the filter.
df3 = pd.concat([df3, df2.iloc[:, -4:]], axis=1)

In [3]:
# Per-column overview of df3: how many cells hold the "'--" missing token
# and which distinct values occur.
#
# The unique values are collected column by column as plain lists.
# DataFrame.apply returns a DataFrame instead of a Series whenever every
# column happens to have the same number of unique values, which would break
# this construction. Lists are also written to the TSV in full on one line,
# whereas NumPy arrays are rendered with line wraps and "..." abbreviations.
summary_df = pd.DataFrame(
    {
        'Column Name': list(df3.columns),
        'Null Values': (df3 == "'--").sum().to_numpy(),
        'Unique Values': [column.unique().tolist() for _, column in df3.items()],
    },
    index=df3.columns,  # same row labels the Series-based construction produced
)


summary_df.to_csv('clinical_summary.tsv', sep='\t', index=False)