In [44]:
import pandas as pd
import numpy as np

# Render up to 100 columns when a wide frame is displayed.
pd.options.display.max_columns = 100
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

In [45]:
# Folder with the input/output files.  `path` ends with a slash because file
# names are appended to it by plain string concatenation.
path = 'data/'
excel_file = path + 'Oncothromb_bbddMarzo2018.xlsx'
data = pd.read_excel(excel_file)
# Report (rows, columns) of the raw table.
print(data.shape)

(408, 92)


In [24]:
# Keep only the patients whose `excluido` flag is 0.
# NOTE(review): this cell has an out-of-sequence execution count, and the
# recorded outputs of the following cells still show 408 rows (including a
# row with excluido == 1), so the filter does not appear to have been in
# effect for the saved results -- confirm whether it should be applied.
data = data.loc[data['excluido'].eq(0)]
data.shape

(391, 92)

In [46]:
# Binary outcome: 1 when caseAtVisit equals 0 or 1, otherwise 0 (missing
# values therefore become 0).  Passing through a plain list gives y a fresh
# 0..n-1 index rather than the row labels of `data`.
is_case = data['caseAtVisit'].isin([0, 1])
y = pd.Series(is_case.astype(int).tolist())
len(y)  # not rendered: only the last expression of a cell is displayed
y.value_counts()

0    329
1     79
dtype: int64

In [47]:
# Clinical predictors plus the bookkeeping columns (id, excluido).
clinical_vars = ['id', 'excluido', 'bmi', 'Family', 'tipusTumor_desc', 'estadiGrup', 'khorana']
# Take an explicit copy: the following cells modify df_clinical in place, and
# doing that on a bare selection of `data` is exactly the situation pandas'
# chained-assignment warning (disabled in this notebook) exists to flag.
df_clinical = data[clinical_vars].copy()
df_clinical.shape

(408, 7)

In [48]:
# Inspect the tumour-type column: number of missing entries first, then the
# frequency of each category.
tumor_type = df_clinical['tipusTumor_desc']
print(tumor_type.isna().sum())
tumor_type.value_counts()

0


Cáncer colorrectal                 167
Cáncer de pulmón no microcítico     90
Cáncer de páncreas                  80
Cáncer gástrico o de estómago       56
Cáncer esófago                      14
-                                    1
Name: tipusTumor_desc, dtype: int64

In [49]:
# Treat the '-' entry of tipusTumor_desc as a missing value.
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection acts on a temporary Series and is not guaranteed to reach
# df_clinical (it does not under pandas Copy-on-Write).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clinical['tipusTumor_desc'] = df_clinical['tipusTumor_desc'].replace('-', np.nan)

In [50]:
# Discard every row that has a missing value in any clinical column, then
# show how many rows are left.
df_clinical = df_clinical.dropna()
df_clinical.shape

(405, 7)

In [51]:
# Recode the categorical clinical variables as numbers.  Every result is
# assigned back to its column: replace(inplace=True) on a column selection
# works on a temporary Series and does not update the frame under pandas
# Copy-on-Write.  Labels that are not listed below are left unchanged.

# BMI: underweight/normal -> 0, overweight/obese -> 1.
bmi_codes = {
    'Underweight: BMI < 18.5 Kg/m2': 0,
    'Normal: BMI ~ 18.5-24.9 Kg/m2': 0,
    'Overweight: BMI ~25-29.9 Kg/m2': 1,
    'Obese: BMI > 30 kg/m2': 1,
}
df_clinical['bmi'] = df_clinical['bmi'].replace(bmi_codes)

# Tumour type as two indicator columns; a tumour type named in neither
# indicator gets 0 in both.
tumor_type = df_clinical['tipusTumor_desc']
df_clinical['tipusTumor_HR'] = tumor_type.eq('Cáncer de pulmón no microcítico').astype('int64')
df_clinical['tipusTumor_VHR'] = tumor_type.isin(
    ['Cáncer de páncreas', 'Cáncer gástrico o de estómago']
).astype('int64')
df_clinical = df_clinical.drop('tipusTumor_desc', axis=1)

# Stage group: the listed stage I-III labels -> 0, the stage IV labels -> 1.
stage_codes = {stage: 0 for stage in ['IA', 'IB', 'IIA', 'IIB', 'IIC', 'III', 'IIIA', 'IIIB', 'IIIC']}
stage_codes.update({stage: 1 for stage in ['IV', 'IVA', 'IVB']})
df_clinical['estadiGrup'] = df_clinical['estadiGrup'].replace(stage_codes)

In [52]:
# Shape of the clinical frame after encoding (rows, columns).
df_clinical.shape

(405, 8)

In [53]:
# Preview the first rows of the encoded clinical frame.
df_clinical.head()

Unnamed: 0,id,excluido,bmi,Family,estadiGrup,khorana,tipusTumor_HR,tipusTumor_VHR
0,19,1,0,0,1,2.0,0,1
1,1,0,1,0,1,2.0,0,0
2,14,0,0,0,0,2.0,0,1
3,67,0,0,0,1,1.0,1,0
4,91,0,0,0,1,0.0,0,0


In [54]:
# Genotype columns (one per SNP) plus the bookkeeping columns (id, excluido).
genetic_vars = ['id', 'excluido', 'rs2232698', 'rs6025', 'rs5985', 'rs4524']
# Take an explicit copy: the following cells modify df_genetic in place, and
# doing that on a bare selection of `data` is exactly the situation pandas'
# chained-assignment warning (disabled in this notebook) exists to flag.
df_genetic = data[genetic_vars].copy()
df_genetic.shape

(408, 6)

In [55]:
# Turn the 'NoCall' marker into a real missing value in every column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; the
# result is rebound instead of mutating the frame in place.
df_genetic = df_genetic.replace('NoCall', np.nan)

In [56]:
# Number of missing values per column of the genetic frame.
df_genetic.isna().sum()

id           0
excluido     0
rs2232698    1
rs6025       0
rs5985       0
rs4524       2
dtype: int64

In [57]:
# Impute missing genotypes with 'CC' instead of dropping the affected rows
# (the row-dropping alternative is kept here for reference):
# df_genetic.dropna(inplace=True)
# The fill applies to every column of the frame, not only the SNP columns.
# NOTE(review): 'CC' is coded as 0 further down for rs2232698 and rs4524; the
# recorded summary table shows 0 as the least frequent rs4524 genotype --
# confirm this is the intended imputation value.
df_genetic = df_genetic.fillna('CC')
df_genetic.shape

(408, 6)

In [58]:
# Recode each SNP genotype as an integer code.  Genotype strings that are not
# listed stay unchanged.
# Each result is assigned back to its column: replace(inplace=True) on a
# column selection works on a temporary Series and does not update the frame
# under pandas Copy-on-Write.
genotype_codes = {
    'rs2232698': {'CC': 0, 'CT': 1},
    'rs6025': {'GG': 0, 'AG': 1},
    'rs5985': {'GG': 0, 'GT': 1, 'TT': 2},
    'rs4524': {'CC': 0, 'CT': 1, 'TT': 2},
}
for snp, codes in genotype_codes.items():
    df_genetic[snp] = df_genetic[snp].replace(codes)

In [59]:
# Preview the first rows of the encoded genetic frame.
df_genetic.head()

Unnamed: 0,id,excluido,rs2232698,rs6025,rs5985,rs4524
0,19,1,0,0,0,2
1,1,0,0,0,1,1
2,14,0,0,0,0,1
3,67,0,0,0,0,2
4,91,0,0,0,0,2


In [60]:
# Patients present in both the clinical and the genetic frame.
clinical_ids = set(df_clinical['id'].values)
genetic_ids = set(df_genetic['id'].values)
ids = list(clinical_ids.intersection(genetic_ids))

# Restrict both frames to those patients and join them on the two columns
# they share (id, excluido); merge uses its default inner join.
clinical_rows = df_clinical[df_clinical['id'].isin(ids)]
genetic_rows = df_genetic[df_genetic['id'].isin(ids)]
df = clinical_rows.merge(genetic_rows, on=['id', 'excluido'])
df.shape

(405, 12)

In [61]:
# Disabled filter that would keep only non-excluded patients and refresh `ids`.
# NOTE(review): with it commented out, rows with excluido == 1 stay in df (the
# first row of the recorded preview is one) -- confirm this is intended.
#df = df[df['excluido']==0]
#ids = df['id'].unique()
# Preview the first rows of the merged frame.
df.head() 

Unnamed: 0,id,excluido,bmi,Family,estadiGrup,khorana,tipusTumor_HR,tipusTumor_VHR,rs2232698,rs6025,rs5985,rs4524
0,19,1,0,0,1,2.0,0,1,0,0,0,2
1,1,0,1,0,1,2.0,0,0,0,0,1,1
2,14,0,0,0,0,2.0,0,1,0,0,0,1
3,67,0,0,0,1,1.0,1,0,0,0,0,2
4,91,0,0,0,1,0.0,0,0,0,0,0,2


In [62]:
# Rebuild the binary outcome for the patients listed in `ids`: 1 when
# caseAtVisit equals 0 or 1, otherwise 0.  Rows come in the order of `data`,
# and the plain-list round trip gives y a fresh 0..n-1 index.
kept_cases = data.loc[data['id'].isin(ids), 'caseAtVisit']
y = pd.Series(kept_cases.isin([0, 1]).astype(int).tolist())
len(y)

405

In [63]:
# Class balance of the outcome (count of 0s and 1s).
y.value_counts()

0    327
1     78
dtype: int64

In [64]:
# Attach the outcome by patient id.  A plain `df['VTE'] = y` aligns on the
# index, which is only correct while df and y happen to share the same row
# order and the same 0..n-1 index.
# y was built from data[data['id'].isin(ids)] in that row order, so pairing it
# with the ids of those same rows is exact; a length mismatch raises here
# instead of passing silently.  Assumes each id occurs once -- TODO confirm.
y_ids = data.loc[data['id'].isin(ids), 'id'].values
vte_by_id = pd.Series(y.values, index=y_ids)
df['VTE'] = df['id'].map(vte_by_id)

In [65]:
# Write the merged data set (features plus VTE label) next to the input file,
# without the row index.
output_file = path + 'data_TiC_Onco_large.csv'
df.to_csv(output_file, index=False)

In [93]:
from utils import print_summary

# Feature matrix: every column except the identifiers and the label.
# Index.difference returns the remaining names sorted, so X's columns are in
# sorted order rather than in df's original order.
feature_cols = df.columns.difference(['id', 'excluido', 'VTE'])
X = df[feature_cols]
y = df['VTE']

# Per-variable counts and percentages split by outcome.
# NOTE(review): the recorded table lists estadiGrup I-IV rows with zero counts
# for II-IV although estadiGrup was recoded to 0/1 earlier -- check how
# print_summary interprets this column.
print_summary(X, y)


                      Variable  VTE (n)  VTE (%)  No-VTE (n)  No-VTE (%)
0                       Family        6      7.7          12         3.7
1                          bmi       40     51.3         151        46.2
2                 estadiGrup I       53     67.9         138        42.2
3                estadiGrup II        0      0.0           0         0.0
4               estadiGrup III        0      0.0           0         0.0
5                estadiGrup IV        0      0.0           0         0.0
6   rs2232698 - 0 risk alleles       74     94.9         322        98.5
7    rs2232698 - 1 risk allele        4      5.1           5         1.5
8      rs4524 - 0 risk alleles        1      1.3          22         6.7
9       rs4524 - 1 risk allele       26     33.3         117        35.8
10     rs4524 - 2 risk alleles       51     65.4         188        57.5
11     rs5985 - 0 risk alleles       42     53.8         189        57.8
12      rs5985 - 1 risk allele       30     38.5  