# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

## Extract raw data and select desired columns

In [80]:
# Read the semicolon-separated raw table, then discard the two identifier columns.
dados = pd.read_csv('Dados/dados originais.csv', sep=';')
dados = dados.drop(columns=['id', 'Unique_id'])
# Keep only the rows whose `vaccinedose_PLACEBO` value is 0
# (presumably the non-placebo participants -- confirm against the data dictionary).
dados = dados[dados['vaccinedose_PLACEBO'].eq(0)]

def convert(x):
    """Convert a raw cell value to ``float``.

    Strings may use a comma as the decimal separator (e.g. ``"1,5"``); the
    comma is replaced by a dot before parsing. Any non-string value is passed
    straight to ``float`` so that real floats keep their fractional part
    (going through ``int`` first would truncate ``3.7`` to ``3.0``) and NaN
    stays NaN.

    Parameters
    ----------
    x : str or number
        The value to convert.

    Returns
    -------
    float
        The numeric value of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is a string that is not a valid number.
    TypeError
        If ``x`` is neither a string nor convertible by ``float``.
    """
    if isinstance(x, str):
        # Decimal comma -> decimal point; plain integer strings pass through unchanged.
        return float(x.replace(',', '.'))
    return float(x)
# Convert every cell to float. `dados` is the result of a boolean-mask row
# selection, so work on an explicit copy: assigning columns into such a
# selection can trigger pandas' SettingWithCopyWarning.
dados = dados.copy()
for col in dados.columns:
    dados[col] = dados[col].apply(convert)

# Positional split: X takes the columns from position 3 up to (not including)
# the 23rd-from-last, Y takes the last 8 columns.
# NOTE(review): these offsets depend on the raw file's column order -- confirm.
# reset_index(drop=True) returns new frames renumbered 0..n-1 after the row
# filter, detached from `dados`, so later edits to X/Y do not touch a slice.
X = dados.iloc[:, 3:-23].reset_index(drop=True)
Y = dados.iloc[:, -8:].reset_index(drop=True)
colsY = list(Y.columns)

## Remove genes with low variance

In [81]:
# Minimum variance a column of X must reach to be kept.
tol = 3.0
# Collect the low-variance columns first and drop them in a single call.
# This avoids one in-place drop per column on a frame that was derived by
# slicing, while keeping exactly the same per-column `var() < tol` test
# (a NaN variance compares False and is therefore kept, as before).
low_variance_cols = [col for col in X.columns if X[col].var() < tol]
X = X.drop(columns=low_variance_cols)
colsX = list(X.columns)
print(f"Shape X: {X.shape}")
print(f"Shape Y: {Y.shape}")

Shape X: (363, 45)
Shape Y: (363, 8)


## De-log and normalize

In [82]:
# Undo the log2 scale (2 ** value), rescale each column with MinMaxScaler,
# and wrap the resulting array back into a DataFrame carrying the kept
# column labels.
min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(min_max_scaler.fit_transform(2 ** X), columns=colsX)
X

Unnamed: 0,CCL3,CCL11,CCR7,CD209,CD274,CLEC7A,CXCL9,CXCL13,DSE,EGF,...,NLRP6,OAS1,SOCS1,SPP1,TAGAP,TAP1,TBX21,TGFBR2,TLR5,TLR9
0,0.00000,0.002863,0.001858,0.010713,0.000000,0.477069,0.000000,0.020403,0.006396,0.006442,...,0.000000,0.013069,0.000007,0.001726,0.509594,0.000000,0.000000,0.376522,0.002026,0.041214
1,0.00000,0.004998,0.000825,0.015367,0.010798,0.665187,0.000000,0.018910,0.008878,0.008996,...,0.000000,0.006163,0.000017,0.001378,0.566437,0.506104,0.000000,0.463134,0.004960,0.000000
2,0.00000,0.002838,0.001156,0.017816,0.006171,0.782126,0.000000,0.029419,0.009923,0.003693,...,0.000000,0.006050,0.000007,0.000000,0.680109,0.612115,0.000000,0.723574,0.000000,0.053821
3,0.00000,0.003860,0.001193,0.019114,0.008794,0.821578,0.000000,0.020714,0.010402,0.008644,...,0.000000,0.007127,0.000017,0.000000,0.689731,0.378681,0.000000,0.623864,0.001889,0.056133
4,0.03716,0.006459,0.001534,0.006929,0.021788,0.557538,0.000000,0.018062,0.005926,0.000000,...,0.000000,0.004304,0.000035,0.000000,0.699488,0.771748,0.000000,0.384696,0.002498,0.032430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,0.00000,0.000000,0.027990,0.000000,0.154117,0.012878,0.032574,0.000000,0.000000,0.015914,...,0.387241,0.118100,0.095382,0.129208,0.034803,0.103892,0.022292,0.000000,0.672965,0.000000
359,0.00000,0.000000,0.000000,0.000000,0.246559,0.154259,0.049841,0.000000,0.000000,0.026680,...,0.000000,0.190450,0.147616,0.192866,0.104508,0.115395,0.010068,0.000000,0.000000,0.000000
360,0.00000,0.000000,0.041063,0.000000,0.317087,0.056341,0.090657,0.000000,0.000000,0.073097,...,0.463084,0.382954,0.233251,0.256149,0.083163,0.163298,0.034180,0.000000,0.534959,0.000000
361,0.00000,0.000000,0.000000,0.000000,0.204087,0.213337,0.037979,0.000000,0.000000,0.023567,...,0.000000,0.157593,0.114220,0.162820,0.099123,0.105468,0.009135,0.606462,0.000000,0.000000


## Export

In [84]:
# Put the scaled features and the targets side by side, restore the column
# labels (features first, then targets), and write the result to disk.
dados = pd.concat(
    objs=[X, Y],
    axis="columns",
    ignore_index=True,
)
dados.columns = [*colsX, *colsY]
dados.to_csv(path_or_buf='Dados/dados tratados.csv', index=False)
dados

Unnamed: 0,CCL3,CCL11,CCR7,CD209,CD274,CLEC7A,CXCL9,CXCL13,DSE,EGF,...,TLR5,TLR9,Arthralgia,Arthritis,Chills,Fatigue,Fever,Headache,Myalgia,Nausea
0,0.00000,0.002863,0.001858,0.010713,0.000000,0.477069,0.000000,0.020403,0.006396,0.006442,...,0.002026,0.041214,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.00000,0.004998,0.000825,0.015367,0.010798,0.665187,0.000000,0.018910,0.008878,0.008996,...,0.004960,0.000000,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2,0.00000,0.002838,0.001156,0.017816,0.006171,0.782126,0.000000,0.029419,0.009923,0.003693,...,0.000000,0.053821,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
3,0.00000,0.003860,0.001193,0.019114,0.008794,0.821578,0.000000,0.020714,0.010402,0.008644,...,0.001889,0.056133,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
4,0.03716,0.006459,0.001534,0.006929,0.021788,0.557538,0.000000,0.018062,0.005926,0.000000,...,0.002498,0.032430,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,0.00000,0.000000,0.027990,0.000000,0.154117,0.012878,0.032574,0.000000,0.000000,0.015914,...,0.672965,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
359,0.00000,0.000000,0.000000,0.000000,0.246559,0.154259,0.049841,0.000000,0.000000,0.026680,...,0.000000,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
360,0.00000,0.000000,0.041063,0.000000,0.317087,0.056341,0.090657,0.000000,0.000000,0.073097,...,0.534959,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,0.00000,0.000000,0.000000,0.000000,0.204087,0.213337,0.037979,0.000000,0.000000,0.023567,...,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
