# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

## Extract raw data and select desired columns

In [93]:
# Load the semicolon-separated raw table and discard the two identifier columns.
dados = pd.read_csv('Dados/dados originais.csv', sep=';').drop(['id', 'Unique_id'], axis=1)
# Keep only the rows where `vaccinedose_PLACEBO` equals 0. The explicit .copy()
# makes the filtered result an independent frame, so assigning columns on it
# afterwards does not trigger pandas' SettingWithCopyWarning.
dados = dados[dados['vaccinedose_PLACEBO'] == 0].copy()

def convert(x):
    """Coerce a single raw cell value to ``float``.

    Strings may use a decimal comma (e.g. ``'1,5'``); the comma is replaced by
    a dot before parsing. Any non-string value is handed directly to
    ``float``, so values that are already floats keep their fractional part
    (going through ``int`` first would truncate ``1.7`` to ``1.0``) and NaN
    stays NaN.

    Parameters
    ----------
    x : str or number
        The value to convert.

    Returns
    -------
    float
        The numeric value of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is a string that is not a valid number after the comma
        replacement.
    TypeError
        If ``x`` is of a type ``float`` cannot handle (e.g. ``None``).
    """
    if isinstance(x, str):
        # Decimal comma -> decimal dot; plain integer strings parse unchanged.
        return float(x.replace(',', '.'))
    return float(x)
# Pass every column through convert() so each cell ends up as a float.
for name in dados.columns:
    dados[name] = dados[name].apply(convert)

# Positional split of the table: X takes the columns from position 3 up to
# (but excluding) the last 23; Y takes the last 8 columns.
X = dados.iloc[:, 3:-23]
Y = dados.iloc[:, -8:]
colsX = X.columns.tolist()
colsY = Y.columns.tolist()

# Renumber the rows of both frames 0..n-1.
Y.index = range(len(Y))
X.index = range(len(X))

print(f"Shape X: {X.shape}")
print(f"Shape Y: {Y.shape}")

Shape X: (363, 141)
Shape Y: (363, 8)


## Remove genes with low variance

In [81]:
# Variance threshold: columns of X whose variance is below this are removed.
tol = 3.0

# Collect the offending columns first, then remove them in a single drop.
low_variance = [name for name in X.columns if X[name].var() < tol]
X = X.drop(columns=low_variance)

# Keep the column-name list in step with the reduced frame.
colsX = X.columns.tolist()

print(f"Shape X: {X.shape}")
print(f"Shape Y: {Y.shape}")

Shape X: (363, 45)
Shape Y: (363, 8)


## De-log and normalize

In [94]:
# Undo the log transform by raising 2 to each value of X.
delogged = 2 ** X

# Fit a min-max scaler on the de-logged values and rescale them; the scaler
# returns a plain array, so it is wrapped back into a DataFrame labelled
# with colsX.
min_max_scaler = preprocessing.MinMaxScaler()
scaled = min_max_scaler.fit_transform(delogged)
X = pd.DataFrame(scaled)
X.columns = colsX
X

Unnamed: 0,BCL2,BLR1,BMP6,BPI,CAMTA1,CASP8,CCL2,CCL3,CCL4,CCL5,...,TLR10,TNF,TNFRSF1A,TNFRSF1B,TNFRSF18,TNIP1,TWIST1,VEGF,ZNF331,ZNF532
0,0.166472,0.063108,0.008375,0.048233,0.000000,0.480825,0.000000,0.00000,0.135281,0.481678,...,0.000000,0.008869,0.324297,0.667481,0.004759,0.010257,0.0,0.000000,0.055700,0.000000
1,0.088291,0.037261,0.005805,0.040766,0.000000,0.487687,0.000000,0.00000,0.111837,0.279471,...,0.000000,0.006003,0.478663,0.681901,0.005203,0.070248,0.0,0.000000,0.000000,0.000000
2,0.177581,0.068813,0.005525,0.060332,0.000000,0.586110,0.000000,0.00000,0.132248,0.301811,...,0.000000,0.010378,0.520902,0.972141,0.005203,0.064849,0.0,0.000000,0.000000,0.000000
3,0.130992,0.051503,0.006319,0.060332,0.000000,0.577893,0.000000,0.00000,0.095053,0.264261,...,0.000000,0.008225,0.517246,0.979033,0.006402,0.052828,0.0,0.000000,0.016664,0.000000
4,0.195444,0.039247,0.005003,0.034444,0.000000,0.477430,0.000000,0.03716,0.101948,0.351940,...,0.000000,0.007062,0.535781,0.630461,0.003776,0.054406,0.0,0.000000,0.004568,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,0.189324,0.047526,0.003169,0.008871,0.072400,0.167958,0.000000,0.00000,0.165606,0.067996,...,0.000000,0.009489,0.161480,0.019325,0.028807,0.040132,0.0,0.024561,0.076507,0.014618
359,0.177581,0.053414,0.005147,0.011862,0.000000,0.142688,0.001116,0.00000,0.225647,0.005783,...,0.061217,0.010691,0.135456,0.076385,0.000000,0.055618,0.0,0.043344,0.000000,0.000000
360,0.000000,0.063565,0.007973,0.015808,0.099507,0.120082,0.000000,0.00000,0.260995,0.126339,...,0.374705,0.012484,0.119474,0.095182,0.059879,0.134896,0.0,0.093799,0.088757,0.026485
361,0.177581,0.045816,0.004249,0.010189,0.000000,0.161872,0.000000,0.00000,0.202173,0.003565,...,0.059347,0.011598,0.134463,0.033280,0.000000,0.053611,0.0,0.029375,0.000000,0.000000


## Export

In [95]:
# Place X and Y side by side; ignore_index discards the column labels, so
# they are restored from the two saved name lists right after.
merged = pd.concat([X, Y], axis=1, ignore_index=True)
merged.columns = [*colsX, *colsY]

# Write the processed table to disk without the row index.
merged.to_csv('Dados/dados tratados.csv', index=False)

dados = merged
dados

Unnamed: 0,BCL2,BLR1,BMP6,BPI,CAMTA1,CASP8,CCL2,CCL3,CCL4,CCL5,...,ZNF331,ZNF532,Arthralgia,Arthritis,Chills,Fatigue,Fever,Headache,Myalgia,Nausea
0,0.166472,0.063108,0.008375,0.048233,0.000000,0.480825,0.000000,0.00000,0.135281,0.481678,...,0.055700,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.088291,0.037261,0.005805,0.040766,0.000000,0.487687,0.000000,0.00000,0.111837,0.279471,...,0.000000,0.000000,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2,0.177581,0.068813,0.005525,0.060332,0.000000,0.586110,0.000000,0.00000,0.132248,0.301811,...,0.000000,0.000000,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
3,0.130992,0.051503,0.006319,0.060332,0.000000,0.577893,0.000000,0.00000,0.095053,0.264261,...,0.016664,0.000000,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
4,0.195444,0.039247,0.005003,0.034444,0.000000,0.477430,0.000000,0.03716,0.101948,0.351940,...,0.004568,0.000000,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,0.189324,0.047526,0.003169,0.008871,0.072400,0.167958,0.000000,0.00000,0.165606,0.067996,...,0.076507,0.014618,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
359,0.177581,0.053414,0.005147,0.011862,0.000000,0.142688,0.001116,0.00000,0.225647,0.005783,...,0.000000,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
360,0.000000,0.063565,0.007973,0.015808,0.099507,0.120082,0.000000,0.00000,0.260995,0.126339,...,0.088757,0.026485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,0.177581,0.045816,0.004249,0.010189,0.000000,0.161872,0.000000,0.00000,0.202173,0.003565,...,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
