In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import metrics

#os datasets escolhidos foram THROMBIN_training/test
#os datasets foram carregados a partir da módulos Pandas e convertidos em Dataframe

path_train  = "./THROMBIN_training_disguised.csv"
path_test = "./THROMBIN_test_disguised.csv"

dataset_training = pd.read_csv(path_train)
dataset_test = pd.read_csv(path_test)


In [3]:
#foram selecionadas as colunas em comum dos datasets de training e test de forma a que ambos contivessem o mesmo tipo de informação

common_cols = list(set(dataset_training.columns) & set(dataset_test.columns))
dataset_training_eq, dataset_test_eq = dataset_training[common_cols], dataset_test[common_cols]

dataset_training_eq.set_index('MOLECULE', inplace=True)
dataset_test_eq.set_index('MOLECULE', inplace=True)


In [4]:
# --> Pré-processamento dos dados

#Averigou-se se os dados seguiam uma distribuição normal
normality_results_training = stats.normaltest(dataset_training_eq.all())
alpha = 1e-3
if normality_results_training[1]<alpha:
    print("Dataset training --> Não segue distribuição normal")
else:
    print("Dataset test --> Segue distribuição normal")
    
#Os dados não seguem distribuição normal

# --> Normalização 

#Normalização dataset treino
train_norm = preprocessing.normalize(dataset_training_eq, norm="l2")
normalizer_train = preprocessing.Normalizer().fit(dataset_training_eq)
train_n = normalizer_train.transform(dataset_training_eq)
train = pd.DataFrame(train_n, columns = dataset_training_eq.columns , index = dataset_training_eq.index )
print(train.head)

#Normalização dataset teste
test_norm = preprocessing.normalize(dataset_test_eq, norm='l2')
normalizer_test = preprocessing.Normalizer().fit(dataset_test_eq)
test_n = normalizer_test.transform(dataset_test_eq)
test = pd.DataFrame(test_n,columns=dataset_test_eq.columns, index=dataset_test_eq.index)
print(test.head)





Dataset training --> Não segue distribuição normal
<bound method NDFrame.head of           D_6898  D_5685  D_5946  D_256  D_4256    D_1069    D_7731  D_3468  \
MOLECULE                                                                      
M_1          0.0     0.0     0.0    0.0     0.0  0.012517  0.000000     0.0   
M_2          0.0     0.0     0.0    0.0     0.0  0.022213  0.000000     0.0   
M_3          0.0     0.0     0.0    0.0     0.0  0.000000  0.000000     0.0   
M_5          0.0     0.0     0.0    0.0     0.0  0.012734  0.000000     0.0   
M_6          0.0     0.0     0.0    0.0     0.0  0.012748  0.000000     0.0   
M_8          0.0     0.0     0.0    0.0     0.0  0.042808  0.000000     0.0   
M_9          0.0     0.0     0.0    0.0     0.0  0.043447  0.000000     0.0   
M_10         0.0     0.0     0.0    0.0     0.0  0.000000  0.000000     0.0   
M_11         0.0     0.0     0.0    0.0     0.0  0.000000  0.000000     0.0   
M_14         0.0     0.0     0.0    0.0     0.0  0

[1698 rows x 3654 columns]>


In [5]:
# --> Remoção dos valores omissos

imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
imp.fit(train)

training = pd.DataFrame(imp.transform(train), columns = train.columns, index = train.index)
#print(training.head)

# --> Seleção das features mais importantes
#atrasvés de RandonForestRegressor para o dataset de treino 

x_df_train, y_df_train = training.loc[:,training.columns != "Act"], training["Act"]

rf =  RandomForestRegressor(random_state = 0, n_estimators=50)

rf.fit(x_df_train,y_df_train)
results_train = sorted(zip(map(lambda x: round(x,4), rf.feature_importances_),training.columns),reverse=True) #dá um score de importancia a cada feature
columns_train = [i[1] for i in results_train if i[0]>0.0 or i[1]=='Act'] #seleciona as colunas que apresentam um score acima de 0.0
#print(columns)
#restringir o dataset original apenas às features selecionadas anteriormente
df_train = training.loc[:,list(set(training.columns)&set(columns_train))]
print(df_train.shape)


#selecionar do dataset teste apenas as columas em comum com o dataset treino já com as feature selection aplicado
common_cols_ = list(set(df_train.columns) & set(test.columns))
df_test = test[common_cols_]
print(df_test.shape)



(5059, 880)
(1698, 880)


In [7]:
# --> Binarização

#dataset treino
train_bin = pd.DataFrame(preprocessing.Binarizer().transform(df_train), columns = df_train.columns, index = df_train.index)
print(train_bin.head)

#dataset teste
test_bin = pd.DataFrame(preprocessing.Binarizer().transform(df_test), columns = df_test.columns, index = df_test.index)
print(test_bin.head)


<bound method NDFrame.head of           D_2742  D_1433  D_6010  D_7305  D_2999  D_390  D_5615  D_980  \
MOLECULE                                                                 
M_1          1.0     0.0     0.0     0.0     1.0    0.0     0.0    0.0   
M_2          1.0     0.0     0.0     0.0     1.0    0.0     0.0    1.0   
M_3          1.0     0.0     0.0     0.0     0.0    1.0     0.0    0.0   
M_5          1.0     0.0     0.0     0.0     1.0    1.0     0.0    1.0   
M_6          1.0     0.0     0.0     0.0     1.0    1.0     0.0    1.0   
M_8          1.0     0.0     0.0     0.0     0.0    1.0     0.0    1.0   
M_9          1.0     0.0     0.0     0.0     1.0    0.0     0.0    1.0   
M_10         0.0     0.0     0.0     1.0     0.0    1.0     0.0    1.0   
M_11         0.0     0.0     0.0     1.0     0.0    1.0     0.0    1.0   
M_14         0.0     0.0     0.0     0.0     0.0    0.0     0.0    0.0   
M_343        1.0     0.0     0.0     0.0     1.0    0.0     0.0    1.0   
M_3618  

In [8]:
# --> Holdout 
# divisão da amostra em dados de treino e dados de validação
# 30% dos dados foram para o teste, e 70% para o treino


x_df_train,y_df_train = train_bin.loc[:,train_bin.columns !='Act'], df_train['Act']
x_train_dftrain, x_test_dftrain, y_train_dftrain, y_test_dftrain = train_test_split(x_df_train,y_df_train, test_size=0.3)
print(x_train_dftrain.shape,y_train_dftrain.shape)
print(x_test_dftrain.shape, y_test_dftrain.shape)




(3541, 879) (3541,)
(1518, 879) (1518,)


In [10]:
# algoritmo de machine learning --> support vector machine for regression

#dataset de teste para a previsão de valores de Act
x_df_test,y_df_test = test_bin.loc[:,test_bin.columns !='Act'], df_test['Act']


model = SVR(kernel='rbf', C=1e3, gamma=0.1)
model.fit(x_train_dftrain, y_train_dftrain)
y_pred = model.predict(x_df_test)
print(y_pred)

#medidas de erro
print('Mean Absolute Error:', metrics.mean_absolute_error(y_df_test,y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_df_test,y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_df_test,y_pred)))

[0.12587152 0.12586904 0.12586851 ... 0.12586838 0.12586837 0.12587128]
Mean Absolute Error: 0.03923617359235933
Mean Squared Error: 0.0021269808306361346
Root Mean Squared Error: 0.04611920240676474
