In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans

### Regressao linear

In [19]:

df_diabetes = pd.read_csv('./data/Cópia de diabetes_numeric.csv')
print(df_diabetes.head(), '\n')
print(df_diabetes.shape, '\n')
print(df_diabetes.info(), '\n')
print(df_diabetes.describe(), '\n')

    age  deficit  c_peptide
0   5.2     -8.1        4.8
1   8.8    -16.1        4.1
2  10.5     -0.9        5.2
3  10.6     -7.8        5.5
4  10.4    -29.0        5.0 

(43, 3) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        43 non-null     float64
 1   deficit    43 non-null     float64
 2   c_peptide  43 non-null     float64
dtypes: float64(3)
memory usage: 1.1 KB
None 

             age    deficit  c_peptide
count  43.000000  43.000000  43.000000
mean    9.032558  -8.148837   4.746512
std     4.022539   7.123080   0.720565
min     0.900000 -29.000000   3.000000
25%     5.500000 -12.700000   4.450000
50%    10.400000  -7.800000   4.900000
75%    11.850000  -2.000000   5.100000
max    15.600000  -0.200000   6.600000 



In [30]:
xtreino, xteste, ytreino, yteste = train_test_split(df_diabetes.drop('c_peptide', axis=1),df_diabetes['c_peptide'], test_size=0.37, random_state=5762)

lm = LinearRegression()
lm.fit(xtreino, ytreino)
predict = lm.predict(xteste)

print('R2:', r2_score(yteste, predict))
print('MSE:', mean_squared_error(yteste, predict))
print('MAE:', mean_absolute_error(yteste, predict))

R2: 0.016814158185133876
MSE: 0.4405517496663409
MAE: 0.5322924084179863


### SVM

In [18]:
df_blood = pd.read_csv('./data/Cópia de bloodtransf.csv')
print(df_blood.head(), '\n')
print(df_blood.shape, '\n')
print(df_blood.info(), '\n')
print(df_blood.describe(), '\n')

   V1  V2     V3  V4  Class
0   2  50  12500  98      2
1   0  13   3250  28      2
2   1  16   4000  35      2
3   2  20   5000  45      2
4   1  24   6000  77      1 

(748, 5) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   V1      748 non-null    int64
 1   V2      748 non-null    int64
 2   V3      748 non-null    int64
 3   V4      748 non-null    int64
 4   Class   748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB
None 

               V1          V2            V3          V4       Class
count  748.000000  748.000000    748.000000  748.000000  748.000000
mean     9.506684    5.514706   1378.676471   34.282086    1.237968
std      8.095396    5.839307   1459.826781   24.376714    0.426124
min      0.000000    1.000000    250.000000    2.000000    1.000000
25%      2.750000    2.000000    500.000000   16.000000    1.000000
50%      7.00000

In [36]:
xtreino, xteste, ytreino, yteste = train_test_split(df_blood.drop('Class', axis=1),df_blood['Class'], test_size=0.37, random_state=5762)

baseline_preds = np.random.choice([0,1], size = len(yteste))

svm = SVC()
svm.fit(xtreino, ytreino)
predict = svm.predict(xteste)

print('accuracy:', accuracy_score(yteste, predict))
print('recall:', recall_score(yteste, predict))
print('precision:', precision_score(yteste, predict))
print('f1:', f1_score(yteste, predict))
print('AUCROC', roc_auc_score(yteste, predict))

accuracy: 0.7870036101083032
recall: 0.981651376146789
precision: 0.7955390334572491
f1: 0.8788501026694046
AUCROC 0.5247239931581403


In [34]:
print(classification_report(yteste, predict))

              precision    recall  f1-score   support

           1       0.80      0.98      0.88       218
           2       0.50      0.07      0.12        59

    accuracy                           0.79       277
   macro avg       0.65      0.52      0.50       277
weighted avg       0.73      0.79      0.72       277



### kmeans

In [17]:
df_wine = pd.read_csv('./data/Cópia de wine.csv')
print(df_wine.head(), '\n')
print(df_wine.shape, '\n')
print(df_wine.info(), '\n')
print(df_wine.describe(), '\n')

   class  Alcohol  Malic_acid   Ash  Alcalinity_of_ash  Magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   Total_phenols  Flavanoids  Nonflavanoid_phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color_intensity   Hue  OD280%2FOD315_of_diluted_wines  Proline  
0             5.64  1.04                            3.92     1065  
1             4.38  1.

In [46]:
yteste

141    3
129    2
101    2
95     2
175    3
      ..
128    2
155    3
113    2
154    3
64     2
Name: class, Length: 66, dtype: int64

In [48]:
xtreino, xteste, ytreino, yteste = train_test_split(df_wine.drop('class', axis=1),df_wine['class'], test_size=0.37, random_state=5762)

kmeans = KMeans(n_clusters=3, random_state=5762)
kmeans.fit(xtreino)
predict = kmeans.predict(xteste)

print('Coeficiente de Silhueta\n', silhouette_score(xteste, predict)) 
print('\nDavies-Bouldin Score\n', davies_bouldin_score(xteste, predict)) 
print('\nMutual information\n', mutual_info_score(yteste, predict))

Coeficiente de Silhueta
 0.5995566852797415

Davies-Bouldin Score
 0.49625702117326337

Mutual information
 0.4901210809010372


In [23]:
kmeans.cluster_centers_

array([[1.02127660e+00, 1.38044681e+01, 1.88340426e+00, 2.42617021e+00,
        1.70234043e+01, 1.05510638e+02, 2.86723404e+00, 3.01425532e+00,
        2.85319149e-01, 1.91042553e+00, 5.70255319e+00, 1.07829787e+00,
        3.11404255e+00, 1.19514894e+03],
       [2.27536232e+00, 1.25166667e+01, 2.49420290e+00, 2.28855072e+00,
        2.08231884e+01, 9.23478261e+01, 2.07072464e+00, 1.75840580e+00,
        3.90144928e-01, 1.45188406e+00, 4.08695651e+00, 9.41159420e-01,
        2.49072464e+00, 4.58231884e+02],
       [2.25806452e+00, 1.29298387e+01, 2.50403226e+00, 2.40806452e+00,
        1.98903226e+01, 1.03596774e+02, 2.11112903e+00, 1.58403226e+00,
        3.88387097e-01, 1.50338710e+00, 5.65032258e+00, 8.83967742e-01,
        2.36548387e+00, 7.28338710e+02]])