In [1]:
import pandas as pd

# carregar arquivo de dados em um DataFrame
df = pd.read_csv('parkinsons.data')

# exibir as primeiras linhas do DataFrame
print(df.head())


             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0      0.0654

In [3]:
# Verificar o número de linhas e colunas do DataFrame
print('Número de linhas e colunas do DataFrame:', df.shape)

# Verificar se existem valores ausentes ou duplicados nos dados
print('Valores ausentes em cada coluna:')
print(df.isnull().sum())

print('Valores duplicados:', df.duplicated().sum())

# Verificar a distribuição das classes no conjunto de dados
print('Distribuição das classes:')
print(df.status.value_counts())

# Verificar a correlação entre as variáveis do conjunto de dados
corr = df.corr()
print('Correlação entre as variáveis:')
print(corr)

# Identificar possíveis outliers nos dados
for col in df.columns:
    if col == 'status' or col == 'name':
        continue
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    min_outlier = q1 - 1.5 * iqr
    max_outlier = q3 + 1.5 * iqr
    outliers = df[(df[col] < min_outlier) | (df[col] > max_outlier)]
    print('Possíveis outliers em {}:'.format(col))
    print(outliers)

# Realizar a normalização das variáveis se necessário
# Vamos usar a biblioteca Scikit-learn para realizar a normalização
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])
print('Dados normalizados:')
print(df.head())


Número de linhas e colunas do DataFrame: (195, 24)
Valores ausentes em cada coluna:
name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64
Valores duplicados: 0
Distribuição das classes:
1    147
0     48
Name: status, dtype: int64
Correlação entre as variáveis:
                  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
MDVP:Fo(Hz)          1.000000      0.400985      0.596546       -0.118003   
MDVP:Fhi(Hz)         0.400985      1.000000      0.084951        0.102086   
MDVP:Flo(Hz)        

  corr = df.corr()


Possíveis outliers em Shimmer:APQ5:
               name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
1    phon_R01_S01_2      122.400       148.650       113.819         0.00968   
4    phon_R01_S01_5      116.014       141.781       110.655         0.01284   
87   phon_R01_S21_4      176.281       227.381       125.610         0.00520   
88   phon_R01_S21_5      173.898       211.350        74.677         0.00448   
90   phon_R01_S21_7      166.605       206.008        78.032         0.00742   
91   phon_R01_S22_1      151.955       163.335       147.226         0.00419   
100  phon_R01_S24_4      125.641       141.068       116.346         0.03316   
102  phon_R01_S24_6      139.224       586.567        66.157         0.03011   
146  phon_R01_S35_1      169.774       191.759       151.451         0.01568   
148  phon_R01_S35_3      188.620       216.302       165.982         0.01719   
149  phon_R01_S35_4      202.632       565.740       177.258         0.01627   
151 

  df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])


In [4]:
print(df.head())

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1     0.184308      0.112592      0.054815        0.195680   
1  phon_R01_S01_2     0.198327      0.094930      0.278323        0.254130   
2  phon_R01_S01_3     0.165039      0.059128      0.265288        0.280178   
3  phon_R01_S01_4     0.165004      0.072927      0.264200        0.263342   
4  phon_R01_S01_5     0.161150      0.080909      0.260107        0.354511   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0          0.249012  0.145472  0.247588    0.145288      0.312215  ...   
1          0.288538  0.191233  0.323687    0.191042      0.472887  ...   
2          0.328063  0.229287  0.369239    0.229411      0.390634  ...   
3          0.328063  0.209056  0.324759    0.208862      0.414278  ...   
4          0.407115  0.282755  0.437299    0.282870      0.499452  ...   

   Shimmer:DDA       NHR       HNR  status      RPDE       DFA   spread1  \
0     0.33

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle


In [16]:
df = pd.read_csv('parkinsons.data')
df = df.drop('name', axis=1)
X = df.drop(['status'], axis=1)
y = df['status']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [25]:
mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# save model to file
filename = 'mlp_model.sav'
pickle.dump(mlp, open(filename, 'wb'))


In [21]:
y_pred = mlp.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print('Acurácia:', accuracy)
print('Matriz de Confusão:\n', conf_matrix)


Acurácia: 0.9322033898305084
Matriz de Confusão:
 [[12  3]
 [ 1 43]]


In [26]:
# load model from file
loaded_model = pickle.load(open(filename, 'rb'))

# use loaded model to make predictions
y_pred = loaded_model.predict(X_test)

In [28]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=int64)