In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Read the Parkinson's disease CSV and discard its 'name' column in one step
df = pd.read_csv('../Datasets/parkinsons.csv').drop(columns='name')

# Coerce every column label to str
df.columns = df.columns.astype(str)

# Target vector (y) is the 'status' column; the feature matrix (X) is
# every remaining column
y = df['status']
X = df.drop(columns='status')

# Identify non-numeric (object-dtype) feature columns
non_numeric_columns = X.select_dtypes(include=['object']).columns

# One-hot encode the non-numeric columns, if there are any.
# The emptiness check uses len() because Index.any() evaluates the truthiness
# of the column *labels* themselves, not whether the index has entries.
if len(non_numeric_columns) > 0:
    encoder = OneHotEncoder()
    encoded_columns = pd.DataFrame(
        # Evaluated first, so the encoder is fitted before its feature
        # names are requested below.
        encoder.fit_transform(X[non_numeric_columns]).toarray(),
        # Use the encoder's string feature names instead of the default
        # integer labels (0, 1, 2, ...): mixing str and int column labels
        # would undo the "all column names are strings" guarantee above and
        # is rejected by recent scikit-learn versions when fitting on a
        # DataFrame.
        columns=encoder.get_feature_names_out(),
        # Reuse X's index so the index-based join below aligns row-for-row
        # even if X does not carry a default RangeIndex.
        index=X.index,
    )

    # Drop non-numeric columns and add encoded columns
    X = X.drop(non_numeric_columns, axis=1)
    X = X.join(encoded_columns)

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Build a 100-tree random forest (seeded for repeatability) and train it on
# the scaled training data; fit() returns the fitted estimator itself
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

# Report the share of test rows predicted correctly, as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Persist the trained classifier
joblib.dump(model, '../Models/parkinsons_model.pkl')

# Persist the fitted scaler in its own file, separate from the model
joblib.dump(scaler, '../Models/parkinsons_scaler.pkl')


Accuracy: 94.87%


['../Models/parkinsons_scaler.pkl']