In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from joblib import dump, load

# Load the dataset (replace the path below with the location of your own CSV file).
# The dataset must contain a 'symptoms' column (free text) and a 'disease' column (label).
df = pd.read_csv('/content/mydrive/dataset.csv')

# Fail early with an explicit message if the expected columns are absent,
# instead of a bare KeyError from the column lookup further down.
required_columns = ['symptoms', 'disease']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# Rows lacking either the symptom text or the label cannot be used for
# training: CountVectorizer raises on NaN documents, and a NaN label is not a
# usable target. They are dropped into a separate frame so that `df` still
# holds the data exactly as loaded.
labelled_rows = df.dropna(subset=required_columns)

# Split the dataset into training and testing sets (80% / 20%, fixed seed so
# the split is reproducible). Symptoms are coerced to str because the
# vectorizer only accepts text documents.
train_data, test_data, train_labels, test_labels = train_test_split(
    labelled_rows['symptoms'].astype(str),
    labelled_rows['disease'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features feeding a Random Forest classifier. Keeping both steps
# in one pipeline means the saved model accepts raw symptom strings directly.
model = make_pipeline(CountVectorizer(), RandomForestClassifier(random_state=42))

# Train the model
model.fit(train_data, train_labels)

# Make predictions on the held-out test set
predictions = model.predict(test_data)

# Evaluate the model
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics (per-class precision / recall / F1)
print("Classification Report:\n", classification_report(test_labels, predictions))

# Save the fitted pipeline (vectorizer + classifier) to a file in the current
# working directory.
model_filename = 'disease_prediction_model.joblib'
dump(model, model_filename)
