In [1]:
# Import the modules
import sqlite3

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `diseases.sqlite` database into a Pandas DataFrame.

In [18]:
# Connect to the SQLite database
conn = sqlite3.connect('../Project 4/Data/diseases.sqlite')

# Read the whole `diseases` table into a DataFrame.
# The read sits inside try/finally so the connection is always released,
# even when the query raises (for example, if the table does not exist).
query = "SELECT * FROM diseases"
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Review the DataFrame
print(df.head(11))
print("Total number of rows:", df.shape[0])

    acidity  palpitations  anxiety  bladder_discomfort  \
0         0             0        0                   0   
1         0             0        0                   0   
2         0             0        0                   0   
3         0             0        0                   0   
4         0             0        0                   0   
5         0             0        0                   0   
6         0             0        0                   0   
7         0             0        0                   0   
8         0             0        0                   0   
9         0             0        0                   0   
10        0             0        0                   0   

    blurred_and_distorted_vision  breathlessness  burning_micturition  \
0                              0               0                    0   
1                              0               0                    0   
2                              0               0                    0   
3          

### Step 2: Create the labels set (`y`) from the `Disease` column, and then create the features (`X`) DataFrame from the remaining columns.

In [34]:
# Tally how many distinct labels the 'Disease' column holds
# (missing values, if any, are excluded from the tally)
num_diseases = len(df['Disease'].dropna().unique())

# Report the tally
print("Number of unique diseases:", num_diseases)

Number of unique diseases: 10


In [35]:
# Features (X): every column except 'Disease'. Labels (y): the 'Disease' column.
X = df.drop("Disease", axis=1)
y = df.loc[:, "Disease"]

In [36]:
# Peek at the first dozen labels, then report how many labels there are in total
print(y.iloc[:12])
print("Total number of rows:", len(y))

0     6
1     6
2     6
3     6
4     6
5     6
6     6
7     6
8     6
9     6
10    0
11    0
Name: Disease, dtype: int64
Total number of rows: 1200


In [37]:
# Preview the first five rows of the feature DataFrame
X.head(n=5)

Unnamed: 0,acidity,palpitations,anxiety,bladder_discomfort,blurred_and_distorted_vision,breathlessness,burning_micturition,chest_pain,chills,congestion,...,slurred_speech,spotting_ urination,stiff_neck,stomach_pain,sweating,swelled_lymph_nodes,throat_irritation,visual_disturbances,vomiting,watering_from_eyes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [38]:
# Split the data using train_test_split (already imported in the notebook's
# first cell, so it is not re-imported here).
# 20% of the rows are held out for testing; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): consider passing stratify=y so both sets keep the same class
# proportions — not applied here because it would change the recorded outputs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (960, 52)
X_test shape: (240, 52)
y_train shape: (960,)
y_test shape: (240,)


---

## Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [39]:
# Instantiate the Logistic Regression model with scikit-learn's default settings
# (LogisticRegression is already imported in the notebook's first cell)
logistic_model = LogisticRegression()

# Fit the model using training data; the fitted estimator is the cell's output
logistic_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [40]:
# Predict a disease label for every row of the held-out feature data
y_pred = logistic_model.predict(X_test)

# Show each prediction next to its true label, with rows renumbered from 0
pd.DataFrame(
    {"Prediction": y_pred, "Actual": y_test}
).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,7,7
1,3,3
2,6,6
3,2,2
4,3,3
...,...,...
235,7,7
236,5,5
237,9,9
238,2,2


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [41]:
# Overall accuracy: the fraction of test rows whose predicted label matches
# the true label. (The metric functions are imported in the first cell.)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a confusion matrix for the model.
# Per the scikit-learn convention, rows are true labels and columns are
# predicted labels, so correct predictions fall on the diagonal.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 1.0
Confusion Matrix:
[[21  0  0  0  0  0  0  0  0  0]
 [ 0 26  0  0  0  0  0  0  0  0]
 [ 0  0 24  0  0  0  0  0  0  0]
 [ 0  0  0 23  0  0  0  0  0  0]
 [ 0  0  0  0 23  0  0  0  0  0]
 [ 0  0  0  0  0 33  0  0  0  0]
 [ 0  0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0  0 22  0  0]
 [ 0  0  0  0  0  0  0  0 24  0]
 [ 0  0  0  0  0  0  0  0  0 25]]


In [42]:
# Build the report's row names from the actual label values rather than from
# positions 0..n-1, and take them from the full label set `y` so a class that
# happens to be absent from the test split still gets a correctly named row.
class_labels = np.unique(y)  # sorted distinct label values
num_classes = len(class_labels)
target_names = [f"Disease {label}" for label in class_labels]

# Passing `labels` pins each name in `target_names` to its own class value.
print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names))

              precision    recall  f1-score   support

   Disease 0       1.00      1.00      1.00        21
   Disease 1       1.00      1.00      1.00        26
   Disease 2       1.00      1.00      1.00        24
   Disease 3       1.00      1.00      1.00        23
   Disease 4       1.00      1.00      1.00        23
   Disease 5       1.00      1.00      1.00        33
   Disease 6       1.00      1.00      1.00        19
   Disease 7       1.00      1.00      1.00        22
   Disease 8       1.00      1.00      1.00        24
   Disease 9       1.00      1.00      1.00        25

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240



### Step 4: Answer the following question.

**Question:**  How effectively does the logistic regression model identify the appropriate disease based on the symptoms provided in the dataset?

**Answer:** 

The logistic regression model identifies the disease perfectly on the held-out test set: every class (Disease 0 through Disease 9) achieves a precision, recall, and F1-score of 1.00, and the confusion matrix has no off-diagonal entries.

Additionally, the overall accuracy of the model is 1.00, meaning that all 240 test samples were classified correctly.

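Therefore, based on the outcome, we can conclude that the logistic regression model performs very well in identifying the appropriate disease based on the symptoms provided in the dataset. A perfect score is unusual, however, and should be interpreted with caution: it may mean that each disease has a distinctive symptom pattern that makes the classes easy to separate, or that identical symptom rows appear in both the training and testing sets. Checking the data for duplicate rows and evaluating with cross-validation would confirm how well the model generalizes to unseen cases.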


---