In [34]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
from google.colab import files
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE,SMOTENC
from tqdm import tqdm

# Importing the ML algorithms
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from dataclasses import dataclass

# Importing accuracy metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [2]:
#Data Reading
Original_df=pd.read_csv("diabetic_data_uci.csv")

In [3]:
Copy_df = Original_df.copy()

In [4]:
Copy_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
#Code block to analyze the improtance of features
print(Copy_df.describe())

#Printing the data types and missing values
print(Copy_df.info())

#Getting the unique values for each column
print("\nUnique Values in Each Column:")
for col in Copy_df.columns:
    print(f"\nColumn: {col}")
    print(Copy_df[col].value_counts())

       encounter_id   patient_nbr  admission_type_id  \
count  2.779300e+04  2.779300e+04       27793.000000   
mean   5.250293e+07  2.566143e+07           2.320836   
std    2.417376e+07  3.197322e+07           1.732156   
min    1.252200e+04  1.350000e+02           1.000000   
25%    3.365125e+07  3.016755e+06           1.000000   
50%    5.467619e+07  1.277532e+07           2.000000   
75%    7.310811e+07  2.580014e+07           3.000000   
max    9.107977e+07  1.152184e+08           8.000000   

       discharge_disposition_id  admission_source_id  time_in_hospital  \
count              27793.000000         27793.000000      27793.000000   
mean                   5.337603             6.820350          4.709711   
std                    6.956722             5.057342          3.142485   
min                    1.000000             1.000000          1.000000   
25%                    1.000000             2.000000          2.000000   
50%                    1.000000             7.00000




1.   **Gender has Unknown/Invalid values as per our summary**
2.   **Missing values have been represented with ? which needs to be processed**


In [6]:
Copy_df.dtypes

encounter_id                  int64
patient_nbr                   int64
race                         object
gender                       object
age                          object
weight                       object
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
payer_code                   object
medical_specialty            object
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                       object
diag_2                       object
diag_3                       object
number_diagnoses            float64
max_glu_serum                object
A1Cresult                    object
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide              



*   **The categorical values will have to be encoded for the model**
*   **Handling the null values by coverting them to NaN and then checking columns for more than required (30%) null values**



In [7]:
# replacing ? and invalid values with with Nan
Copy_df = Copy_df.replace(["?", "Unknown/Invalid"], np.nan)
# Checking for % of null values
(Copy_df.isna().sum()*100/len(Copy_df))[(Copy_df.isna().sum()*100/len(Copy_df))>30]

weight               96.679020
payer_code           94.066851
medical_specialty    39.402008
dtype: float64

**Dropping the columns Weight, Payer_Code and Medical_Speciality**



In [8]:
Copy_df=Copy_df.drop(['weight','payer_code','medical_specialty'],axis=1)

**Checking the percentage of null values**

In [9]:
null_percentage = (Copy_df.isna().sum() * 100 / len(Copy_df))
null_percentage = null_percentage[null_percentage > 0]
print(null_percentage)

race                        1.781024
diag_1                      0.028784
diag_2                      0.572086
diag_3                      2.547404
number_diagnoses            0.003598
max_glu_serum               0.003598
A1Cresult                   0.003598
metformin                   0.003598
repaglinide                 0.003598
nateglinide                 0.003598
chlorpropamide              0.003598
glimepiride                 0.003598
acetohexamide               0.003598
glipizide                   0.003598
glyburide                   0.003598
tolbutamide                 0.003598
pioglitazone                0.003598
rosiglitazone               0.003598
acarbose                    0.003598
miglitol                    0.003598
troglitazone                0.003598
tolazamide                  0.003598
examide                     0.003598
citoglipton                 0.003598
insulin                     0.003598
glyburide-metformin         0.003598
glipizide-metformin         0.003598
g


*   All the columns do not have more than 3% missing values
*   There are only two options either imputing them or dropping them. Imputing however is not recommended in the healthcare setting. thus if the percentage of loss of dropping is less, we can decide to drop them
*   For, this we have to find the % loss for rows to see the impact of dropping them








In [10]:
(Copy_df.shape[0]-Copy_df.dropna(axis=0).shape[0])*100/Copy_df.dropna(axis=0).shape[0]

4.516395908543923

**Since the loss is just 3% we can proceed with dropping these rows without losing too much information**

In [11]:
Copy_df.dropna(axis=0,inplace=True)

In [12]:
Copy_df.isna().sum()

encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
max_glu_serum               0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol  

**Now there are no missing values in the dataset**

In [13]:
# Checking the distribution of the categorical variables 'examide' and 'citoglipton'
print(Copy_df['examide'].value_counts())
print(Copy_df['citoglipton'].value_counts())

# Since within the boundaries of our dataset both 'examide' and 'citoglipton' have only one category we will be dropping these columns as they don't add much value for prediction
Copy_df = Copy_df.drop(columns=['examide', 'citoglipton'])

No    26592
Name: examide, dtype: int64
No    26592
Name: citoglipton, dtype: int64


In [14]:
# Printing the df to verify
Copy_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,31,...,No,Steady,No,No,No,No,No,No,Yes,>30


Now we have the basic processed data to use for our model

In [15]:
Copy_df.to_csv('clean_data.csv', index=False)
files.download('clean_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Coming to the data dictionary,**
*   **The categories within 'admission type', 'discharge disposition', and 'admission source' are similar, so by merging these categories, we can decrease the number of columns created during the encoding process.**
*   **For the 'admission_type' column, similar categories are remapped as follows: 'Urgent' (2) and 'Trauma Center' (7) are combined into the 'Emergency' (1) category. Additionally, 'NULL' (6) and 'Not Mapped' (8) are consolidated into the 'Not Available' (5) category.**
*   **For the 'discharge_disposition' column, similar categories are remapped: e.g. 'Discharged/transferred to home with home health service' (6), 'Discharged/transferred to home under care of Home IV provider' (8) and 'Hospice / home' (13) are combined into the 'Discharged to home' (1) category.**
*   **For the 'admission_source' column, similar categories are remapped: e.g. 'Clinic Referral' (2) and 'HMO Referral' (3) are combined into the ' Physician Referral' (1) category.**

In [16]:
Copy_df = pd.read_csv('clean_data.csv')

In [17]:
def remap_admission_type(admission_type_id):
    if admission_type_id in [2, 7]:
        return 1  # Urgent and Trauma Center is mapped to Emergency
    elif admission_type_id in [6, 8]:
        return 5  # NULL and Not Mapped are mapped to Not Available
    else:
        return admission_type_id

Copy_df['admission_type_id'] = Copy_df['admission_type_id'].apply(remap_admission_type)


In [18]:
def remap_discharge_disposition(discharge_disposition_id):
    if discharge_disposition_id in [6, 8, 13]:
        return 1  # Merging categories into 1
    elif discharge_disposition_id in [9, 12, 15, 16, 17]:
        return 5  # Merging categories into 5
    elif discharge_disposition_id in [3, 4, 5, 14, 22, 23, 24]:
        return 2  # Merging categories into 2
    elif discharge_disposition_id in [25, 26]:
        return 18  # Merging categories into 18
    else:
        return discharge_disposition_id

Copy_df['discharge_disposition_id'] = Copy_df['discharge_disposition_id'].apply(remap_discharge_disposition)


In [19]:
def remap_admission_source(admission_source_id):
    if admission_source_id in [2, 3]:
        return 1  # Merging categories into 1
    elif admission_source_id in [5, 6, 10, 22, 25]:
        return 4  # Merging categories into 4
    elif admission_source_id in [15, 17, 20, 21]:
        return 9  # Merging categories into 9
    elif admission_source_id in [13, 14]:
        return 11  # Merging categories into 11
    else:
        return admission_source_id

Copy_df['admission_source_id'] = Copy_df['admission_source_id'].apply(remap_admission_source)


**Encoding of variables**

In [20]:
# Using replace function to combine categories and encode them to numerical values
Copy_df['change'] = Copy_df['change'].replace({'Ch': 1, 'No': 0})
Copy_df['gender'] = Copy_df['gender'].replace({'Male': 1, 'Female': 0})
Copy_df['diabetesMed'] = Copy_df['diabetesMed'].replace({'Yes': 1, 'No': 0})

Ecoding to combine categories

In [21]:
medicine_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone','metformin-rosiglitazone', 'glimepiride-pioglitazone', 'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide']
Copy_df[medicine_columns] = Copy_df[medicine_columns].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

**Revising the variables in test result columns such as 'A1cresult' and 'max_glu_serum', where the 'None' value is assigned a negative numerical representation for differentiation purposes.**

In [22]:
Copy_df['A1Cresult'] = Copy_df['A1Cresult'].replace({'>7': 1, '>8': 1, 'Norm': 0, 'None': -100})
Copy_df['max_glu_serum'] = Copy_df['max_glu_serum'].replace({'>200': 1, '>300': 1, 'Norm': 0, 'None': -100})

**Age is given as a range. Thus taking midpoints to replace it as a single value to see patterns**



In [23]:
Copy_df.age.value_counts()

[70-80)     7141
[60-70)     5798
[50-60)     4853
[80-90)     3626
[40-50)     2844
[30-40)     1146
[90-100)     518
[20-30)      397
[10-20)      220
[0-10)        49
Name: age, dtype: int64

In [24]:
age_midpoints = {
    '[0-10)': 5,
    '[10-20)': 15,
    '[20-30)': 25,
    '[30-40)': 35,
    '[40-50)': 45,
    '[50-60)': 55,
    '[60-70)': 65,
    '[70-80)': 75,
    '[80-90)': 85,
    '[90-100)': 95
}
Copy_df['age'] = Copy_df['age'].replace(age_midpoints)


**Finally, we encode the target variable as we are looking to deal with a binary classification problem. thus we will consider just two categories i.e., <= 30 days is readmitted while >30 and NO are not readmitted**

In [25]:
# Encoding the Target Variable using a lambda function
Copy_df['readmitted'] = Copy_df['readmitted'].apply(lambda x: 0 if x in ['>30', 'NO'] else 1)

**Processing the diagnosis variable by grouping them into specific diagnostic categories, thereby streamlining the number of variables produced for each diagnostic field.**

In [26]:
# Use the next block as another way
'''
def categorize_diagnosis(x):
    x = str(x)
    if "V" in x or "E" in x:
        return "Other"
    elif "250" in x:
        return "Diabetes"
    try:
        x = int(float(x))
        if 390 <= x <= 459 or x == 785:
            return "Circulatory"
        elif 460 <= x <= 519 or x == 786:
            return "Respiratory"
        elif 520 <= x <= 579 or x == 787:
            return "Digestive"
        elif 580 <= x <= 629 or x == 788:
            return "Genitourinary"
        elif 140 <= x <= 239:
            return "Neoplasms"
        elif 710 <= x <= 739:
            return "Musculoskeletal"
        elif 800 <= x <= 999:
            return "Injury"
    except ValueError:
        pass
    return "Other"

# Apply the categorize_diagnosis function to the 'diag_1' column
Copy_df['diag_cat'] = Copy_df['diag_1'].apply(categorize_diagnosis)

# Drop the 'diag_2' and 'diag_3' columns
Copy_df = Copy_df.drop(['diag_2', 'diag_3'], axis=1)
'''

'\ndef categorize_diagnosis(x):\n    x = str(x)\n    if "V" in x or "E" in x:\n        return "Other"\n    elif "250" in x:\n        return "Diabetes"\n    try:\n        x = int(float(x))\n        if 390 <= x <= 459 or x == 785:\n            return "Circulatory"\n        elif 460 <= x <= 519 or x == 786:\n            return "Respiratory"\n        elif 520 <= x <= 579 or x == 787:\n            return "Digestive"\n        elif 580 <= x <= 629 or x == 788:\n            return "Genitourinary"\n        elif 140 <= x <= 239:\n            return "Neoplasms"\n        elif 710 <= x <= 739:\n            return "Musculoskeletal"\n        elif 800 <= x <= 999:\n            return "Injury"\n    except ValueError:\n        pass\n    return "Other"\n\n# Apply the categorize_diagnosis function to the \'diag_1\' column\nCopy_df[\'diag_cat\'] = Copy_df[\'diag_1\'].apply(categorize_diagnosis)\n\n# Drop the \'diag_2\' and \'diag_3\' columns\nCopy_df = Copy_df.drop([\'diag_2\', \'diag_3\'], axis=1)\n'

In [27]:
# Define the function to check for diabetes in three columns of a DataFrame
def combined_diagnosis(row):
    # Define the inner function to check a single diagnosis code
    def is_diabetes(code):
        code = str(code)
        return '250' in code  # Assuming '250' is the code for diabetes

    # Check each of the three columns for diabetes
    for col in ['diag_1', 'diag_2', 'diag_3']:
        if is_diabetes(row[col]):
            return 'Diabetes'

    # If none of the columns contain '250', return 'Other'
    return 'Other'

# Apply the function to each row of the DataFrame
Copy_df['diag_cat'] = Copy_df.apply(combined_diagnosis, axis=1)
Copy_df = Copy_df.drop(['diag_1', 'diag_2', 'diag_3'], axis=1)

**For the final preprocessed data, we will do One Hot Encoding**

In [28]:
# Using get_dummies to generate encoded columns for specified categories in Copy_df
Copy_df = pd.get_dummies(Copy_df, columns=['gender', 'admission_type_id', 'discharge_disposition_id',
                                           'admission_source_id', 'max_glu_serum', 'A1Cresult', 'diag_cat'], drop_first=True)

# Creating dummy variables for 'race' and concatenate them with Copy_df
race_dummies = pd.get_dummies(Copy_df['race'])
Copy_df = pd.concat([Copy_df, race_dummies], axis=1)
Copy_df.drop(['race'], inplace=True, axis=1)

# Displaying the first few rows
print(Copy_df.head())

# Dropping duplicates based on 'patient_nbr', keeping the first occurrence
Copy_df = Copy_df.drop_duplicates(subset='patient_nbr', keep='first')


   encounter_id  patient_nbr  age  time_in_hospital  num_lab_procedures  \
0        149190     55629189   15                 3                  59   
1         64410     86047875   25                 2                  11   
2        500364     82442376   35                 2                  44   
3         16680     42519267   45                 1                  51   
4         35754     82637451   55                 3                  31   

   num_procedures  num_medications  number_outpatient  number_emergency  \
0               0               18                  0                 0   
1               5               13                  2                 0   
2               1               16                  0                 0   
3               0                8                  0                 0   
4               6               16                  0                 0   

   number_inpatient  ...  max_glu_serum_0  max_glu_serum_1  A1Cresult_0  \
0                 0  ..

In [29]:
Copy_df.to_csv('final_data.csv', index=False)
files.download('final_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
Copy_df.dtypes

encounter_id                     int64
patient_nbr                      int64
age                              int64
time_in_hospital                 int64
num_lab_procedures               int64
num_procedures                   int64
num_medications                  int64
number_outpatient                int64
number_emergency                 int64
number_inpatient                 int64
number_diagnoses               float64
metformin                        int64
repaglinide                      int64
nateglinide                      int64
chlorpropamide                   int64
glimepiride                      int64
acetohexamide                    int64
glipizide                        int64
glyburide                        int64
tolbutamide                      int64
pioglitazone                     int64
rosiglitazone                    int64
acarbose                         int64
miglitol                         int64
troglitazone                     int64
tolazamide               

In [43]:
@dataclass
class LogisticRegression:
    learning_rate: float
    epsilon: float
    max_iteration: int
    Copy_df: pd.DataFrame
    target_column: str
    X: np.ndarray = None
    y: np.ndarray = None
    X_train: np.ndarray = None
    X_test: np.ndarray = None
    y_train: np.ndarray = None
    y_test: np.ndarray = None
    w: np.ndarray = None

    def __post_init__(self):
        self.X = self.Copy_df.drop(self.target_column, axis=1).values
        self.y = self.Copy_df[self.target_column].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=42)
        self.normalize_data()

    def normalize_data(self):
      epsilon = 1e-10  # A small number to prevent division by zero
      self.mean = np.mean(self.X_train, axis=0)
      self.std = np.std(self.X_train, axis=0) + epsilon  # Adding epsilon
      self.X_train = (self.X_train - self.mean) / self.std
      self.X_test = (self.X_test - self.mean) / self.std



    def sigmoid(self, X):
      z = X.dot(self.w)
      # Clip z to prevent overflow
      #z = np.clip(z, -250, 250)
      sig = 1 / (1 + np.exp(-z))
      return sig


    def predict(self,X):
      sig=self.sigmoid(X)
      return np.around(sig)

    # def cost_function(self,X,y):
    #   sig=self.sigmoid(X)
    #   cost=y*(np.log(sig)) + (1-y)*(np.log(1-sig))
    #   cost=-cost.sum()/len(y)
    #   return cost
    def cost_function(self, X, y):
      sig = self.sigmoid(X)
      # Add a small constant to prevent log(0)
      epsilon = 1e-10
      cost = y * np.log(sig + epsilon) + (1 - y) * np.log(1 - sig + epsilon)
      cost = -cost.sum() / len(y)
      return cost


    def cost_derivative(self,X,y):
      sig=self.sigmoid(X)
      grad=(sig-y).dot(X)
      return grad

    def gradient_descent(self,X,y):
      errors=[]
      previous_error=float('inf')
      for i in tqdm(range(self.max_iteration),colour='red'):
        self.w=self.w-self.learning_rate*self.cost_derivative(X,y)
        current_error=self.cost_function(X,y)
        errors.append(current_error)
        if np.absolute(previous_error-current_error)<self.epsilon:
          print('model stopped learning')
          break
        previous_error=current_error
      #self.plot_cost(errors)

    def efficacy_report(self, y_hat, y):
      y = (y == 1)
      y_hat = (y_hat == 1)

      true_negatives = (~y & ~y_hat).sum()
      true_positives = (y & y_hat).sum()
      predicted_positives = y_hat.sum()
      actual_positives = y.sum()

      accuracy = (true_positives+true_negatives) / y.size
      precision = true_positives / predicted_positives if predicted_positives > 0 else 0
      recall = true_positives / actual_positives if actual_positives > 0 else 0
      f1_score = (2 * recall * precision) / (recall + precision) if (recall + precision) > 0 else 0

      return accuracy,recall, precision, f1_score

#Requirement 1 - remove index function to  receive an index as an input and remove the associated record from the training set
    def remove_index(self,X,y,index):
       X=np.delete(X,index,axis=0)
       y=np.delete(y,index)
       return X,y
       print('{} indexes were removed'.format(index))

    def fit(self,remove_index=[]):
      self.w=np.random.randn(self.X_train.shape[1])
      if(len(remove_index)>0):
        self.X_train,self.y_train=self.remove_index(self.X_train,self.y_train,remove_index)
      self.gradient_descent(self.X_train,self.y_train)
      print(self.w)
      y_hat_train=self.predict(self.X_train)
      accuracy,recall,precision,f1_score=self.efficacy_report(y_hat_train,self.y_train)
      print('the training accuracy was {}'.format(accuracy))
      print('the training recall was {}'.format(recall))
      print('the training precision was {}'.format(precision))
      print('the training f1_score was {}'.format(f1_score))


    def plot_cost(self,cost_sequence):
        s = np.array(cost_sequence)
        t = np.arange(s.size)
        fig, ax = plt.subplots()
        ax.plot(t,s)
        ax.set(xlabel = 'iterations', ylabel = 'cost' , title = 'cost trend')
        ax.grid()
        plt.legend(bbox_to_anchor=(1.05,1), loc=2,shadow=True)
        plt.show()



    def plot(self):
        plt.figure(figsize=(15,10))
        ax = plt.axes(projection='3d')


        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                     self.sigmoid(self.X_train),
                     c = self.y_train[:], cmap='viridis', s=100);

        ax.set_xlim3d(55, 80)
        ax.set_ylim3d(80,240)
        plt.xlabel('$x_1$ feature', fontsize=15)
        plt.ylabel('$x_2$ feature', fontsize=15 )
        ax.set_zlabel('$P(Y = 1|x_1, x_2)$', fontsize=15, rotation = 0)

    def scatterPlt(self):

        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1

        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 250),
                             np.linspace(y_min, y_max, 250))
        grid = np.c_[xx.ravel(), yy.ravel()]
        probs = grid.dot(self.w).reshape(xx.shape)

        f, ax = plt.subplots(figsize=(14,12))


        ax.contour(xx, yy, probs, levels=[0.5], cmap="Greys", vmin=0, vmax=.6)


        ax.scatter(self.X_train[:, 0], self.X_train[:, 1],
                   c=self.y_train[:], s=50,
                   cmap="RdBu", vmin=-.2, vmax=1.2,
                   edgecolor="white", linewidth=1)

        plt.xlabel('x1 feature')
        plt.ylabel('x2 feature')


    def plot3D(self):
        x_min, x_max = 55, 80
        y_min, y_max = 80, 240

        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 250),
                             np.linspace(y_min, y_max, 250))

        grid = np.c_[xx.ravel(), yy.ravel()]
        probs = grid.dot(self.w).reshape(xx.shape)
        fig = plt.figure(figsize=(14,12))
        ax = plt.axes(projection='3d')
        ax.contour3D(xx, yy, probs, 50, cmap='binary')

        ax.scatter3D(self.X_train[:, 0], self.X_train[:, 1],
                   c=self.y_train[:], s=50,
                   cmap="RdBu", vmin=-.2, vmax=1.2,
                   edgecolor="white", linewidth=1)

        ax.set_xlabel('x1')
        ax.set_ylabel('x2')
        ax.set_zlabel('probs')
        ax.set_title('3D contour')
        plt.show()



In [48]:
log_reg = LogisticRegression(learning_rate=0.0001, epsilon=0.0000000000000001, max_iteration=50, Copy_df=Copy_df, target_column='readmitted')
log_reg.fit()


100%|[31m██████████[0m| 50/50 [00:00<00:00, 320.95it/s]

[-1.00617014e-02  3.89796803e-02  1.19288067e-02  1.76836919e-02
 -6.71461118e-03 -1.43051593e-02  4.40145568e-02  1.22344121e-02
  1.75912371e-02  1.17135960e-01  3.19610321e-02 -3.93212621e-02
  1.57537184e-02 -4.77625115e-03 -1.92928139e-02 -2.28970570e-02
 -3.34391844e-01 -3.23679655e-02 -4.46289142e-02 -1.84841185e-03
  3.07465359e-03 -3.10590676e-02 -1.18165918e-02  1.70019612e-02
  1.53644931e+00 -1.23421952e-02 -5.72288560e-02 -4.89889401e-04
  2.41351615e+00 -8.05561252e-01 -1.84641570e+00  2.88226888e-01
  6.48341750e-02  6.49697794e-02  9.90302192e-03  1.79275259e-02
 -9.18791119e-01 -9.39619545e-03  1.45744794e-01  6.14771376e-03
  1.35522242e-02 -2.89116420e-03 -4.77547312e-02  3.22874010e-02
  1.66574203e-03  3.33167377e-02 -9.54650627e-01  8.77348381e-03
 -1.65147259e-02 -2.06248862e-02  4.87095156e-03 -4.38050159e-03
 -2.61178716e-02 -9.15649117e-01 -1.61657076e-01 -9.92730820e-01
 -3.44517560e-01 -2.84977059e-01]
the training accuracy was 0.6315827598696124
the trainin




In [53]:
#Neural network
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense

In [52]:
X = Copy_df.drop('readmitted', axis = 1)
y= Copy_df['readmitted']


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))

model.add(Dense(1, activation='sigmoid'))  # Change for multi-class classification

#Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Change loss for multi-class

#Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

#Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9011406898498535
