In [None]:


3. Workflow

    Load Dataset

        pd.read_csv()

        Explore shape, head, describe, info

    Preprocess Data

        Handle missing or zero values (e.g., Glucose = 0)

        Normalize/standardize if needed

    EDA (Exploratory Data Analysis)

        Correlation matrix

        Distribution plots for features

        Class balance (0: No diabetes, 1: Diabetic)

    Split Data

        train_test_split() (e.g., 80% train, 20% test)

    Train Model

        Use Logistic Regression, Decision Tree, or Random Forest

    Evaluate Model

        Accuracy, Confusion Matrix, Precision, Recall, F1 Score

        classification_report and confusion_matrix

    Bonus (Optional)

        Create a simple interface using streamlit or gradio

        Save model using joblib or pickle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
with open("README-DIABETES", "r") as file:
    print(file.read())

The DIABETES data sets in this directory are provided for use in 1994 
AI in Medicine symposium submissions.  Permission is granted to use the
data sets for other research purposes as long as appropriate credit is
given as to the source (AIM-94 data set provided by Michael Kahn, MD, PhD, 
Washington University, St. Louis, MO).


Index:
------

* Data-Codes: a listing of the codes used in the data sets.

* Domain-Description: This file describes the basic physiology and patho-
physiology of diabetes mellitus and its treatment.

* data-[01-70]: data sets covering several weeks' to months' worth of
outpatient care on 70 patients.  An additional 10 sets will be made
available two weeks prior to the symposium for interested parties.  Please
contact the organizers if you would like to obtain these data sets.


Methods:
--------

You do not need to use all the data in order to participate.  Use any 
subset of the available data from either the ICU data set or the diabetes 
data set.  Furtherm

In [6]:
import os
print(os.listdir())  # Lists all files in current directory


['.git', '.ipynb_checkpoints', 'data-01', 'data-02', 'data-03', 'data-04', 'data-05', 'data-06', 'data-07', 'data-08', 'data-09', 'data-10', 'data-11', 'data-12', 'data-13', 'data-14', 'data-15', 'data-16', 'data-17', 'data-18', 'data-19', 'data-20', 'data-21', 'data-22', 'data-23', 'data-24', 'data-25', 'data-26', 'data-27', 'data-28', 'data-29', 'data-30', 'data-31', 'data-32', 'data-33', 'data-34', 'data-35', 'data-36', 'data-37', 'data-38', 'data-39', 'data-40', 'data-41', 'data-42', 'data-43', 'data-44', 'data-45', 'data-46', 'data-47', 'data-48', 'data-49', 'data-50', 'data-51', 'data-52', 'data-53', 'data-54', 'data-55', 'data-56', 'data-57', 'data-58', 'data-59', 'data-60', 'data-61', 'data-62', 'data-63', 'data-64', 'data-65', 'data-66', 'data-67', 'data-68', 'data-69', 'data-70', 'Data-Codes', 'Domain-Description', 'Index', 'README-DIABETES', 'README.md', 'Untitled.ipynb']


In [7]:
with open("data-01", "r") as file:
    for _ in range(10):
        print(file.readline())


04-21-1991	9:09	58	100

04-21-1991	9:09	33	009

04-21-1991	9:09	34	013

04-21-1991	17:08	62	119

04-21-1991	17:08	33	007

04-21-1991	22:51	48	123

04-22-1991	7:35	58	216

04-22-1991	7:35	33	010

04-22-1991	7:35	34	013

04-22-1991	13:40	33	002



In [8]:
import pandas as pd

df = pd.read_csv("data-01", sep="\t", header=None)
df.columns = ["Date", "Time", "Code1", "Code2"]  # Naming columns based on observation
print(df.head())


         Date   Time  Code1  Code2
0  04-21-1991   9:09     58    100
1  04-21-1991   9:09     33      9
2  04-21-1991   9:09     34     13
3  04-21-1991  17:08     62    119
4  04-21-1991  17:08     33      7


In [10]:
import pandas as pd
import glob

files = sorted(glob.glob("data-*"))

for file in files:
    temp_df = pd.read_csv(file, sep="\t", header=None)
    print(f"{file}: shape = {temp_df.shape}")


Data-Codes: shape = (39, 1)
data-01: shape = (943, 4)
data-02: shape = (761, 4)
data-03: shape = (300, 4)
data-04: shape = (300, 4)
data-05: shape = (300, 4)
data-06: shape = (149, 4)
data-07: shape = (242, 4)
data-08: shape = (177, 4)
data-09: shape = (206, 4)
data-10: shape = (247, 4)
data-11: shape = (236, 4)
data-12: shape = (300, 4)
data-13: shape = (300, 4)
data-14: shape = (230, 4)
data-15: shape = (300, 4)
data-16: shape = (300, 4)
data-17: shape = (251, 4)
data-18: shape = (300, 4)
data-19: shape = (300, 4)
data-20: shape = (1003, 4)
data-21: shape = (517, 4)
data-22: shape = (300, 4)
data-23: shape = (300, 4)
data-24: shape = (300, 4)
data-25: shape = (110, 4)
data-26: shape = (483, 4)
data-27: shape = (926, 4)
data-28: shape = (951, 4)
data-29: shape = (1289, 4)
data-30: shape = (1179, 4)
data-31: shape = (670, 4)
data-32: shape = (157, 4)
data-33: shape = (300, 4)
data-34: shape = (300, 4)
data-35: shape = (300, 4)
data-36: shape = (265, 4)
data-37: shape = (300, 4)
data-38

In [12]:
import pandas as pd
import glob

# Pattern to match only data files with numbers
files = sorted(glob.glob("data-[0-9][0-9]*"))

dfs = []
for file in files:
    temp_df = pd.read_csv(file, sep="\t", header=None)
    temp_df.columns = ["Date", "Time", "Code1", "Code2"]
    temp_df['Datetime'] = pd.to_datetime(temp_df['Date'] + ' ' + temp_df['Time'], format='%m-%d-%Y %H:%M', errors='coerce')
    temp_df = temp_df.drop(columns=['Date', 'Time'])
    dfs.append(temp_df)

full_df = pd.concat(dfs, ignore_index=True)

# Optional: remove rows with invalid datetime
full_df = full_df.dropna(subset=['Datetime'])

print(full_df.shape)
print(full_df.head())


(29285, 3)
   Code1 Code2            Datetime
0     58   100 1991-04-21 09:09:00
1     33     9 1991-04-21 09:09:00
2     34    13 1991-04-21 09:09:00
3     62   119 1991-04-21 17:08:00
4     33     7 1991-04-21 17:08:00
