In [4]:
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
# NOTE: machine-specific absolute path; change DATA_PATH when running elsewhere.
DATA_PATH = '/Users/catherinelee/Documents/GitHub/Project_Python/Health Care/data.xlsx'
data = pd.read_excel(DATA_PATH)

# Perform preliminary data inspection
# DataFrame.info() writes its report to a stream and returns None, so the
# report is captured in a text buffer to make `structure` hold the actual text.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
structure = info_buffer.getvalue()
# Echo the report so the cell still shows what a bare data.info() would print.
print(structure, end='')

# Per-column count of missing cells and total count of fully duplicated rows
missing_values = data.isnull().sum()
duplicates = data.duplicated().sum()

# Remove duplicates
data = data.drop_duplicates()

# Preliminary statistical summary of the data (computed after de-duplication)
stat_summary = data.describe()

# Treat every column with at most 10 distinct values as categorical
distinct_counts = data.nunique()
categorical_vars = [name for name, count in distinct_counts.items() if count <= 10]

# Draw one count plot per categorical column, each shown as its own figure
for column in categorical_vars:
    axes = sns.countplot(data=data, x=column)
    axes.set_title(f'Count Plot of {column}')
    plt.show()

# Age distribution, with bars stacked by the value of the `target` column
age_axes = sns.histplot(x="age", hue="target", multiple="stack", data=data)
age_axes.set_title('Occurrence of CVD across Age')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Frequency')
plt.show()

# Patient counts per `sex` value, split by the value of `target`
gender_axes = sns.countplot(x="sex", hue="target", data=data)
gender_axes.set_title('Composition of Patients w.r.t. Gender')
gender_axes.set_xlabel('Gender')
gender_axes.set_ylabel('Frequency')
plt.show()

# Distribution of `trestbps` (resting blood pressure) for each `target` value
bp_axes = sns.boxplot(x="target", y="trestbps", data=data)
bp_axes.set_title('Resting Blood Pressure by Heart Attack Occurrence')
bp_axes.set_xlabel('Heart Attack Occurrence')
bp_axes.set_ylabel('Resting Blood Pressure')
plt.show()

# Cholesterol (x) against age (y), with points coloured by `target`
chol_axes = sns.scatterplot(x="chol", y="age", hue="target", data=data)
chol_axes.set_title('Cholesterol Levels by Age and Heart Attack Occurrence')
chol_axes.set_xlabel('Cholesterol Level')
chol_axes.set_ylabel('Age')
plt.show()

# `oldpeak` distribution per `exang` value, with boxes split by `target`
peak_axes = sns.boxplot(x="exang", y="oldpeak", hue="target", data=data)
peak_axes.set_title('Relationship Between Peak Exercising and Heart Attack')
peak_axes.set_xlabel('Exercise Induced Angina')
peak_axes.set_ylabel('ST Depression (oldpeak)')
plt.show()

# Counts per `thal` category, split by `target`, to compare the groups visually
thal_axes = sns.countplot(x="thal", hue="target", data=data)
thal_axes.set_title('Thalassemia Types and Heart Attack Occurrence')
thal_axes.set_xlabel('Thalassemia Type')
thal_axes.set_ylabel('Frequency')
plt.show()

# For the other factors and the pair plot, consider running them separately or on a reduced dataset.

# Building a baseline logistic regression model
# Features are every column except `target`; `target` is the label.
X = data.drop('target', axis=1)
y = data['target']

# Split the data BEFORE scaling: fitting the scaler on the full dataset would
# let the held-out rows influence the mean/variance used for training
# (data leakage), which makes the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features: learn the scaling from the training fold only, then
# apply that same transformation to the test fold.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any later cell that reads X_scaled;
# it now uses the train-fitted scaler rather than one fitted on all rows.
X_scaled = scaler.transform(X)

# Initialize and train the logistic regression model (library defaults)
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions on the held-out fold
y_pred = model.predict(X_test)

# Model evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Output results
print('Confusion Matrix:\n', conf_matrix)
print('\nClassification Report:\n', class_report)


ModuleNotFoundError: No module named 'pandas'