# PCA ATTEMPT

In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.api import Logit, add_constant
import statsmodels.api as sm



# Load the dataset
merged_df = pd.read_csv('merged_df.csv')

# Display the first few rows of the dataset to ensure it loaded correctly
print(merged_df.head())


   Unnamed: 0      ID  group  outcome  age  gendera       BMI  hypertensive  \
0           1  100213      1        0   74        2  26.81457             1   
1           2  100449      2        0   87        1  24.00000             1   
2           3  100571      1        0   67        1  32.28435             1   
3           4  100610      1        0   81        2       NaN             1   
4           5  100660      1        0   75        1       NaN             1   

   atrialfibrillation  CHD with no MI  ...  admission_type  \
0                   0               0  ...       EMERGENCY   
1                   0               0  ...       EMERGENCY   
2                   0               0  ...       EMERGENCY   
3                   1               0  ...       EMERGENCY   
4                   0               0  ...       EMERGENCY   

          admission_location       discharge_location  insurance  language  \
0  CLINIC REFERRAL/PREMATURE  LONG TERM CARE HOSPITAL   Medicare       NaN

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.api import Logit, add_constant
import statsmodels.api as sm


# Identify and drop any non-numeric columns if they exist
# Assuming 'outcome' is the only non-numeric column needed
non_numeric_cols = merged_df.select_dtypes(include=['object', 'datetime']).columns.tolist()
print(f"Non-numeric columns to be dropped: {non_numeric_cols}")
data = merged_df.drop(columns=non_numeric_cols)

# Separate the outcome variable
outcome = data['outcome']
data = data.drop(columns=['outcome'])

# Ensure all remaining columns are numeric
data = data.apply(pd.to_numeric, errors='coerce')

# Remove columns with NA values that could not be converted to numeric
data = data.dropna(axis=1, how='any')

# Standardize the data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

# Remove columns with zero variance
data_var = pd.DataFrame(standardized_data, columns=data.columns)
data_var = data_var.loc[:, (data_var != data_var.iloc[0]).any()]

# Verify that there are no zero variance columns
print(f"Columns retained after removing zero variance columns: {data_var.shape[1]}")

# Apply PCA
pca = PCA()
pc_data = pca.fit_transform(data_var)

# Check explained variance ratio
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance ratio of each principal component: {explained_variance}")

# Convert to DataFrame for ease of use
pc_df = pd.DataFrame(pc_data, columns=[f'PC{i+1}' for i in range(pc_data.shape[1])])

# Combine principal components with the outcome variable
pca_data = pd.concat([pc_df, outcome.reset_index(drop=True)], axis=1)

# Fit the full logistic regression model using principal components
X = add_constant(pca_data.drop(columns=['outcome']))
y = pca_data['outcome']

# Fit the model
full_model = Logit(y, X).fit()
print(full_model.summary())

# Perform stepwise selection based on AIC
def stepwise_selection(X, y):
    initial_list = X.columns.tolist()
    best_model = Logit(y, X).fit()
    best_aic = best_model.aic
    improved = True
    
    while improved:
        improved = False
        results = []
        
        # Try dropping each column
        for column in initial_list:
            temp_X = X.drop(columns=[column])
            model = Logit(y, temp_X).fit(disp=0)
            results.append((model.aic, column, model))
        
        # Try adding each column
        for column in [col for col in data.columns if col not in initial_list]:
            temp_X = X.join(data[column])
            model = Logit(y, temp_X).fit(disp=0)
            results.append((model.aic, column, model))
        
        results.sort()
        best_aic, best_column, best_model = results[0]
        
        if best_aic < best_model.aic:
            improved = True
            initial_list = best_model.params.index.tolist()
    
    return best_model

best_model = stepwise_selection(X, y)
print(best_model.summary())


Non-numeric columns to be dropped: ['admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'religion', 'marital_status', 'ethnicity', 'diagnosis', 'hospital_expire_flag']
Columns retained after removing zero variance columns: 33
Explained variance ratio of each principal component: [1.09072129e-01 8.22930626e-02 6.66611481e-02 6.15933642e-02
 5.35715691e-02 5.11805663e-02 4.61982382e-02 4.21848423e-02
 3.82590947e-02 3.55825747e-02 3.35821011e-02 3.23057834e-02
 3.01572197e-02 2.92823905e-02 2.75982435e-02 2.68134648e-02
 2.62863238e-02 2.53305497e-02 2.44232083e-02 2.15535707e-02
 2.11021851e-02 1.99087028e-02 1.79548271e-02 1.64467352e-02
 1.62897412e-02 1.51350117e-02 1.19663142e-02 9.06791865e-03
 7.95846524e-03 1.78887167e-04 3.28011251e-05 2.42889990e-05
 4.67636432e-06]
Optimization terminated successfully.
         Current function value: 0.298325
         Iterations 7
                           Logit Regres



                           Logit Regression Results                           
Dep. Variable:                outcome   No. Observations:                 1177
Model:                          Logit   Df Residuals:                     1144
Method:                           MLE   Df Model:                           32
Date:                Thu, 27 Jun 2024   Pseudo R-squ.:                  0.2465
Time:                        16:11:15   Log-Likelihood:                -351.16
converged:                       True   LL-Null:                       -466.03
Covariance Type:            nonrobust   LLR p-value:                 9.077e-32
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4572      0.128    -19.180      0.000      -2.708      -2.206
PC1            0.2303      0.058      3.947      0.000       0.116       0.345
PC2           -0.0352      0.057     -0.623      0.5

