In [147]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

### DATA PREPROCESSING ###

# List of categorical features
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']

# Convert 'UNK' to NaN, then replace True/False with 1/0 (Future-proofing with infer_objects)
df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float).infer_objects(copy=False)

# Fill missing values in binary features with 0.5
df.loc[:, binary_cols] = df[binary_cols].fillna(0.5)

# Fix diameter issue - Use mean of diameter_1 and diameter_2
df['diameter'] = df[['diameter_1', 'diameter_2']].mean(axis=1)

# Proper missing value handling without inplace warnings
df.loc[:, 'diameter'] = df['diameter'].fillna(df['diameter'].median())
df.loc[:, 'age'] = df['age'].fillna(df['age'].median())

# Select features
features = binary_cols + ['diameter', 'age']
X = df[features].copy()  # Avoid potential chaining issues
y = df['biopsed'].astype(int)  # Convert target variable to integer

# Standardize numerical features correctly
scaler = StandardScaler()
X.loc[:, ['diameter', 'age']] = scaler.fit_transform(X[['diameter', 'age']])

### TRAINING LOGISTIC REGRESSION MODEL ###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy (0.5 for missing values): {accuracy:.2f}")

### FEATURE IMPORTANCE ###
feature_importance = np.exp(model.coef_[0])  # Convert to odds ratios
print("\nFeature Importance (Odds Ratios):")
for feature, importance in zip(X.columns, feature_importance):
    print(f"{feature}: {importance:.2f}")

### CROSS-VALIDATION ###
kf = KFold(n_splits=10, shuffle=True, random_state=None)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

### MISSING DATA ANALYSIS ###
missing_values = df.isnull().sum()
print("\nMissing Value Counts:")
print(missing_values)

missing_rows = df[df[['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                      'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                      'diameter_1', 'diameter_2']].isnull().any(axis=1)]

print(f"\nTotal rows with missing values in key columns: {missing_rows.shape[0]}")




Model Accuracy (0.5 for missing values): 0.88

Feature Importance (Odds Ratios):
smoke: 0.07
drink: 0.41
itch: 1.30
grew: 1.87
hurt: 8.29
changed: 19.75
bleed: 3.72
elevation: 2.84
diameter: 1.17
age: 1.36

Cross-Validation Accuracy Scores: [0.81304348 0.83478261 0.8826087  0.87391304 0.85652174 0.84347826
 0.84782609 0.82608696 0.86899563 0.87336245]
Mean Accuracy: 0.85

Missing Value Counts:
patient_id               0
lesion_id                0
smoke                    0
drink                    0
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed    

  df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float).infer_objects(copy=False)
 -1.22486016]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.loc[:, ['diameter', 'age']] = scaler.fit_transform(X[['diameter', 'age']])


In [148]:
from sklearn.linear_model import LogisticRegression

# Split into full and partial datasets
cols_to_clean = ['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                 'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                 'diameter_1', 'diameter_2']

df_full = df.dropna(subset=cols_to_clean)  # Drop NaNs only for these columns

df_partial = df[df[['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                    'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                    'diameter_1', 'diameter_2']].isnull().any(axis=1)]  # Partial data

# Ensure `biopsed` has both classes in partial data
if df_partial['biopsed'].nunique() < 2:
    print("Warning: Partial dataset contains only one class. Using full dataset only.")
    df_partial = None  # Skip training this model

features = binary_cols + ['diameter', 'age']

# Prepare Full Data Model
X_full = df_full[features]
y_full = df_full['biopsed']
scaler = StandardScaler()
X_full[['diameter', 'age']] = scaler.fit_transform(X_full[['diameter', 'age']])
model_full = LogisticRegression().fit(X_full, y_full)

# Train Partial Data Model if both classes exist
if df_partial is not None:
    features_partial = [f for f in features if f not in ['pesticide', 'gender', 'skin_cancer_history', 
                                                          'cancer_history', 'has_piped_water', 
                                                          'has_sewage_system', 'fitspatrick', 
                                                          'diameter_1', 'diameter_2']]
    X_partial = df_partial[features_partial]
    y_partial = df_partial['biopsed']
    X_partial[['diameter', 'age']] = scaler.transform(X_partial[['diameter', 'age']])
    model_partial = LogisticRegression().fit(X_partial, y_partial)

# Model Predictions & Ensemble
y_pred_full = model_full.predict_proba(X_full)[:, 1]

if df_partial is not None:
    y_pred_partial = model_partial.predict_proba(X_partial)[:, 1]
    ensemble_pred = (0.7 * y_pred_full[:len(y_pred_partial)]) + (0.3 * y_pred_partial)
    final_pred = (ensemble_pred >= 0.5).astype(int)
else:
    final_pred = (y_pred_full >= 0.5).astype(int)

# Model Evaluation
accuracy_full = accuracy_score(y_full, model_full.predict(X_full))
print(f"\nModel Accuracy (Full Data): {accuracy_full:.2f}")

if df_partial is not None:
    accuracy_partial = accuracy_score(y_partial, model_partial.predict(X_partial))
    print(f"Model Accuracy (Partial Data): {accuracy_partial:.2f}")

# Feature Importance
feature_importance = np.exp(model_full.coef_[0])
print("\nFeature Importance (Odds Ratios - Full Model):")
for feature, importance in zip(X_full.columns, feature_importance):
    print(f"{feature}: {importance:.2f}")

# Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=None)
cv_scores = cross_val_score(model_full, X_full, y_full, cv=kf, scoring='accuracy')

print(f"\nCross-Validation Accuracy Scores (Full Model): {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

# Missing Data Analysis
missing_values = df.isnull().sum()
print("\nMissing Value Counts:")
print(missing_values)

missing_rows = df[df[['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                      'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                      'diameter_1', 'diameter_2']].isnull().any(axis=1)]

print(f"\nTotal rows with missing values in key columns: {missing_rows.shape[0]}")




Model Accuracy (Full Data): 0.90

Feature Importance (Odds Ratios - Full Model):
smoke: 2.19
drink: 0.76
itch: 1.16
grew: 1.70
hurt: 5.08
changed: 17.89
bleed: 2.44
elevation: 2.89
diameter: 1.26
age: 0.82

Cross-Validation Accuracy Scores (Full Model): [0.88       0.90666667 0.88       0.89333333 0.91275168 0.88590604
 0.90604027 0.91946309 0.88590604 0.89261745]
Mean Accuracy: 0.90

Missing Value Counts:
patient_id               0
lesion_id                0
smoke                    0
drink                    0
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_full[['diameter', 'age']] = scaler.fit_transform(X_full[['diameter', 'age']])
