In [176]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

### **Step 1: Inspect Basic Information** ###
print(df.info())  # Overview of dataset structure
print("\nMissing Values Count:\n", df.isnull().sum())  # Count missing data for each feature

### **Step 2: Summary Statistics for Numerical Features** ###
print("\nSummary Statistics:\n", df.describe())  # View numerical feature distributions

### **Step 3: Distribution of Categorical Features** ###
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
print("\nCategorical Feature Counts:")
for col in binary_cols:
    print(f"{col}:\n{df[col].value_counts(dropna=False)}\n")

### **Step 4: Check Target Variable (`biopsed`) Distribution** ###
print("\nTarget Variable Distribution:\n", df['biopsed'].value_counts(normalize=True))

### **Step 5: Convert `biopsed` to Numeric for Correlation Analysis** ###
df['biopsed'] = df['biopsed'].astype(int)  # Convert boolean to integer (1 = True, 0 = False)

### **Step 6: Analyze Correlations (Only Numeric Features)** ###
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("\nFeature Correlations:\n", df[numeric_cols].corr()['biopsed'].sort_values(ascending=False))

### **Step 7: Missing Data Analysis** ###
print("\nMissing Value Counts:")
print(df.isnull().sum())

# Check how many rows have missing values in key columns
missing_cols = ['smoke', 'drink', 'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
                'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2']
missing_rows = df[missing_cols].isnull().any(axis=1).sum()
print(f"\nTotal rows with missing values in key columns: {missing_rows}")

# Inspect smoking data separately to confirm how missing values behave
print("\nSmoking Data Distribution:\n", df['smoke'].value_counts(dropna=False))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           2298 non-null   object 
 1   lesion_id            2298 non-null   int64  
 2   smoke                1494 non-null   object 
 3   drink                1494 non-null   object 
 4   background_father    1480 non-null   object 
 5   background_mother    1476 non-null   object 
 6   age                  2298 non-null   int64  
 7   pesticide            1494 non-null   object 
 8   gender               1494 non-null   object 
 9   skin_cancer_history  1494 non-null   object 
 10  cancer_history       1494 non-null   object 
 11  has_piped_water      1494 non-null   object 
 12  has_sewage_system    1494 non-null   object 
 13  fitspatrick          1494 non-null   float64
 14  region               2298 non-null   object 
 15  diameter_1           1494 non-null   f

In [177]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

print(f"Total rows with missing 'smoke': {df['smoke'].isnull().sum()}")
print(f"Total rows with missing 'smoke' and 'biopsed' is FALSE: {df[df['smoke'].isnull() & (df['biopsed'] == 0)].shape[0]}")

### **DATA PREPROCESSING** ###

# List of categorical features
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
numeric_cols = ['diameter_1', 'diameter_2', 'age']

# Convert 'UNK' to NaN, then replace True/False with 1/0
df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)

# Fill missing values in binary features with 0.5 (for initial test)
df[binary_cols] = df[binary_cols].fillna(0.5)

# Fill missing values for numerical features using MEDIAN (to prevent outliers from impacting the data)
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))

# Convert `biopsed` to numeric (1 = True, 0 = False)
df['biopsed'] = df['biopsed'].astype(int)

# Select features
features = binary_cols + numeric_cols
X = df[features].copy()
y = df['biopsed']

# Standardize ONLY numerical features (`diameter_1`, `diameter_2`, `age`)
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

### **TRAINING LOGISTIC REGRESSION MODEL** ###
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy (0.5 for missing binary features): {accuracy:.2f}")

### **FEATURE IMPORTANCE** ###
feature_importance = np.exp(model.coef_[0])  # Convert to odds ratios
print("\nFeature Importance (Odds Ratios):")
for feature, importance in zip(X.columns, feature_importance):
    print(f"{feature}: {importance:.2f}")

### **CROSS-VALIDATION** ###
kf = KFold(n_splits=5, shuffle=True, random_state=None)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

print(f"\nCross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")

### **MISSING DATA ANALYSIS** ###
print("\nFinal Missing Value Counts (After Handling):")
print(df.isnull().sum())

# Check total rows with missing values in key columns
missing_cols = ['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 'has_piped_water', 'has_sewage_system', 
                'fitspatrick', 'diameter_1', 'diameter_2']
missing_rows = df[missing_cols].isnull().any(axis=1).sum()

print(f"\nTotal rows with missing values in key columns: {missing_rows}")

# Inspect smoking data distribution after imputation
print("\nSmoking Data Distribution (After Imputation):\n", df['smoke'].value_counts())


Total rows with missing 'smoke': 804
Total rows with missing 'smoke' and 'biopsed' is FALSE: 804

Model Accuracy (0.5 for missing binary features): 0.86

Feature Importance (Odds Ratios):
smoke: 0.07
drink: 0.37
itch: 1.18
grew: 1.72
hurt: 7.88
changed: 24.85
bleed: 4.21
elevation: 2.99
diameter_1: 1.42
diameter_2: 0.94
age: 1.40

Cross-Validation Accuracy Scores: [0.86521739 0.84130435 0.85869565 0.8496732  0.8496732 ]
Mean Accuracy: 0.85

Final Missing Value Counts (After Handling):
patient_id               0
lesion_id                0
smoke                    0
drink                    0
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1               0
diameter_2               0
diagnostic               0
itch                    

  df[binary_cols] = df[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan}).astype(float)


### Splitting the dataset to try to fix the bias of the 804 rows that miss values, and are all biopsed FALSE, which probably would cause issues for our training

In [216]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------------------------
# Load your dataset (adjust the path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

# At this point, df remains unchanged so that its missing value counts are preserved.
# For example, checking df.isnull().sum() will show that 'smoke' and 'drink' have 804 missing values.

# ---------------------------
# Now, create a working copy for modeling.
df_model = df.copy()

# STEP 1: Preprocessing on the modeling copy (df_model)
# Define the binary columns (including smoke, drink, etc.)
binary_cols = ['smoke', 'drink', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']

# Replace string entries:
# - 'True' becomes 1
# - 'False' becomes 0
# - 'UNK' becomes NaN
df_model[binary_cols] = df_model[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan})
# Convert them to float and impute missing values with 0.5 (for modeling purposes only)
df_model[binary_cols] = df_model[binary_cols].astype(float).fillna(0.5)

# Define the numerical columns (we keep diameter_1 and diameter_2 separate, plus age)
numeric_cols = ['diameter_1', 'diameter_2', 'age']
# Impute missing numerical values with their respective median
df_model[numeric_cols] = df_model[numeric_cols].apply(lambda x: x.fillna(x.median()))

# Convert the target variable 'biopsed' to numeric (True->1, False->0)
df_model['biopsed'] = df_model['biopsed'].astype(int)

# ---------------------------
# STEP 2: Use only complete cases for modeling
# Define key columns that we require to be present:
cols_to_clean = ['pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                 'has_piped_water', 'has_sewage_system', 'fitspatrick', 
                 'diameter_1', 'diameter_2']

# Drop any rows in df_model that are missing a value in any of these key columns.
df_clean = df_model.dropna(subset=cols_to_clean)

# Now, df_clean contains our complete cases – around 1494 rows.

# ---------------------------
# STEP 3: Define the features & prepare X and y for training.
# Use the binary columns (which include smoke and drink now imputed in df_model) 
# and the numerical columns.
features = binary_cols + numeric_cols

X = df_clean[features].copy()
y = df_clean['biopsed']

# ---------------------------
# STEP 4: Standardize only the numerical features.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# ---------------------------
# STEP 5: Train the Logistic Regression Model.
model = LogisticRegression()
model.fit(X, y)

# ---------------------------
# STEP 6: Model Predictions, Evaluation, and Cross-Validation.
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

kf = KFold(n_splits=5, shuffle=True, random_state=None)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print(f"Cross-Validation Mean Accuracy: {cv_scores.mean():.2f}")

# ---------------------------
# STEP 7: Feature Importance (Odds Ratios)
feature_importance = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, imp in zip(X.columns, feature_importance):
    print(f"{feat}: {imp:.2f}")

# ---------------------------
# Additional Analysis: Check Missing Value Counts in the Original df.
print("\nMissing Value Counts in Original DataFrame:")
print(df.isnull().sum())


# Split the complete (clean) data into train and test sets (e.g., 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)

model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the hold-out test set
y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy: {test_accuracy:.2f}")


Model Accuracy: 0.89
Cross-Validation Mean Accuracy: 0.89

Feature Importance (Odds Ratios):
smoke: 2.16
drink: 0.73
itch: 1.14
grew: 1.69
hurt: 5.15
changed: 18.05
bleed: 2.54
elevation: 2.82
diameter_1: 0.74
diameter_2: 1.73
age: 0.81

Missing Value Counts in Original DataFrame:
patient_id               0
lesion_id                0
smoke                  804
drink                  804
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed                    0
elevation                0
img_id                   0
biopsed                  0
dtype: int64
Tes

  df_model[binary_cols] = df_model[binary_cols].replace({'True': 1, 'False': 0, 'UNK': np.nan})


# **Refining the Model by Selecting Key Features**

### **Selected Features**
Based on feature importance analysis, we focus on high-impact predictors:

| Feature   | Odds Ratio (First Model) | Odds Ratio (Full Model) |
|-----------|-------------------------|-------------------------|
| **Changed**   | 23.85 | 17.89 |
| **Hurt**      | 7.86  | 5.08  |
| **Bleed**     | 4.06  | 2.44  |
| **Elevation** | 2.94  | 2.89  |
| **Diameter**  | 1.24  | 1.26  |
| **Age**       | 1.34  | 0.82  |

### **Next Steps**
1️⃣ **Re-train the model using only the above features**  
2️⃣ **Compare accuracy with previous versions**  
3️⃣ **Assess if simplifying the model improves generalization**


### We split the data into two models before, because for the same 804 entries , we had a lot of missing values. But now that we keep the important features , only diameter has missing values so we ll fix that by using the median for the missing entries

In [228]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load dataset (adjust file path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")

# ================================
# STEP 1: Preprocess Important Features
# ================================

# Define important binary features (we use these for the model)
important_binary = ['changed', 'hurt', 'bleed', 'elevation']
# Replace string values: 'True' becomes 1, 'False' becomes 0, and 'UNK' becomes NaN.
df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})
# Ensure they are floats, then fill missing values with 0.5
df[important_binary] = df[important_binary].astype(float).fillna(0.5)

# Process the diameter features.
# List of original diameter features
diameter_cols = ['diameter_1', 'diameter_2']
# Impute missing values with the median for each diameter column.
df[diameter_cols] = df[diameter_cols].apply(lambda col: col.fillna(col.median()))
# Create a new combined 'diameter' feature as the average
df['diameter'] = df[diameter_cols].mean(axis=1)

# Convert the target variable 'biopsed' to numeric (True becomes 1, False becomes 0)
df['biopsed'] = df['biopsed'].astype(int)

# ================================
# STEP 2: Define Our Feature Set and Prepare Data
# ================================

# We are using only the important features:
# - The four binary features: changed, hurt, bleed, elevation.
# - The combined diameter feature.
features = important_binary + ['diameter']

# Create our feature matrix X and target y.
X = df[features].copy()
y = df['biopsed']

# Standardize the continuous feature "diameter".
scaler = StandardScaler()
X[['diameter']] = scaler.fit_transform(X[['diameter']])

# ================================
# STEP 3: Train/Test Split (80/20)
# ================================

# Split the (processed) dataset into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ================================
# STEP 4: Training and Evaluation
# ================================

model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Optionally, you can also perform cross-validation on the entire dataset:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

# ================================
# STEP 5: Feature Importance (Odds Ratios)
# ================================

# Calculate the odds ratios (exp(coefficients)).
odds_ratios = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, ratio in zip(features, odds_ratios):
    print(f"{feat}: {ratio:.2f}")


Test Accuracy: 0.8173913043478261
Cross-Validation Mean Accuracy: 0.8046045278014589

Feature Importance (Odds Ratios):
changed: 27.39
hurt: 6.71
bleed: 5.52
elevation: 3.28
diameter: 1.31


  df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})


### And a final training using only the 4 features that seem to be the most important

In [234]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score

# ---------------------------
# Load dataset (update the path as needed)
df = pd.read_csv(r"C:\Users\valan\OneDrive\Desktop\Projects in D Science\MANDATORY\metadata.csv")
# ---------------------------

# STEP 1: Preprocessing for the Important Binary Features
# We will use only the following features: changed, hurt, bleed, and elevation.
important_binary = ['changed', 'hurt', 'bleed', 'elevation']

# Replace string entries in these columns:
# 'True' becomes 1, 'False' becomes 0, and 'UNK' becomes NaN.
df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})
# Convert them to float and impute missing values with 0.5 (a neutral value).
df[important_binary] = df[important_binary].astype(float).fillna(0.5)

# Convert the target variable 'biopsed' to numeric (e.g., True->1, False->0)
df['biopsed'] = df['biopsed'].astype(int)

# Note: We are intentionally not processing any of the diameter-related features here.

# STEP 2: Define Features and Prepare Dataset for Modeling
# Our feature set now consists of only the four important binary features.
features = important_binary
X = df[features].copy()
y = df['biopsed']

# (Since our binary features have been imputed directly, there is no need for numerical standardization.)

# STEP 3: Train–Test Split (using the whole dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 4: Train the Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# STEP 5: Evaluate the Model on the Hold-Out Test Set
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Also run cross-validation on the entire dataset:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

# STEP 6: Feature Importance (Odds Ratios)
import numpy as np
odds_ratios = np.exp(model.coef_[0])
print("\nFeature Importance (Odds Ratios):")
for feat, ratio in zip(features, odds_ratios):
    print(f"{feat}: {ratio:.2f}")

# ---------------------------
# Additional analysis (optional): Check missingness in the original DataFrame
print("\nMissing Value Counts in Original Data:")
print(df.isnull().sum())


Test Accuracy: 0.8043478260869565
Cross-Validation Mean Accuracy: 0.7932982854977741

Feature Importance (Odds Ratios):
changed: 27.76
hurt: 6.97
bleed: 5.76
elevation: 3.18

Missing Value Counts in Original Data:
patient_id               0
lesion_id                0
smoke                  804
drink                  804
background_father      818
background_mother      822
age                      0
pesticide              804
gender                 804
skin_cancer_history    804
cancer_history         804
has_piped_water        804
has_sewage_system      804
fitspatrick            804
region                   0
diameter_1             804
diameter_2             804
diagnostic               0
itch                     0
grew                     0
hurt                     0
changed                  0
bleed                    0
elevation                0
img_id                   0
biopsed                  0
dtype: int64


  df[important_binary] = df[important_binary].replace({'True': 1, 'False': 0, 'UNK': np.nan})
