In [1]:
#!pip install tensorflow
#!pip install sklearn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the BMI dataset
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,bmi,gender,is_training,name
0,0,34.207396,Male,1,img_0.bmp
1,1,26.45372,Male,1,img_1.bmp
2,2,34.967561,Female,1,img_2.bmp
3,3,22.044766,Female,1,img_3.bmp
4,4,37.758789,Female,1,img_4.bmp


In [4]:
print(len(data))

4206


In [5]:
print(len(data[data['is_training']==1]))
print(len(data[data['is_training']==0]))

3368
838


In [6]:
data.info()
image_paths='C:/Users/kisho/Desktop/UChicago Academics/Images'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4206 entries, 0 to 4205
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   4206 non-null   int64  
 1   bmi          4206 non-null   float64
 2   gender       4206 non-null   object 
 3   is_training  4206 non-null   int64  
 4   name         4206 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 164.4+ KB


In [7]:
data.value_counts()

Unnamed: 0  bmi        gender  is_training  name        
0           34.207396  Male    1            img_0.bmp       1
2793        28.662354  Female  1            img_2793.bmp    1
2795        26.289704  Female  1            img_2795.bmp    1
2796        33.792661  Male    1            img_2796.bmp    1
2797        28.160551  Male    1            img_2797.bmp    1
                                                           ..
1407        46.511695  Female  1            img_1407.bmp    1
1408        26.622856  Male    1            img_1408.bmp    1
1409        24.900200  Male    1            img_1409.bmp    1
1410        50.029844  Male    1            img_1410.bmp    1
4205        34.618844  Male    0            img_4205.bmp    1
Length: 4206, dtype: int64

In [8]:
# Preprocess the image paths and labels
image_paths = data["name"].values

valid_image_paths = []
import os
missing_images=0
missing_list = []
for image_path in image_paths:
    if os.path.isfile(image_path):
        valid_image_paths.append(image_path)
    else:
        missing_images+=1
        missing_list.append(image_path)
        #print(f"Image file not found: {image_path}")
        
print(missing_images)
#print(missing_list)
#image_paths = valid_image_paths
#labels = data["bmi"].values

244


In [9]:
data_filtered = data[~data['name'].isin(missing_list)]

In [10]:
data = data_filtered
len(data)

3962

In [11]:
print(len(data[data['is_training']==1]))
print(len(data[data['is_training']==0]))

3210
752


In [12]:
train_data = data[data["is_training"] == 1]
test_data = data[data["is_training"] == 0]

In [13]:
train_paths = train_data["name"].tolist()
train_labels = train_data["bmi"].tolist()

val_paths = test_data["name"].tolist()
val_labels = test_data["bmi"].tolist()

#from sklearn.model_selection import train_test_split

#train_paths, val_paths, train_labels, val_labels = train_test_split(
#    train_paths, train_labels, test_size=0.2, random_state=42
#)

In [14]:
print(len(train_labels))
print(len(val_labels))

3210
752


#### Adding Custom Layers: The code adds custom layers on top of the VGG16 model. It adds a global average pooling layer to reduce the spatial dimensions, followed by a fully connected (dense) layer with ReLU activation function, and an output layer with a single neuron for BMI prediction.

In [18]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense

base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

# Create a new model for feature extraction
output_layer = base_model.get_layer("conv5_block3_out")
model = Model(inputs=base_model.input, outputs=output_layer.output)

# Freeze the initial layers and unfreeze the later layers for fine-tuning
for layer in base_model.layers[:15]:
    layer.trainable = False
for layer in base_model.layers[15:]:
    layer.trainable = True

# Add a fully connected layer for classification
x = GlobalAveragePooling2D()(output_layer.output)

# Add a fully connected layer with 64 neurons and L2 regularization
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.3)(x)

# Add a fully connected layer with 128 neurons and L2 regularization
x = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.3)(x)

predictions = Dense(1)(x)

# Create the final model
model = Model(inputs=model.input, outputs=predictions)
model.compile(optimizer="RMSProp", loss="mean_squared_error")

In [26]:
%%time

from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
from tensorflow.keras.callbacks import Callback

# This is to use preprocessing techniques to the images to improve the performance
# Define the image data generator with data augmentation
datagen = image.ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True
)

# Create training data generator with augmentation
train_generator = datagen.flow_from_dataframe(
    pd.DataFrame({"path": train_paths, "BMI": train_labels}),
    x_col="path",
    y_col="BMI",
    target_size=(224, 224),
    batch_size=32,
    class_mode="raw",
)

val_generator = datagen.flow_from_dataframe(
    pd.DataFrame({"path": val_paths, "BMI": val_labels}),
    x_col="path",
    y_col="BMI",
    target_size=(224, 224),
    batch_size=32,
    class_mode="raw",
)

# Define a function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

# Define a callback to print RMSE at each epoch
print_rmse_callback = LambdaCallback(on_epoch_end=lambda epoch, logs: print(f"Epoch {epoch+1} - RMSE: {calculate_rmse(val_labels, model.predict(val_generator))}"))

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

# Train the model with early stopping
history = model.fit(train_generator, validation_data=val_generator, epochs=20, callbacks=[early_stopping])

Found 3210 validated image filenames.
Found 752 validated image filenames.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
CPU times: total: 3h 46min 5s
Wall time: 2h 9min


In [28]:
# Save the model
model.save("bmi_mode_20_eps.h5")

In [29]:
%%time

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Calculate predictions for the validation set
val_predictions = model.predict(val_generator)
val_predictions = val_predictions.flatten()

# Create a dataframe for actual and predicted BMI values
val_results = pd.DataFrame({'Actual BMI': val_labels, 'Predicted BMI': val_predictions})

# Add the image file names column
val_results['Image File'] = val_paths

# Define the BMI categories
bmi_categories = ["Underweight", "Normal Weight", "Overweight"]

# Convert actual BMI values to BMI categories
val_results['Actual BMI Category'] = pd.cut(val_results['Actual BMI'], bins=[0, 18.5, 25, np.inf], labels=bmi_categories)

# Convert predicted BMI values to BMI categories
val_results['Predicted BMI Category'] = pd.cut(val_results['Predicted BMI'], bins=[0, 18.5, 25, np.inf], labels=bmi_categories)

# Calculate accuracy
val_accuracy = accuracy_score(val_results['Actual BMI Category'], val_results['Predicted BMI Category'])

print("Validation Accuracy:", val_accuracy)
print(val_results)

Validation Accuracy: 0.8404255319148937
     Actual BMI  Predicted BMI    Image File Actual BMI Category  \
0     29.698495      32.726242  img_3369.bmp          Overweight   
1     30.845918      31.070740  img_3370.bmp          Overweight   
2     24.389796      31.505375  img_3371.bmp       Normal Weight   
3     36.258679      31.761124  img_3372.bmp          Overweight   
4     27.891291      33.911640  img_3373.bmp          Overweight   
..          ...            ...           ...                 ...   
747   34.078947      26.991812  img_4201.bmp          Overweight   
748   34.564776      35.075993  img_4202.bmp          Overweight   
749   27.432362      29.435787  img_4203.bmp          Overweight   
750   40.492800      29.757931  img_4204.bmp          Overweight   
751   34.618844      26.409700  img_4205.bmp          Overweight   

    Predicted BMI Category  
0               Overweight  
1               Overweight  
2               Overweight  
3               Overweight 

In [30]:
print("Minimum Predicted BMI:", np.min(val_predictions))
print("Maximum Predicted BMI:", np.max(val_predictions))

Minimum Predicted BMI: 25.756084
Maximum Predicted BMI: 45.1645


In [31]:
val_results.head()

Unnamed: 0,Actual BMI,Predicted BMI,Image File,Actual BMI Category,Predicted BMI Category
0,29.698495,32.726242,img_3369.bmp,Overweight,Overweight
1,30.845918,31.07074,img_3370.bmp,Overweight,Overweight
2,24.389796,31.505375,img_3371.bmp,Normal Weight,Overweight
3,36.258679,31.761124,img_3372.bmp,Overweight,Overweight
4,27.891291,33.91164,img_3373.bmp,Overweight,Overweight


In [32]:
val_results.to_csv('output_bmi.csv', index=False)

In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate predictions for the validation set
val_predictions = model.predict(val_generator)
val_predictions = val_predictions.flatten()

# Calculate RMSE
val_rmse = np.sqrt(mean_squared_error(val_labels, val_predictions))

# Calculate MAE
val_mae = mean_absolute_error(val_labels, val_predictions)

print("Validation RMSE:", val_rmse)
print("Validation MAE:", val_mae)

Validation RMSE: 10.255140313423135
Validation MAE: 7.759831389867411


In [34]:
from sklearn.metrics import roc_auc_score

# Convert actual BMI values to binary labels (0 or 1)
actual_labels = pd.cut(val_results['Actual BMI'], bins=[0, 24.9, np.inf], labels=[0, 1])

# Calculate AUC
auc = roc_auc_score(actual_labels, val_results['Predicted BMI'])

# Print AUC
print("AUC:", auc)

AUC: 0.5083495987855129


In [35]:
from sklearn.metrics import accuracy_score, r2_score

# Calculate R2 score
val_r2 = r2_score(val_results['Actual BMI'], val_results['Predicted BMI'])
print("R2 Score:",val_r2)

R2 Score: -0.16708693687692722


In [36]:
import pandas as pd

# Assuming you have a DataFrame called 'df'
# ...

# Save the DataFrame as a CSV file
val_results.to_csv('predictions_trial.csv', index=False)
