# Data exploration 

Dataset: https://www.kaggle.com/datasets/abdullah0a/comprehensive-weight-change-prediction

This dataset contains features describing factors that may influence weight gain or loss, such as age, daily caloric intake, physical activity level, sleep quality and stress level.

### Data Cleaning

In [19]:
# Display the column labels of `df` as they currently stand.
df.columns

Index(['Age', 'Gender', 'Current_Weight_(lbs)', 'BMR_(Calories)',
       'Daily_Calories_Consumed', 'Daily_Caloric_Surplus/Deficit',
       'Weight_Change_(lbs)', 'Duration_(weeks)', 'Physical_Activity_Level',
       'Sleep_Quality', 'Stress_Level', 'Final_Weight_(lbs)'],
      dtype='object')

In [20]:
# Swap every space in the column labels for an underscore. The pattern is a
# plain character, so it is matched literally rather than as a regex.
df.columns = df.columns.str.replace(' ', '_', regex=False)

# Echo the resulting labels.
df.columns

Index(['Age', 'Gender', 'Current_Weight_(lbs)', 'BMR_(Calories)',
       'Daily_Calories_Consumed', 'Daily_Caloric_Surplus/Deficit',
       'Weight_Change_(lbs)', 'Duration_(weeks)', 'Physical_Activity_Level',
       'Sleep_Quality', 'Stress_Level', 'Final_Weight_(lbs)'],
      dtype='object')

In [21]:
# Count the missing values in each column.
df.isna().sum()

Age                              0
Gender                           0
Current_Weight_(lbs)             0
BMR_(Calories)                   0
Daily_Calories_Consumed          0
Daily_Caloric_Surplus/Deficit    0
Weight_Change_(lbs)              0
Duration_(weeks)                 0
Physical_Activity_Level          0
Sleep_Quality                    0
Stress_Level                     0
Final_Weight_(lbs)               0
dtype: int64

In [22]:
# Work on a copy so that `df` keeps its original string labels.
data_encoded = df.copy()

# Explicit label -> integer code tables, one per categorical column.
gender_mapping = {'M': 0, 'F': 1}
activity_mapping = {'Sedentary': 0, 'Lightly Active': 1,
                    'Moderately Active': 2, 'Very Active': 3}
sleep_mapping = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3}

column_mappings = {
    'Gender': gender_mapping,
    'Physical_Activity_Level': activity_mapping,
    'Sleep_Quality': sleep_mapping,
}

for column, mapping in column_mappings.items():
    # Always encode from the untouched `df`, so re-running this cell gives the
    # same result.
    data_encoded[column] = df[column].map(mapping)

    # Series.map turns any label that is absent from the mapping into NaN
    # without warning. Fail loudly here instead of letting those NaNs leak
    # into the plots and correlations computed from `data_encoded`.
    unmapped = df.loc[data_encoded[column].isna() & df[column].notna(), column]
    if not unmapped.empty:
        raise ValueError(
            f"Unmapped values in {column!r}: {unmapped.unique().tolist()}")

In [9]:
# Grid of pairwise plots built from the un-encoded frame `df`.
sns.pairplot(data=df)

In [10]:
# Number of rows per Gender value, taken from the un-encoded frame.
gender_counts = df['Gender'].value_counts()

# Draw the counts as a pie chart on an explicitly created 6x6 axes.
fig, ax = plt.subplots(figsize=(6, 6))
gender_counts.plot.pie(
    ax=ax, autopct='%1.1f%%', colors=['salmon', 'skyblue'])
ax.set_title('Gender Distribution')
ax.set_ylabel('')  # clear the y-axis label
plt.show()

In [12]:
# Histogram (20 bins) with a KDE overlay of the weight-change column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_encoded['Weight_Change_(lbs)'],
             bins=20, kde=True, color='turquoise', ax=ax)
ax.set(title='Weight Change Distribution',
       xlabel='Weight Change (lbs)',
       ylabel='Frequency')
plt.show()

In [13]:
# Keep only the float64/int64 columns of the encoded frame and correlate them.
numeric_cols = data_encoded.select_dtypes(include=['float64', 'int64'])
correlations = numeric_cols.corr()

# Annotated heatmap of the correlation matrix, two decimals per cell.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap='BuPu', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Histogram (10 bins) with a KDE overlay of the daily caloric surplus/deficit,
# drawn on an explicitly created axes at the default figure size.
fig, ax = plt.subplots()
sns.histplot(data=data_encoded["Daily_Caloric_Surplus/Deficit"],
             bins=10, kde=True, color='orange', edgecolor='black', ax=ax)

# Title and axis labels
ax.set_title("Distribution of Daily Caloric Surplus/Deficit")
ax.set_xlabel("Caloric Surplus/Deficit")
ax.set_ylabel("Number of People")

# Render the figure
plt.show()