In [None]:
import pandas as pd
import numpy as np


In [None]:
# Read the raw workbook into `df`; later cells operate on this frame.
# NOTE(review): absolute path (looks like a Colab upload location) — adjust when running elsewhere.
RAW_DATA_PATH = "/content/20-05-2024_TextNorm.xlsx"
df = pd.read_excel(RAW_DATA_PATH)

In [None]:
# The "Blood Pressure" column holds string categories, and a separate "Level of BP" column
# already encodes the corresponding level as an integer from 0 to 4, so the level can stand in for the category.
# "Sleep" was recorded as yes/no; it is converted into estimated hours using the values of the remaining columns.
# The activity score column is likewise derived from the remaining column values.
# The final dataset will contain: sleep hours, level of BP, heart rate, activity score, stress, anxiety,
# hypertension, depression, and non-wellbeing.
# Blood pressure, heart rate, and activity can be collected from a smartwatch if the user agrees to share that data.
# We will train a model that predicts stress, anxiety, hypertension, depression, and non-wellbeing from the
# information provided through the user's watch, and append those predictions to that user's conversation prompts when chatting with the LLM.
# If the user's watch data cannot be accessed, the LLM simply works from the conversation alone.
# The chats can also be used to generate a final score for the user's mental state; if the user is highly vulnerable to depression, we can alert them.

In [None]:
'''
Formula for Sleep Hours Calculation
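I estimated sleep hours based on the relationship between stress, anxiety, depression, and non-wellbeing, as these factors are known to impact sleep quality. A possible formula is given below. Note that the preprocessing cell in this notebook currently implements a simpler variant: Sleep Hours = 8 − (0.3×Stress + 0.3×Anxiety + 0.4×Depression), clipped to the range 4 to 10 hours.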

Sleep Hours=8.5−(0.5×Stress)−(0.4×Anxiety)−(0.6×Depression)−(0.3×Non-Wellbeing)
Baseline sleep is assumed to be 8.5 hours, representing an ideal sleep duration.

Each mental health factor (stress, anxiety, depression) reduces sleep by a certain fraction.

Non-Wellbeing reduces sleep further, assuming a general health issue impact.

To ensure sleep hours stay within a valid range:

Sleep Hours=max(3,min(10,Sleep Hours))
This keeps sleep in a reasonable range of 3 to 10 hours.

Formula for Activity Score Calculation
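The activity score is based on factors like heart rate, BMI, smoking, and sleep. A possible formula is given below. Note that the preprocessing cell in this notebook currently implements a different variant: 50×(Heart Rate ÷ max Heart Rate) + 30÷(BMI Index + 1) + 20÷(Age + 1), rounded to two decimals and then min–max scaled to 0–100.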

Activity Score=(0.3×Heart Rate)−(2×BMI Index)−(3×Smoking)+(2×Sleep Hours)
Heart rate positively contributes to activity (higher heart rate → more active).

BMI index negatively affects activity (higher BMI → lower movement).

Smoking reduces activity level significantly.

Sleep Hours positively contribute (better sleep → more energy).

To normalize the values:

Activity Score=max(0,min(100,Activity Score))
'''

In [None]:
# Show the distinct values of the textual BP category and of its numeric level,
# to see how the two columns relate before encoding.
bp_categories = df["Blood Pressure"].unique()
bp_levels = df["Level of BP"].unique()

# sep="\n" puts each array on the line after its heading.
print("Blood Pressure Column Unique Values:", bp_categories, sep="\n")
print("\nLevel of BP Column Unique Values:", bp_levels, sep="\n")


Blood Pressure Column Unique Values:
['Normal' 'High blood' 'Elevated' 'Hypertensive crisis']

Level of BP Column Unique Values:
[0 3 1 2 4]


In [None]:
# List every distinct (category, level) combination that occurs in the data.
bp_pairs = df.loc[:, ["Blood Pressure", "Level of BP"]].drop_duplicates()
print(bp_pairs)


          Blood Pressure  Level of BP
0                 Normal            0
54            High blood            3
62              Elevated            1
206           High blood            2
314  Hypertensive crisis            4


In [None]:
import pandas as pd
import numpy as np

# Feature engineering: derive the model-ready table from the frame `df` loaded earlier.
#
# Steps: normalise column names, encode smoking and blood-pressure level, coerce the
# numeric inputs, derive `sleep hours` and `activity score`, keep only the model
# columns, and write them to `processed_dataset.xlsx`.
# Each step is guarded on the columns it needs, so a missing optional input (or a
# re-run on the already-reduced frame) does not raise halfway through the cell.
# NOTE(review): `df` is rebound to the reduced frame at the end because the following
# cell reads `df`; re-run the load cell to get the raw data back.

# Standardize column names (strip spaces & convert to lowercase)
df.columns = df.columns.str.strip().str.lower()

# Print actual column names to debug
print("Columns in dataset:", df.columns.tolist())

# Encode 'smoking' as 1 when the text mentions "smoke" without "do not", else 0.
# Skipped when the column is already numeric: the text test would otherwise turn
# every numeric value into 0.
if 'smoking' in df.columns and not pd.api.types.is_numeric_dtype(df['smoking']):
    smoking_text = df['smoking'].astype(str).str.lower()
    is_smoker = (
        smoking_text.str.contains('smoke', regex=False)
        & ~smoking_text.str.contains('do not', regex=False)
    )
    df['smoking'] = is_smoker.astype(int)

# Blood-pressure level.
# The dataset already records 'level of bp' on a 0-4 scale, where 'High blood' occurs
# with both 2 and 3 and 'Hypertensive crisis' with 4. Re-deriving the level from the
# text category alone would collapse 2/3 and shift the scale, so the recorded level is
# kept and the text category is used only to fill rows where the level is missing.
bp_mapping = {
    'normal': 0,
    'elevated': 1,
    'high blood': 2,           # category alone cannot distinguish 2 from 3; use the lower level
    'hypertensive crisis': 4,  # matches the level recorded for this category in the data
}
if 'blood pressure' in df.columns:
    mapped_level = df['blood pressure'].astype(str).str.strip().str.lower().map(bp_mapping)
    if 'level of bp' in df.columns:
        recorded_level = pd.to_numeric(df['level of bp'], errors='coerce')
        df['level of bp'] = recorded_level.fillna(mapped_level)
    else:
        df['level of bp'] = mapped_level

# Ensure numerical values for processing.
# The median is taken from the coerced series: the raw column may still hold strings,
# on which .median() fails or is meaningless.
for col in ['stress', 'anxiety', 'depression', 'heart rate', 'bmi_index', 'smoking', 'age']:
    if col in df.columns:
        as_number = pd.to_numeric(df[col], errors='coerce')
        df[col] = as_number.fillna(as_number.median())

# Compute Sleep Hours (based on Stress, Anxiety, and Depression levels)
if {'stress', 'anxiety', 'depression'}.issubset(df.columns):
    df['sleep hours'] = 8 - (df['stress'] * 0.3 + df['anxiety'] * 0.3 + df['depression'] * 0.4)

# Clip sleep hours to a realistic range (4 to 10 hours)
if 'sleep hours' in df.columns:
    df['sleep hours'] = df['sleep hours'].clip(4, 10)

# Compute Activity Score (simulating smartwatch data)
if {'heart rate', 'bmi_index', 'age'}.issubset(df.columns):
    df['activity score'] = (
        (df['heart rate'] / df['heart rate'].max()) * 50 +
        (1 / (df['bmi_index'] + 1)) * 30 +
        (1 / (df['age'] + 1)) * 20
    ).round(2)

# Min-max normalize Activity Score to the range 0-100.
if 'activity score' in df.columns:
    score_min = df['activity score'].min()
    score_span = df['activity score'].max() - score_min
    if score_span > 0:
        df['activity score'] = (df['activity score'] - score_min) / score_span * 100
    elif score_span == 0:
        # All scores identical: avoid 0/0 -> NaN and place every row at the bottom of the scale.
        df['activity score'] = 0.0

# Select required columns, failing with an explicit message if any is unavailable.
model_columns = ['sleep hours', 'activity score', 'level of bp', 'heart rate', 'stress',
                 'anxiety', 'depression', 'hypertension', 'non-wellbeing']
missing_columns = [name for name in model_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"Cannot build processed dataset; missing columns: {missing_columns}")
df = df[model_columns].copy()

# Save updated dataset
df.to_excel('processed_dataset.xlsx', index=False)

# Show final dataset preview
print(df.head())


Columns in dataset: ['type of data', 'patient_id', 'kv100', 'm', 'δ', 't', 'a365', 'a460', 'anadn', 'pom', 'ae', 'an', 'am', 'ar', 'ac', 'f_ae', 'f_an', 'f_am', 'f_ar', 'f_ac', 'ethnicity', 'left or right', 'sleep', 'gender', 'race', 'type of skins', 'age', 'weight', 'height', 'bmi_index', 'smoking', 'number of cigarettes per week if smoked:', 'disease', 'blood pressure', 'level of bp', 'heart rate', 'dass_21', 'stress', 'anxiety', 'depression', 'hypertension', 'non-wellbeing', 'sleep hours']
   sleep hours  activity score  level of bp  heart rate  stress  anxiety  \
0          8.0       62.080130            0        95.0       0        0   
1          8.0       62.080130            0        95.0       0        0   
2          8.0       62.080130            0        95.0       0        0   
3          8.0       62.080130            0        95.0       0        0   
4          8.0       18.494537            0        70.0       0        0   

   depression  hypertension  non-wellbeing  


In [None]:
# Label columns whose distinct values we want to inspect in `df`.
# Values 0 to 4 indicate: zero, mild, moderate, severe, extremely severe.
# Hypertension and non-wellbeing only take 0 and 1, i.e. a yes/no flag.
cols_to_check = ['stress', 'anxiety', 'depression', 'hypertension', 'non-wellbeing']

# Report the distinct values of every listed column that is present in the frame.
for col in [name for name in cols_to_check if name in df.columns]:
    print(f"Unique values in '{col}': {df[col].unique()}")


Unique values in 'stress': [0 1 2 3]
Unique values in 'anxiety': [0 2 4 1 3]
Unique values in 'depression': [0 2 3 1 4]
Unique values in 'hypertension': [0 1]
Unique values in 'non-wellbeing': [0 1]
