In [None]:
import difflib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

 
# Step 1: Load and Preprocess the Data
 
# Load the data file (adjust the path as needed)
data = pd.read_csv('export.csv')

# Replace unwanted characters so classification is more precise
data['trackable_name'] = data['trackable_name'].astype(str).replace(r'[\n\r\'"0123456789*,.<>/?:;-_`~=+]+', '', regex=True)

# Convert the trackable_value to numeric
data['trackable_value'] = pd.to_numeric(data['trackable_value'], errors='coerce')
# Drop any rows where conversion failed
data = data.dropna(subset=['trackable_value'])

 
# Step 2: Create the Feature Matrix from Symptoms
 
# Filter to only include self-reported symptoms
symptoms = data[data['trackable_type'] == 'Symptom']

# Pivot the data so that each row (a patient) has one column per symptom, with the value being the average rating
symptom_matrix = symptoms.pivot_table(
    index='user_id',
    columns='trackable_name',
    values='trackable_value',
    aggfunc='mean'
).fillna(0)

 
# Step 3: Derive Ground Truth Labels from Conditions
 
# Filter to include only conditions
conditions = data[data['trackable_type'] == 'Condition']

# For each patient, select their primary condition
# Use the mode (most frequently reported condition) as the label
def mode_func(x):
    try:
        return x.mode()[0]
    except:
        return np.nan

condition_labels = conditions.groupby('user_id')['trackable_name'].agg(mode_func)
condition_labels = condition_labels.dropna()

 
# Step 4: Merge Features and Labels
 
# Merge the symptom fingerprint with the condition labels
df = symptom_matrix.merge(condition_labels.rename('condition'), left_index=True, right_index=True)
df = df.reset_index()
print("Merged dataset shape:", df.shape)
# Calculate frequency of each condition
condition_counts = df['condition'].value_counts()

# Keep only conditions with at least 2 samples for train/test split
valid_conditions = condition_counts[condition_counts >= 2].index
df = df[df['condition'].isin(valid_conditions)]
 
# Step 5: Prepare Data for Classification
 
# Separate features (X) and labels (y). Drop the user_id
X = df.drop(['user_id', 'condition'], axis=1).values
y = df['condition'].values

# Standardize the feature values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To later visualize the split, record the original indices
indices = np.arange(len(y))
# Split into training (80%) and test (20%) sets. Stratify by the condition label
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_scaled, y, indices, test_size=0.2, random_state=42, stratify=y
)

 
# Step 6: Train the Classifier
 
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

  data = pd.read_csv('export.csv')


Merged dataset shape: (9038, 5848)
Test Accuracy: 0.13827893175074185
Classification Report:
                                              precision    recall  f1-score   support

                                                   0.20      0.49      0.29       266
                                                   0.00      0.00      0.00         1
                                   disorders       0.00      0.00      0.00         9
                                      issues       0.00      0.00      0.00         1
                     sleep behavior disorder       0.00      0.00      0.00         1
                                           -       0.00      0.00      0.00         6
                                      abesia       0.00      0.00      0.00         2
                                     ace ain       0.00      0.00      0.00         1
                                       aches       0.00      0.00      0.00         1
                                 achy joints  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def predict_condition(symptom, severity, feature_names, scaler, clf):
    
    # Create a zero vector with shape (1, number of features)
    input_vector = np.zeros((1, len(feature_names)))
    
    # Check if the symptom is present in the features
    if symptom not in feature_names:
        print(f"Symptom '{symptom}' not recognized.")
        print("Available symptoms are:")
        for feat in feature_names:
            print(f" - {feat}")
        return None
    
    # Set the input severity for the specified symptom
    idx = feature_names.index(symptom)
    input_vector[0, idx] = severity
    
    # Scale the input vector using the previously fitted scaler
    input_vector_scaled = scaler.transform(input_vector)
    
    # Predict using the trained classifier
    prediction = clf.predict(input_vector_scaled)
    return prediction[0]

# INTERACTION SECTION
# Get the list of feature names (symptoms) from the pivot table columns.
feature_names = list(symptom_matrix.columns)

# Prompt the user for input.
symptom_input = input("Enter the symptom name (exactly as in the data): ").strip()
try:
    severity_input = float(input("Enter the severity (numeric, e.g., 0 to 4): "))
except ValueError:
    print("Invalid severity input. Please enter a numeric value.")
    severity_input = None

if severity_input is not None:
    predicted_condition = predict_condition(symptom_input, severity_input, feature_names, scaler, clf)
    if predicted_condition is not None:
        print(f"Predicted Condition: {predicted_condition}")


Invalid severity input. Please enter a numeric value.
