In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Data Cleaning and Preparation

In [2]:
# Load the health-indicators CSV into a DataFrame, then summarise its columns
# (non-null counts and dtypes) with DataFrame.info().
data_path = "diabetes_012_health_indicators_BRFSS2015.csv"
diabetes = pd.read_csv(data_path)
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [3]:
# The columns listed by info() above are float64; cast every column of the
# frame to int.
diabetes = diabetes.astype(dtype=int)

In [4]:
# Binarise the label in two ordered steps: first fold "prediabetes" (1) into
# "no diabetes" (0), then relabel "diabetes" (2) as 1. The order matters:
# doing the 2 -> 1 step first would let the 1 -> 0 step erase those rows.
# NOTE: re-running this cell on already-binarised data turns every 1 into 0.
label_col = 'Diabetes_012'
diabetes[label_col] = diabetes[label_col].replace(1, 0).replace(2, 1)

# display the converted label column
diabetes[label_col]

0         0
1         0
2         0
3         0
4         0
         ..
253675    0
253676    1
253677    0
253678    0
253679    1
Name: Diabetes_012, Length: 253680, dtype: int64

In [5]:
# Class balance of the binarised label: number of rows per Diabetes_012 value
uniq = diabetes.loc[:, 'Diabetes_012'].value_counts()

print(uniq)

0    218334
1     35346
Name: Diabetes_012, dtype: int64


In [6]:
# Even out the classes: count the diabetes rows (label 1), then draw that many
# no-diabetes rows (label 0) using a fixed seed so the draw is repeatable.
diabetes_label = diabetes['Diabetes_012']
num_ones = diabetes_label.eq(1).sum()

# random subset of the no-diabetes rows, same size as the diabetes group
no_diabetes = diabetes.loc[diabetes_label.eq(0)].sample(n=num_ones, random_state=42)

# check: the subset should contain label 0 only
uniq_no = no_diabetes['Diabetes_012'].value_counts()
print(uniq_no)

0    35346
Name: Diabetes_012, dtype: int64


In [7]:
# Rows labelled as diabetes (1); the no-diabetes rows come from `no_diabetes`
diabetes_users = diabetes.loc[diabetes['Diabetes_012'].eq(1)]

# Cap each group at n rows (to reduce computation); fixed seed for repeatability
n = 500
no_diabetes = no_diabetes.sample(n=n, random_state=42)
diabetes_users = diabetes_users.sample(n=n, random_state=42)

# Stack both groups, no-diabetes rows first, renumbering the rows from 0
diabetes_balanced = pd.concat([no_diabetes, diabetes_users], ignore_index=True)

# Save the balanced dataset to CSV (row index is not written)
diabetes_balanced.to_csv('balanced_diabetes_data.csv', index=False)

In [8]:
# Verify the balanced sample: per-class row counts, then the total row count
balanced_counts = diabetes_balanced['Diabetes_012'].value_counts()
print(balanced_counts)
print(diabetes_balanced.shape[0])

0    500
1    500
Name: Diabetes_012, dtype: int64
1000


## Collaborative Filtering

In [10]:
def _first_match_pairs(similarity, labels, threshold):
    """Pair each label-1 row with the first label-0 row it is similar to.

    For every row labelled 1, find the lowest-positioned row labelled 0 whose
    similarity to it is >= ``threshold``. Each label-1 row appears at most
    once in the result. This is a vectorised equivalent of scanning label-0
    rows in order and claiming every not-yet-claimed label-1 row at or above
    the threshold.

    Parameters
    ----------
    similarity : 2-D numpy array, square, indexed by row position.
    labels : 1-D array-like of 0/1 labels, aligned with ``similarity``.
    threshold : float, minimum similarity (inclusive) for a pair to count.

    Returns
    -------
    pandas.DataFrame with columns ``user_1`` (label-0 position), ``user_2``
    (label-1 position), ``similarity``, ``non_diabetes_user`` and
    ``diabetes_user`` (the two labels), sorted by ``user_1`` then ``user_2``.
    The columns are present even when no pair qualifies.
    """
    labels = np.asarray(labels)
    zero_idx = np.flatnonzero(labels == 0)
    one_idx = np.flatnonzero(labels == 1)

    # rows: label-0 individuals, columns: label-1 individuals
    above = similarity[np.ix_(zero_idx, one_idx)] >= threshold

    if above.size:
        has_match = above.any(axis=0)
        # argmax on a boolean column returns the position of its first True,
        # i.e. the earliest label-0 row that reaches the threshold
        first_hit = above.argmax(axis=0)[has_match]
        user_1 = zero_idx[first_hit]
        user_2 = one_idx[has_match]
    else:
        # one of the groups is empty: no pairs (also avoids argmax on an empty axis)
        user_1 = user_2 = np.empty(0, dtype=np.intp)

    # lexsort's last key is the primary one: order by user_1, then user_2
    order = np.lexsort((user_2, user_1))
    user_1, user_2 = user_1[order], user_2[order]

    return pd.DataFrame({
        'user_1': user_1,  # Non-diabetes user
        'user_2': user_2,  # Diabetes user
        'similarity': similarity[user_1, user_2],
        'non_diabetes_user': labels[user_1],
        'diabetes_user': labels[user_2],
    })


# Extract features (excluding the Diabetes_012 label)
features = diabetes_balanced.drop(columns=['Diabetes_012'])

# Compute cosine similarity matrix
# NOTE(review): features are not scaled first, so columns with larger numeric
# ranges weigh more in the cosine similarity than 0/1 indicators; confirm this
# is intended or standardise the features beforehand.
similarity_matrix = cosine_similarity(features)

print("Cosine similarity matrix shape:", similarity_matrix.shape)

# Set a similarity threshold
threshold = 0.99

# One row per diabetes individual that has at least one non-diabetes individual
# at or above the threshold, paired with the first such non-diabetes individual
misdiagnoses_df = _first_match_pairs(
    similarity_matrix,
    diabetes_balanced['Diabetes_012'].to_numpy(),
    threshold,
)

# Same information in the list / set form, kept for any later cell that uses them
misdiagnoses = misdiagnoses_df.to_dict('records')
added_diabetes_ones = set(misdiagnoses_df['user_2'].tolist())

print("Number of misdiagnosed pairs:", len(misdiagnoses_df))
print(misdiagnoses_df)
num_rows = misdiagnoses_df.shape[0]
print("There are", num_rows, "misdiagnoses.")

Cosine similarity matrix shape: (1000, 1000)
Number of misdiagnosed pairs: 459
     user_1  user_2  similarity  non_diabetes_user  diabetes_user
0         0     521    0.990026                  0              1
1         0     548    0.992026                  0              1
2         0     794    0.996419                  0              1
3         0     917    0.990252                  0              1
4         0     932    0.991719                  0              1
..      ...     ...         ...                ...            ...
454     444     605    0.992914                  0              1
455     444     768    0.990722                  0              1
456     458     506    0.990362                  0              1
457     477     831    0.990653                  0              1
458     477     929    0.992318                  0              1

[459 rows x 5 columns]
There are 459 misdiagnoses.
