---
title: "Data Cleaning"
format: html
code-fold: true
---

In [2]:
import pyreadr
import pandas as pd
from IPython.display import display

## HINTS Preparation of Data

In [3]:
# Load the .rda file
result = pyreadr.read_r('/Users/elizabethkovalchuk/Documents/DSAN6000/Project/fall-2024-project-team-35/data/HINTS6_R_20240524/hints6_public.rda')

# Extract the DataFrame from the loaded data
hints = result['public']  # Assuming 'public' is the name of the R object in the file

# Specify the columns to select
columns = [
    "HHID", "SeekCancerInfo", "CancerFrustrated", "CancerTrustDoctor",
    "CancerTrustFamily", "CancerTrustGov", "CancerTrustCharities",
    "CancerTrustReligiousOrgs", "CancerTrustScientists", "Electronic2_HealthInfo",
    "MisleadingHealthInfo", "TrustHCSystem"
]

# Select the relevant columns
hints_select = hints[columns]

# # Convert the 'updatedate' column if required (commented for now)
# hints_select['updatedate'] = pd.to_datetime(hints_select['updatedate'] / 1000, unit='s')

# Preview the first few rows
print("Sample data from the HINTS dataset:")
display(hints_select.head())
print(f"Shape of the original dataset: {hints_select.shape}")

Sample data from the HINTS dataset:


Unnamed: 0,HHID,SeekCancerInfo,CancerFrustrated,CancerTrustDoctor,CancerTrustFamily,CancerTrustGov,CancerTrustCharities,CancerTrustReligiousOrgs,CancerTrustScientists,Electronic2_HealthInfo,MisleadingHealthInfo,TrustHCSystem
0,21000006,No,"Inapplicable, coded 2 in SeekCancerInfo",A lot,Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),Question answered in error (Commission Error),I do not use social media,Very
1,21000009,No,"Inapplicable, coded 2 in SeekCancerInfo",A lot,Some,A lot,Some,Some,A lot,Yes,I do not use social media,Very
2,21000020,Yes,Somewhat disagree,A lot,Some,Some,A little,Not at all,A lot,Yes,Some,Somewhat
3,21000022,No,"Inapplicable, coded 2 in SeekCancerInfo",A lot,Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),Missing data (Not Ascertained),"Inapplicable, coded 2 in UseInternet",I do not use social media,Somewhat
4,21000039,No,"Inapplicable, coded 2 in SeekCancerInfo",Some,Some,Some,Not at all,Not at all,Some,Yes,A lot,Somewhat


Shape of the original dataset: (6252, 12)


In [4]:
# Count missing values in each column
missing_values = hints_select.isna().sum()

# Display the count of missing values
print("Missing values per column:")
display(missing_values)


Missing values per column:


HHID                        0
SeekCancerInfo              0
CancerFrustrated            0
CancerTrustDoctor           0
CancerTrustFamily           0
CancerTrustGov              0
CancerTrustCharities        0
CancerTrustReligiousOrgs    0
CancerTrustScientists       0
Electronic2_HealthInfo      0
MisleadingHealthInfo        0
TrustHCSystem               0
dtype: int64

In [5]:
# List of ordinal columns
ordinal_columns = [
    "SeekCancerInfo", "CancerFrustrated", "CancerTrustDoctor",
    "CancerTrustFamily", "CancerTrustGov", "CancerTrustCharities",
    "CancerTrustReligiousOrgs", "CancerTrustScientists", "Electronic2_HealthInfo",
    "MisleadingHealthInfo", "TrustHCSystem"
]

# Display unique values for each ordinal column
print("Unique values for ordinal columns:")
for column in ordinal_columns:
    unique_values = hints_select[column].unique()
    print(f"\nColumn: {column}")
    print(f"Unique Values: {unique_values}")


Unique values for ordinal columns:

Column: SeekCancerInfo
Unique Values: ['No', 'Yes', 'Missing data (Not Ascertained)']
Categories (3, object): ['Missing data (Not Ascertained)', 'No', 'Yes']

Column: CancerFrustrated
Unique Values: ['Inapplicable, coded 2 in SeekCancerInfo', 'Somewhat disagree', 'Strongly disagree', 'Somewhat agree', 'Strongly agree', 'Question answered in error (Commission Error)', 'Missing data (Filter Missing)', 'Missing data (Not Ascertained)', 'Multiple responses selected in error']
Categories (9, object): ['Inapplicable, coded 2 in SeekCancerInfo', 'Missing data (Filter Missing)', 'Missing data (Not Ascertained)', 'Multiple responses selected in error', ..., 'Somewhat agree', 'Somewhat disagree', 'Strongly agree', 'Strongly disagree']

Column: CancerTrustDoctor
Unique Values: ['A lot', 'Some', 'Not at all', 'A little', 'Missing data (Not Ascertained)', 'Multiple responses selected in error']
Categories (6, object): ['A little', 'A lot', 'Missing data (Not Asce

In [6]:
# Define the valid scales for each column
valid_scales = {
    "CancerFrustrated": ['Somewhat disagree', 'Strongly disagree', 'Somewhat agree', 'Strongly agree'],
    "CancerTrustDoctor": ['A lot', 'Some', 'Not at all', 'A little'],
    "CancerTrustFamily": ['A lot', 'Some', 'Not at all', 'A little'],
    "CancerTrustGov": ['A lot', 'Some', 'Not at all', 'A little'],
    "CancerTrustCharities": ['A lot', 'Some', 'Not at all', 'A little'],
    "CancerTrustReligiousOrgs": ['A lot', 'Some', 'Not at all', 'A little'],
    "CancerTrustScientists": ['A lot', 'Some', 'Not at all', 'A little'],
    "TrustHCSystem": ['A lot', 'Some', 'Not at all', 'A little'],
    "Electronic2_HealthInfo": ['Yes', 'No'], 
    "MisleadingHealthInfo": ['I do not use social media', 'None', 'A little', 'Some', 'A lot']  
}

# Create a copy of the original DataFrame
hints_cleaned = hints_select.copy()

# Filter the DataFrame
for column, scale in valid_scales.items():
    hints_cleaned = hints_cleaned[hints_cleaned[column].isin(scale)]

# Display the cleaned dataset and its shape
print("Data after filtering invalid values:")
display(hints_cleaned.head())
print(f"Shape of the cleaned dataset: {hints_cleaned.shape}")

Data after filtering invalid values:


Unnamed: 0,HHID,SeekCancerInfo,CancerFrustrated,CancerTrustDoctor,CancerTrustFamily,CancerTrustGov,CancerTrustCharities,CancerTrustReligiousOrgs,CancerTrustScientists,Electronic2_HealthInfo,MisleadingHealthInfo,TrustHCSystem
51,21000330,Yes,Somewhat disagree,Some,Not at all,Some,Some,Not at all,A lot,Yes,A lot,A little
112,21000976,Yes,Somewhat agree,A lot,Some,Some,Some,Some,A lot,Yes,Some,A little
136,21001112,Yes,Somewhat disagree,A little,A little,Not at all,Not at all,Not at all,A little,No,A lot,Not at all
157,21001283,Yes,Somewhat disagree,A lot,Some,Not at all,A little,Some,Not at all,No,I do not use social media,Not at all
181,21001548,Yes,Strongly agree,A lot,Some,Not at all,Some,A lot,A little,Yes,Some,A little


Shape of the cleaned dataset: (323, 12)


In [8]:
# Count unique values in the 'SeekCancerInfo' column
value_counts = hints_cleaned['SeekCancerInfo'].value_counts()
print("Unique value counts in 'SeekCancerInfo':")
print(value_counts)

# Save the cleaned dataset to an Excel file
output_file = "../data/csv/hints_cleaned_forML_spearman.xlsx"
hints_cleaned.to_excel(output_file, index=False)

print(f"Cleaned dataset saved as {output_file}")


Unique value counts in 'SeekCancerInfo':
SeekCancerInfo
Yes                               323
Missing data (Not Ascertained)      0
No                                  0
Name: count, dtype: int64
Cleaned dataset saved as ../data/csv/hints_cleaned_forML_spearman.xlsx
