# IMPORT LIBRARIES AND SET CONSTANTS

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_PATH  = Path('../data')
RAW_DATA_PATH = DATA_PATH / 'raw'
PROCESSED_DATA_PATH = DATA_PATH / 'processed'

VAR_LIST_FILE_PATH = Path('../documentation/var_list_decription.txt')
RAW_FILE_PATH = RAW_DATA_PATH / 'LLCP2024.XPT'
PROCESSED_FILE_PATH_CSV = PROCESSED_DATA_PATH / 'converted_data.csv'

FINAL_FILE_PATH = PROCESSED_DATA_PATH / 'final_data.csv'

## CONVERT XPT TO CSV

In [6]:
def sas_to_csv(file_path: str, destination_path: str):
    try:
        df = pd.read_sas(file_path, encoding='utf-8')
    except Exception as e:
        print("Not valid SAS file!")
    else:
        try:
            df.to_csv(destination_path, index=False)
        except Exception:
            print("Failed while converting xpt file to csv file!")

In [7]:
sas_to_csv(RAW_FILE_PATH, PROCESSED_FILE_PATH_CSV)

## Data preprocessing
over viewing data

In [8]:
raw_df = pd.read_csv(PROCESSED_FILE_PATH_CSV)
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457670 entries, 0 to 457669
Columns: 301 entries, _STATE to _AIDTST4
dtypes: float64(298), int64(3)
memory usage: 1.0 GB


In [9]:
raw_df.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_LCSCTSN,_LCSPSTF,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK3,_RFDRHV9,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,2.0,2282024,2.0,28.0,2024,1100.0,2024000001,2024000000.0,1.0,...,,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,2.0,2212024,2.0,21.0,2024,1100.0,2024000002,2024000000.0,1.0,...,4.0,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0
2,1.0,2.0,2212024,2.0,21.0,2024,1100.0,2024000003,2024000000.0,1.0,...,4.0,2.0,1.0,100.0,2.0,1400.0,1.0,,,2.0
3,1.0,2.0,2282024,2.0,28.0,2024,1100.0,2024000004,2024000000.0,1.0,...,,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0
4,1.0,2.0,2212024,2.0,21.0,2024,1100.0,2024000005,2024000000.0,1.0,...,3.0,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0


In [11]:
NEW_VAR_NAMES = [
    "State",
    "Sex",
    "GeneralHealth",
    "PhysicalHealthDays",
    "MentalHealthDays",
    "LastCheckupTime",
    "PhysicalActivities",
    "SleepHours",
    "RemovedTeeth",
    "HadHeartAttack",
    "HadAngina",
    "HadStroke",
    "HadAsthma",
    "HadSkinCancer",
    "HadCOPD",
    "HadDepressiveDisorder",
    "HadKidneyDisease",
    "HadArthritis",
    "HadDiabetes",
    "DeafOrHardOfHearing",
    "BlindOrVisionDifficulty",
    "DifficultyConcentrating",
    "DifficultyWalking",
    "DifficultyDressingBathing",
    "DifficultyErrands",
    "SmokerStatus",
    "ECigaretteUsage",
    "ChestScan",
    "RaceEthnicityCategory",
    "AgeCategory",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
    "AlcoholDrinkers",
    "HIVTesting",
    "FluVaxLast12",
    "PneumoVaxEver",
    "TetanusLast10Tdap",
    "HighRiskLastYear",
    "CovidPos"
]

In [12]:
var_list_df = pd.read_csv(VAR_LIST_FILE_PATH, sep=' - ', header=None, names=['Variable', 'description'])
var_list_df

  var_list_df = pd.read_csv(VAR_LIST_FILE_PATH, sep=' - ', header=None, names=['Variable', 'description'])


Unnamed: 0,Variable,description
0,_STATE,State FIPS Code
1,SEXVAR,Sex of Respondent
2,GENHLTH,Would you say that in general your health is:
3,PHYSHLTH,"Now thinking about your physical health, which..."
4,MENTHLTH,"Now thinking about your mental health, which i..."
5,CHECKUP1,About how long has it been since you last visi...
6,EXERANY2,"During the past month, other than your regular..."
7,SLEPTIM1,"On average, how many hours of sleep do you get..."
8,RMVTETH4,Not including teeth lost for injury or orthodo...
9,CVDINFR4,"(Ever told) you had a heart attack, also calle..."


In [15]:
var_list = var_list_df['Variable'].to_numpy()
var_list

array(['_STATE', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'CHECKUP1',
       'EXERANY2', 'SLEPTIM1', 'RMVTETH4', 'CVDINFR4', 'CVDCRHD4',
       'CVDSTRK3', 'ASTHMA3', 'CHCSCNC1', 'CHCCOPD3', 'ADDEPEV3',
       'CHCKDNY2', 'HAVARTH4', 'DIABETE4', 'DEAF', 'BLIND', 'DECIDE',
       'DIFFWALK', 'DIFFDRES', 'DIFFALON', '_SMOKER3', 'ECIGNOW2',
       'LCSCTSC1', '_RACEGR4', '_AGEG5YR', 'HTM4', 'WTKG3', '_BMI5',
       'DRNKANY6', '_AIDTST4', 'FLUSHOT7', 'PNEUVAC4', 'TETANUS1',
       'HIVRISK5', 'COVIDPOS'], dtype=object)

In [16]:
heart_df = raw_df[var_list]

KeyError: "['SLEPTIM1', 'ECIGNOW2', '_RACEGR4', 'COVIDPOS'] not in index"