In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
prostate_cancer_data18 = pd.read_csv('adult_18.csv')

In [3]:
# Get the list of columns
columns_list = prostate_cancer_data18.columns.tolist()
columns_list
prostate_cancer_data18.shape

(25417, 742)

In [4]:
prostate_cancer_data18 = prostate_cancer_data18[prostate_cancer_data18.CNKIND20.notna()]
#prostate_cancer_data18 = pd.read_csv(prostate_cancer_data18.csv)
prostate_cancer_data18.shape

(1167, 742)

In [5]:
prostate_cancer_data18.to_csv('prostate_cancer_data18.csv', index=False)

#Renaming the Prostate Cancer column to PROSTCAN_A
prostate_cancer_data18.rename(columns={'CNKIND20':'PROSTCAN_A','AGE_P':'AGEP_A','CNKIND22':'SKNNMCAN_A',
                                      'CNKIND16':'SKNMCAN_A', 'CNKIND31':'OTHERCANP_A','CNKIND23':'SKNDCAN_A', 
                                       'CNKIND16':'MELANCAN_A','AWEIGHTP':'WEIGHTLBTC_A','CNKIND14':'LUNGCAN_A',
                                       'CNKIND15':'LYMPHCAN_A','CNKIND7':'COLONCAN_A','AHEIGHT':'HEIGHTTC_A',
                                       'CNKIND1':'BLADDCAN_A','CNKIND19':'PANCRCAN_A','CNKIND27':'THROACAN_A',
                                       'CNKIND8':'ESOPHCAN_A','CNKIND21':'RECTUCAN_A','CNKIND9':'GALLBCAN_A',
                                       'CNKIND3':'BONECAN_A','CNKIND12':'LEUKECAN_A','CNKIND17':'MOUTHCAN_A',
                                       'CNKIND13':'LIVERCAN_A','DEP_1':'DEPFREQ_A','CNKIND11':'LARYNCAN_A',
                                       'PAIN_2A':'PAIFRQ3M_A','CNKIND5':'BREASCAN_A','ALUNIT17':'ANXFREQ_A',
                                       'RACERPI2':'RACEALLP','HISPAN_I':'HOUYRSLIV_A','SMKLSTB1':'SMOKELSEV_A',
                                      'BMI':'BMICAT_A','DIBEV1':'DIBEV_A','CIGAREV2':'SMKCIGST_A','ECIGEV2':'SMKECIGST_A',
                                      'AASMEV':'ASEV_A','EPHEV':'COPDEV_A','HYPEV':'HYPEV_A','CHDEV':'CHDEV_A',
                                      'PSAHAD':'PSATEST_A','R_MARITL':'LEGMSTAT_A','ASISIM':'ORIENT_A','ECIGEV2':'ECIGEV_A'
                                      }, inplace=True)

In [6]:

columns = prostate_cancer_data18.columns.sort_values()

# Correct variable name: S_columns
S_columns = [i for i in columns if i.startswith("SKN")]

# Use the correct case for the variable
prostate_cancer_data18[S_columns].notnull().sum()

SKNDCAN_A     1167
SKNNMCAN_A    1167
dtype: int64

In [7]:
# Dropping of columns not needed


required_columns18 = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A',
                      'AGEP_A','SKNNMCAN_A','OTHERCANP_A','SKNDCAN_A','MELANCAN_A','WEIGHTLBTC_A','LUNGCAN_A',
                     'LYMPHCAN_A','COLONCAN_A','HEIGHTTC_A','BLADDCAN_A','PANCRCAN_A','THROACAN_A','ESOPHCAN_A',
                      'GALLBCAN_A','BONECAN_A','LEUKECAN_A','MOUTHCAN_A','LIVERCAN_A','DEPFREQ_A','LARYNCAN_A',
                      'BREASCAN_A','REGION','ANXFREQ_A','RACEALLP','HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A',
                      'CHDEV_A','PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = prostate_cancer_data18[required_columns18]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data18 = columns_filtered.drop(unwanted_columns, axis=1)

In [8]:
prostate_cancer_data19 = pd.read_csv('adult_19.csv')
prostate_cancer_data19.to_csv('prostate_cancer_data19.csv', index=False)

In [9]:
prostate_cancer_data19 = prostate_cancer_data19[prostate_cancer_data19.PROSTCAN_A.notna()]
prostate_cancer_data19.shape

(1598, 534)

In [10]:
#REPLACING OF COLUMN NAME
prostate_cancer_data19.rename(columns={'EDUC_A':'EDUCP_A', 'RACEALLP_A':'RACEALLP', 'SKNDKCAN_A':'SKNDCAN_A' }, inplace= True)

In [11]:
prostate_cancer_data19.sample(10)

Unnamed: 0,URBRRL,RATCAT_A,INCGRP_A,INCTCFLG_A,FAMINCTC_A,IMPINCFLG_A,PPSU,PSTRAT,HISPALLP_A,RACEALLP,DISAB3_A,SCHDYMSSTC_A,AFNOW,PHQCAT_A,YRSINUS_A,CITZNSTP_A,LEGMSTAT_A,MARSTAT_A,SASPPRACE_A,SASPPHISP_A,PRTNREDUC_A,SPOUSEDUC_A,PRTNRAGETC_A,SPOUSAGETC_A,PRTNRWKFT_A,PRTNRWRK_A,SPOUSWKFT_A,SPOUSWRK_A,SPOUSESEX_A,PRTNRSEX_A,SHTHPVAGEP_A,SHINGRXYRP_A,ZOSTAVXYRP_A,HHRESPSA_FLG,GADCAT_A,PCNTADTWFP_A,PCNTADTWKP_A,FDSCAT4_A,FDSCAT3_A,EMPDYSMSS2_A,EMPLSTWORK_A,EMPWRKFT_A,EMPWRKLSWK_A,EMPWKHRS2_A,DIFYRSTC_A,DIBAGETC_A,SMKECIGST_A,SMKCIGST_A,BMICAT_A,WEIGHTLBTC_A,HEIGHTTC_A,URGNT12MTC_A,EMERG12MTC_A,PSA5YRTC_A,PCNTTC,PCNT18UPTC,PCNTLT18TC,COVER65_A,COVER_A,EXCHANGE_A,NOTCOV_A,MILSPC1R_A,OGFLG_A,OPFLG_A,CHFLG_A,MAFLG_A,PLNWRKR2_A,PLNWRKR1_A,RSNHIMISS_A,RSNHIJOB_A,MCADVR_A,PRFLG_A,PLEXCHPR1_A,PRPREM1_A,PXCHNG1_A,HICOSTR2_A,HICOSTR1_A,OTHGOV_A,OTHPUB_A,IHS_A,MILITARY_A,CHIP_A,MEDICAID_A,MEDICARE_A,PRIVATE_A,PRPLCOV1_C_A,PRPLCOV2_C_A,PLEXCHOG_A,PLEXCHOP_A,EXCHPR2_A,EXCHPR1_A,EDUCP_A,MAXEDUC_A,PARSTAT_A,SAPARENTSC_A,MLTFAMFLG_A,OVER65FLG_A,PCNTFAM_A,PCNTADLT_A,PCNTKIDS_A,NUMCAN_A,COLRCAGETC_A,HDNCKAGETC_A,OTHERAGETC_A,UTERUAGETC_A,THYROAGETC_A,THROAAGETC_A,STOMAAGETC_A,SKNDKAGETC_A,SKNNMAGETC_A,SKNMAGETC_A,RECTUAGETC_A,PROSTAGETC_A,PANCRAGETC_A,OVARYAGETC_A,MOUTHAGETC_A,MELANAGETC_A,LYMPHAGETC_A,LUNGAGETC_A,LIVERAGETC_A,LEUKEAGETC_A,LARYNAGETC_A,GALLBAGETC_A,ESOPHAGETC_A,COLONAGETC_A,CERVIAGETC_A,BREASAGETC_A,BRAINAGETC_A,BONEAGETC_A,BLOODAGETC_A,BLADDAGETC_A,OTHERCANP_A,COLRCCAN_A,HDNCKCAN_A,UTERUCAN_A,THYROCAN_A,THROACAN_A,STOMACAN_A,SKNDCAN_A,SKNNMCAN_A,SKNMCAN_A,RECTUCAN_A,PROSTCAN_A,PANCRCAN_A,OVARYCAN_A,MOUTHCAN_A,MELANCAN_A,LYMPHCAN_A,LUNGCAN_A,LIVERCAN_A,LEUKECAN_A,LARYNCAN_A,GALLBCAN_A,ESOPHCAN_A,COLONCAN_A,CERVICAN_A,BREASCAN_A,BRAINCAN_A,BONECAN_A,BLOODCAN_A,BLADDCAN_A,HISDETP_A,HISP_A,REGION,SRVY_YR,SEX_A,AGEP_A,AGE65,ASTATNEW,HOUGVASST_A,HOUTENURE_A,HOUYRSLIV_A,FDSNEDAYS_A,FDSNOTEAT_A,FDSWEIGHT_A,FDSHUNGRY_A,FDSLESS_A,FDSSKIPDYS_A,FDSSKIP_A,FDSBALANCE_A,FDSLAST_A,FDSRUNOUT_A,FLUNCH12M_A,FWIC12M_A,FSNAP30D_A,FSNAP12M_A,INCOTHR_A,INCRETIRE_A,INCWELF_A,SSISSDIDSB_A,SSISSDIBTH_A,INCSSISSDI_A,INCSSRR_A,INCINTER_A,INCWRKO_A,EMPOFFHI_A,EMPPDSKLV_A,EMPRSNOWK_A,SCHCURENR_A,NATUSBORN_A,VACAREEV_A,VAHOSP_A,VADISB_A,COMBAT_A,AFVETTRN_A,AFVET_A,EVRMARRIED_A,SPOUSEP_A,SPOUSLIV_A,MARITAL_A,ORIENT_A,SMOKELSCUR_A,SMOKELSEV_A,PIPECUR_A,PIPEEV_A,CIGAR30D_A,CIGARCUR_A,CIGAREV_A,ECIGNOW_A,ECIGEV_A,CIG30D_A,SMK30D_A,CIGNOW_A,SMKNOW_A,SMKEV_A,ARTHPH_A,ARTHWT_A,ARTHWRK_A,ARTHLMT_A,JNTPN_A,JNTSYMP_A,PAITOOTH3M_A,PAIAPG3M_A,PAIHDFC3M_A,PAILLMB3M_A,PAIULMB3M_A,PAIBACK3M_A,PAINMEFF_A,PAIMOTHER_A,PAIMEDITAT_A,PAIMASSAGE_A,PAIYOGA_A,PAIGROUP_A,PAIPROGRAM_A,PAITALKTPY_A,PAICHIRO_A,PAIPHYSTPY_A,PAIAFFM3M_A,PAIWKLM3M_A,PAIAMNT_A,PAIFRQ3M_A,GAD77_A,GAD76_A,GAD75_A,GAD74_A,GAD73_A,GAD72_A,GAD71_A,PHQ88_A,PHQ87_A,PHQ86_A,PHQ85_A,PHQ84_A,PHQ83_A,PHQ82_A,PHQ81_A,MHTHND_A,MHTHDLY_A,MHTPYNOW_A,MHTHRPY_A,MHRX_A,DEPLEVEL_A,DEPMED_A,DEPFREQ_A,ANXLEVEL_A,ANXMED_A,ANXFREQ_A,HOMEHC12M_A,THERA12M_A,EYEEX12M_A,WRKHLTHFC_A,WORKHEALTH_A,SHTHPV_A,SHTTDAP_A,SHTTETANUS_A,TDAPPREG_A,SHINGWHEN_A,SHINGRIXNB_A,SHINGRIX_A,ZOSTAWHEN_A,ZOSTAVAX_A,SHTSHINGLE_A,SHTPNEUNB_A,SHTPNUEV_A,FLUPREG2_A,FLUPREG_A,SHTFLUY_A,SHTFLUM_A,SHTFLU12M_A,LIVEBIRTH_A,PREGFLUYR_A,BEXAMREAS_A,BEXAMWHEN_A,BREASTEXAM_A,MAMWHY1ST_A,MAMAGE1ST_A,MAMREASON_A,MAMWHEN_A,MAMEV_A,HYSTEV_A,CERVICNOT_A,CERVICRES_A,HPVTEST_A,PAPTEST_A,CERREASON_A,CERVICWHEN_A,CERVICEV_A,PSADISADV_A,PSAADVANT_A,PSASUGGEST_A,PSAREASON_A,PSAWHEN_A,PSATEST_A,COLKIND6_A,COLKIND5_A,COLKIND4_A,COLKIND3_A,COLKIND2_A,COLKIND1_A,COLPROBLEM_A,FITCOLG_A,COLOGUARD_A,FITHWHEN_A,FITHEV_A,CTCOLWHEN_A,CTCOLEV_A,COLOROTH_A,SIGWHEN_A,COLSIGWHEN_A,COLPAY_A,COLREASON_A,COLWHEN_A,COLORECTYP_A,COLORECTEV_A,DIABLAST_A,CHOLLAST_A,BPLAST_A,OPDFREQ_A,OPDCHRONIC_A,OPDACUTE_A,OPD3M_A,OPD12M_A,RXDG12M_A,RXDL12M_A,RXLS12M_A,RXSK12M_A,RX12M_A,MEDNG12M_A,MEDDL12M_A,HOSPONGT_A,USPLKIND_A,USUALPL_A,WELLVIS_A,WELLNESS_A,LASTDR_A,DENNG12M_A,DENDL12M_A,DENPREV_A,PAYWORRY_A,PAYNOBLLNW_A,PAYBLL12M_A,HINOTMYR_A,HINOTYR_A,RSNHIOTH_A,RSNHIWAIT_A,RSNHIMEET_A,RSNHICONF_A,RSNHIELIG_A,RSNHIWANT_A,RSNHICOST_A,HISTOPELIG_A,HISTOPCOST_A,HISTOPAGE_A,HISTOPMISS_A,HISTOPJOB_A,HILASTMY_A,HILAST_A,MILSPC3_A,MILSPC2_A,MILSPC1_A,OGHDHP_A,OGDEDUC_A,OGPREM_A,OGXCHNG_A,OPHDHP_A,OPDEDUC_A,OPPREM_A,OPXCHNG_A,CHHDHP_A,CHDEDUC_A,CHPREM_A,CHXCHNG_A,PRVSCOV2_A,PRVSCOV1_A,PRDNCOV2_A,PRDNCOV1_A,PRRXCOV2_A,PRRXCOV1_A,HSAHRA2_A,HSAHRA1_A,PRHDHP2_A,PRHDHP1_A,PRDEDUC2_A,PRDEDUC1_A,PLN2PAY6_A,PLN2PAY5_A,PLN2PAY4_A,PLN2PAY3_A,PLN2PAY2_A,PLN2PAY1_A,PLN1PAY6_A,PLN1PAY5_A,PLN1PAY4_A,PLN1PAY3_A,PLN1PAY2_A,PLN1PAY1_A,PLNEXCHG2_A,PLNEXCHG1_A,PRPOLH2_A,PRPOLH1_A,PRPLCOV2_A,PRPLCOV1_A,POLHLD2_A,POLHLD1_A,MAHDHP_A,MADEDUC_A,MAPREM_A,MAXCHNG_A,MCPARTD_A,MCHMO_A,MCCHOICE_A,MCPART_A,SINCOVRX_A,SINCOVVS_A,SINCOVDE_A,MCAIDPRB_A,MCAREPRB_A,HIKIND10_A,HIKIND09_A,HIKIND08_A,HIKIND07_A,HIKIND06_A,HIKIND05_A,HIKIND04_A,HIKIND03_A,HIKIND02_A,HIKIND01_A,HICOV_A,SOCWRKLIM_A,SOCSCLPAR_A,SOCERRNDS_A,UPPOBJCT_A,UPPRAISE_A,UPPSLFCR_A,COGAMTDFF_A,COGFRQDFF_A,COGTYPEDFF_A,COGMEMDFF_A,COMDIFF_A,EQSTEPS_A,EQWLK13M_A,EQWLK100_A,NOEQSTEPS_A,NOEQWLK13M_A,NOEQWLK100_A,PERASST_A,WCHAIR_A,CANEWLKR_A,STEPS_A,WLK13M_A,WLK100_A,EQUIP_A,DIFF_A,HEARINGDF_A,HEARAIDFR_A,HEARAID_A,VISIONDF_A,WEARGLSS_A,PREGNOW_A,DEPEV_A,ANXEV_A,DEMENEV_A,ARTHEV_A,COPDEV_A,DIBTYPE_A,DIBINSSTYR_A,DIBINSSTOP_A,DIBINSTIME_A,DIBINS_A,DIBPILL_A,DIBEV_A,GESDIB_A,PREDIB_A,CANEV_A,ASER12M_A,ASAT12M_A,ASTILL_A,ASEV_A,ASPONOWN_A,ASPMEDSTP_A,ASPMEDNOWN_A,ASPMEDEV_A,STREV_A,MIEV_A,ANGEV_A,CHDEV_A,CHLMED_A,CHL12M_A,CHLEV_A,HYPMED_A,HYP12M_A,HYPDIF_A,HYPEV_A,PHSTAT_A,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_QRT,RECTYPE,WTFA_A,WTIA_A,HHX,POVRATTC_A
1754,1,14,5,0,100000,0,63,124,2,1,2,,2.0,1,,1,3,1,1.0,1.0,,8.0,,61.0,,,1.0,1.0,2.0,,,,2017.0,,1,2,2,1,1,0.0,,1.0,1,54.0,,,3,3,3,185,72,0,0,4.0,2,2,0,1.0,,2.0,2,,,,,,,1.0,,,,,,,,,2880.0,3,3,2,3,3,3,1,1,,,,,,1.0,8,8.0,3,3,2,1,2,2,0,1,,,,,,,,,,,,62.0,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,1,2019,1,66,,1,,1,4,,,,,,,,3,3,3,,,,2,2.0,1.0,2.0,,,2.0,1.0,1,1,1.0,1.0,,2,1,2.0,2.0,2.0,,1.0,1,,,1.0,1,2,,2,3.0,1,15.0,2.0,1,,2,,,,3.0,1,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0,3.0,2.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,,2,2.0,,2,5,,2,5,2,1,1,2.0,2.0,,,2.0,,,,2.0,,1.0,1.0,1.0,1,,,2018.0,11.0,1,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,3.0,4.0,1.0,,,,,,,,,,,,,,2.0,,,2.0,1.0,1.0,1.0,1.0,1,1,1,,2.0,1.0,1.0,1.0,2,2.0,2.0,2.0,1,2,2,2,1.0,1,,1.0,1,2,2,1,3,,2,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,2.0,,1.0,,,,,,2.0,,,,,,,2.0,2.0,2.0,2.0,1.0,1.0,,,,,,1.0,,1.0,,,,,2.0,,,1.0,2,1,1,,,2,2,2,2,2,2,2,2,1,1,1,2,1,1,1,1,1,,,,1,1,,,,,,,,,,1.0,1.0,1.0,2,1,1,,2,1,1,,2,2,2,1,2,,,,,,,2,,2,1,,,,2,2.0,,,2.0,2,2,2,2,,,2,1.0,1.0,1.0,1,2,,,1,1,1,10,7976.413,7616.27,H040758,6.58
1949,4,12,4,0,75000,0,93,109,2,1,2,,2.0,1,,1,3,1,1.0,1.0,,8.0,,61.0,,,2.0,1.0,2.0,,,,2017.0,1.0,1,0,2,1,1,0.0,,2.0,1,12.0,,,3,4,3,195,70,0,0,5.0,2,2,0,,1.0,2.0,2,1.0,,,,,,1.0,,,,,,,,,8220.0,3,3,2,1,3,3,3,1,,,,,,2.0,9,9.0,3,3,2,0,2,2,0,1,,,,,,,,,,,,,,,,62.0,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,2,2019,1,62,,1,,1,5,,,,,,,,3,3,3,,,,2,1.0,1.0,2.0,,,2.0,2.0,1,1,2.0,2.0,,2,1,1.0,2.0,1.0,2.0,2.0,1,,,1.0,1,2,2.0,1,,2,0.0,3.0,1,,2,,,,,2,,,,,,,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,3.0,2.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,,2,2.0,,2,5,,2,5,2,1,2,2.0,2.0,2.0,9.0,1.0,,,,2.0,,1.0,1.0,,2,,,2018.0,10.0,1,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,2.0,,,2.0,1.0,4.0,1.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,2,1.0,1,1.0,2.0,1,2,2,1,3,,2,,2.0,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,2.0,,2.0,,1.0,,,,1.0,,1.0,,,,,,,2.0,2.0,2.0,2.0,2.0,1.0,,,,,,1.0,,1.0,,,,,,,,,2,2,2,,,2,2,2,2,2,2,2,2,2,1,1,2,1,1,1,1,1,,,,1,2,,,,,,,,,,1.0,1.0,1.0,2,1,2,,2,1,2,,2,2,2,2,2,,,,,,,2,,2,1,,,,2,2.0,,,2.0,2,2,2,2,,,2,,,,2,2,,,1,1,1,10,1714.976,1994.436,H044625,4.46
27123,3,14,5,0,150000,0,47,111,2,1,2,,,1,,1,3,1,1.0,1.0,,5.0,,67.0,,,,2.0,2.0,,,,2016.0,1.0,1,0,0,1,1,,3.0,,2,,,,3,3,2,170,70,1,0,10.0,2,2,0,1.0,,2.0,2,,,,,,,3.0,,,,,,,,,2400.0,3,3,2,3,3,3,1,1,,,,,,2.0,4,5.0,3,3,2,1,2,2,0,1,,,,,,,,,,,,72.0,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,4,2019,1,74,,1,,1,3,,,,,,,,3,3,3,,,,2,2.0,2.0,2.0,,,2.0,1.0,1,2,,,3.0,2,1,,,,,,2,,,1.0,1,2,,2,3.0,1,0.0,3.0,1,,2,,,,3.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,,2,2.0,,2,5,3.0,2,4,2,2,1,2.0,2.0,,1.0,1.0,,,,2.0,,1.0,1.0,1.0,1,,,2018.0,10.0,1,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,2.0,1.0,1.0,,,,,,,,,,,,,,2.0,6.0,,1.0,1.0,1.0,3.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,2,1.0,1,,1.0,1,2,2,2,3,,2,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,2.0,,2.0,,,,1.0,,1.0,,,,,,,2.0,2.0,2.0,2.0,2.0,1.0,,,,,,2.0,,1.0,,,,,2.0,2.0,9.0,3.0,2,2,2,,,2,2,2,2,2,2,2,1,1,2,1,2,1,1,1,1,1,,,,1,1,,,,,,,,,,1.0,1.0,1.0,2,1,2,,2,1,1,,2,2,2,2,2,,,,,,,2,,2,1,,,,2,,,1.0,1.0,2,2,2,2,2.0,2.0,1,,,,2,2,,,1,1,4,10,5466.102,5338.635,H026493,9.88
9524,3,9,2,0,40000,0,4,135,2,1,2,,,1,,1,3,1,1.0,1.0,,1.0,,79.0,,,,2.0,2.0,,,,,1.0,1,0,0,1,1,,3.0,,2,,11.0,71.0,3,3,2,160,68,0,1,1.0,2,2,0,2.0,,,2,,,,,,,,,,,,,,,,,3,3,2,3,3,1,1,3,,,,,,,3,3.0,3,3,2,1,2,2,0,2,,,,,,,79.0,,,,,,,,,,,,,,,,,,,,,,,78.0,2.0,2.0,2.0,,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,1.0,3,2,3,2019,1,82,,1,,1,5,,,,,,,,3,3,3,,,,2,2.0,1.0,2.0,,,2.0,1.0,2,2,,,4.0,2,1,2.0,2.0,2.0,2.0,2.0,1,,,1.0,1,2,,2,,2,,,2,,2,,,,3.0,1,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1,2,1,1,2,1,2,1,2,1,1,2,2,2,1,2,2,,2,,3.0,1,3,3.0,1,3,2,2,2,2.0,2.0,,2.0,1.0,,,,,,,2.0,1.0,1,,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,1.0,2.0,1.0,,,,,,,,,,,,,,2.0,,,1.0,1.0,1.0,1.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,1,1.0,1,,1.0,1,2,2,6,3,,2,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,2,2,2,,,2,2,2,2,2,2,1,2,1,2,1,1,1,1,1,1,1,,,,1,2,,,,,,,,,,4.0,,4.0,2,2,2,,2,1,2,,1,2,2,2,2,2.0,,2.0,9.0,1.0,1.0,1,,1,1,,,,2,,2.0,2.0,1.0,1,1,1,1,1.0,1.0,1,,,,2,4,,,1,1,2,10,5329.587,5037.686,H032687,2.63
7116,3,1,1,0,13,0,61,108,2,1,2,,2.0,1,,1,4,7,,,,,,,,,,,,,,,,1.0,1,0,0,1,1,0.0,1.0,,2,,,,3,4,2,165,70,0,0,,1,1,0,,2.0,,2,,,,,,,,,,,,,,,,,3,3,2,3,3,1,3,3,,,,,,,11,11.0,3,3,2,0,1,1,0,1,,,,,,,,51.0,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,4,2019,1,63,,1,2.0,2,3,,,,,,,,3,3,3,,,,2,2.0,2.0,2.0,,,2.0,2.0,1,1,,,3.0,2,1,,,,,,2,2.0,,,3,2,,2,,2,0.0,3.0,1,,2,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,2,2,,2,2.0,,2,5,1.0,2,2,2,2,2,2.0,2.0,2.0,,2.0,,,,,,,2.0,1.0,1,,,,,2,,,,,,,,,,,,,,,,,,,2.0,2.0,,,,2.0,,,,,,,,,,,,,,2.0,,,1.0,1.0,2.0,1.0,1.0,3,3,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,2,1.0,1,,1.0,1,2,2,3,3,,2,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,1.0,,,,,2,2,2,,,2,2,2,2,2,2,1,2,2,2,1,2,1,1,1,1,1,,,,1,1,,,,,,,,,,1.0,1.0,1.0,2,1,2,,2,1,1,,2,2,2,2,2,,,,,,,2,,2,1,,,,2,2.0,,,2.0,2,2,2,2,,,2,,,,2,1,,,1,1,1,10,5710.82,7224.721,H044088,0.0
3292,3,9,2,0,45000,0,16,143,2,1,2,,,1,,1,3,1,1.0,2.0,,7.0,,65.0,,,,2.0,2.0,,,,2018.0,,2,0,0,1,1,,3.0,,2,,,,3,4,4,225,70,1,3,15.0,2,2,0,1.0,,2.0,2,1.0,,,,,,2.0,,,,,,,,,99999.0,3,3,2,1,3,3,1,1,,,,,,2.0,7,7.0,3,3,2,1,2,2,0,1,,,,,,,,,,,,63.0,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,4,2019,1,83,,1,,1,5,,,,,,,,3,3,3,,,,2,2.0,1.0,2.0,,,2.0,1.0,2,2,,,3.0,2,1,,1.0,2.0,1.0,2.0,1,,,1.0,1,2,,2,,2,,,2,,2,,,,,2,1.0,1.0,1.0,1.0,5.0,1.0,1.0,2.0,1.0,2.0,3.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2,2,1,2,2,2,2,2,2,1,1,2,2,1,1,2,2,,2,2.0,1.0,2,3,3.0,2,2,2,2,1,2.0,2.0,,2.0,1.0,,,,2.0,,1.0,1.0,1.0,1,,,2018.0,10.0,1,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,2.0,1.0,1.0,,,,,,,2.0,,2.0,4.0,1.0,,2.0,1.0,,,,,,,2.0,1,1,1,,,,,2.0,2,2.0,1.0,2.0,1,2,2,2,1.0,1,1.0,2.0,1,2,2,1,2,,2,,2.0,,,,,,,,,,,,,,,2.0,2.0,1.0,,,,,,,,,,,,,,2.0,,2.0,,1.0,,2.0,,2.0,,1.0,,,,,,,2.0,2.0,2.0,2.0,2.0,1.0,,2.0,,,,1.0,,1.0,,,,,2.0,2.0,2.0,3.0,2,1,1,,,2,2,2,2,1,2,2,1,1,2,1,1,1,2,1,2,1,1.0,1.0,3.0,2,1,2.0,2.0,2.0,2.0,3.0,2.0,1.0,2.0,1.0,,,,1,2,2,,2,1,1,,2,2,2,1,2,,,,,,,2,,2,1,,,,2,2.0,,,2.0,2,1,2,1,,,2,1.0,1.0,1.0,1,2,,,1,1,1,10,5010.668,4753.43,H000448,2.96
20764,4,14,4,0,95000,0,28,122,2,1,1,,,1,,1,3,1,1.0,1.0,,5.0,,68.0,,,1.0,1.0,2.0,,,,,1.0,1,1,1,1,1,,3.0,,2,,24.0,57.0,3,3,2,189,74,0,1,4.0,2,2,0,5.0,,,2,1.0,,,,,,,,,,,,,,,,3,3,2,1,3,3,1,3,,,,,,,4,5.0,3,3,2,1,2,2,0,1,74.0,,,,,,,,,,,,,,,,,,,,,,,74.0,,,,,,,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,2.0,2.0,2.0,2.0,2.0,3,2,3,2019,1,81,,1,,1,4,,,,,,,,3,3,3,,,,2,1.0,1.0,2.0,,,2.0,1.0,2,1,,,3.0,2,1,,1.0,1.0,1.0,2.0,1,,,1.0,1,2,,2,3.0,1,0.0,3.0,1,,2,,,,3.0,1,,,,,,,1.0,1.0,2.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,4.0,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,2,2,,2,2.0,,2,5,1.0,2,4,1,2,2,2.0,2.0,,9.0,1.0,,,,,,,2.0,2.0,1,,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,2.0,,,1.0,2.0,5.0,1.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,1,1.0,1,,1.0,1,2,2,6,3,,2,,2.0,,,,,,,,,,,,,,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,3.0,2,2,2,,,2,2,2,2,1,2,2,1,1,2,1,1,1,1,3,1,1,,,,1,2,,,,,,,,,,4.0,4.0,2.0,2,3,1,,2,3,1,,2,2,2,2,2,2.0,,2.0,4.0,1.0,1.0,1,,1,1,,,,2,2.0,,,2.0,2,2,2,2,1.0,1.0,1,1.0,1.0,1.0,1,5,,,1,1,3,10,4675.212,5705.6,H026381,6.25
26575,4,11,3,0,55000,2,20,116,2,1,2,,2.0,1,,1,3,1,1.0,1.0,,4.0,,63.0,,,,2.0,2.0,,,,,,1,0,0,1,1,,3.0,,2,,,,3,3,3,160,64,1,1,2.0,2,2,0,1.0,,2.0,2,1.0,,,,,,2.0,,,,,,,,,3000.0,3,3,2,1,3,3,1,1,,,,,,2.0,4,4.0,3,3,2,1,2,2,0,2,,60.0,,,,60.0,,,,,,,,,,,,64.0,,,,,,,,,,,,,2.0,2.0,1.0,,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,1,2019,1,71,,1,,1,2,,,,,,,,3,3,3,,,,2,2.0,1.0,2.0,,,2.0,1.0,2,2,,,3.0,2,1,,1.0,1.0,1.0,2.0,1,,,1.0,1,2,,2,,2,,,2,,2,,,,3.0,1,,,,,,,1.0,1.0,1.0,2.0,1.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1,2,1,1,1,1,1,1,1,1,1,2,1,1,1,2,2,,2,2.0,,2,5,,2,5,2,2,1,2.0,2.0,,,2.0,,,,9.0,,9.0,1.0,1.0,1,,,2019.0,10.0,1,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,1.0,2.0,1.0,,,,,,,,,,,,,,2.0,,,1.0,1.0,3.0,1.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,1,1.0,1,,1.0,1,2,2,6,3,,2,,2.0,,,,,,,,,,,,,,,2.0,2.0,,,,,,,,,,,,,,,1.0,,1.0,,9.0,,,,,,2.0,,,,,,,2.0,2.0,2.0,2.0,2.0,1.0,,2.0,,,,2.0,,1.0,,,,,1.0,2.0,9.0,3.0,2,2,2,,,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,,,,1,1,,,,,,,,,,3.0,,4.0,2,2,1,,2,1,1,,2,2,2,2,1,,,,,,,2,,2,1,,,,2,,,1.0,1.0,1,1,2,1,1.0,1.0,1,1.0,1.0,1.0,1,5,,,1,1,4,10,16001.788,13664.184,H038211,3.62
835,4,8,1,0,30000,0,10,144,2,1,2,,,1,,1,5,4,,,,,,,,,,,,,,,,1.0,1,0,0,1,1,,3.0,,2,,99.0,99.0,3,4,3,160,65,5,1,,1,1,0,5.0,,,2,2.0,,,,,,,,,,,,,,,,3,3,2,1,3,3,1,3,,,,,,,5,5.0,3,3,2,1,1,1,0,1,,,,,,,,,,,,,,,,51.0,,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,3,2019,1,85,,1,,1,1,,,,,,,,3,3,3,,,,2,1.0,1.0,2.0,,,2.0,1.0,2,2,,,3.0,2,1,2.0,2.0,2.0,1.0,2.0,1,1.0,,,3,2,,2,,2,,,2,,2,,,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,,2,2.0,,2,5,,2,5,2,2,2,2.0,2.0,,1.0,1.0,,,,,,,2.0,,2,,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,2.0,,,,,,,,,,,,,,2.0,,,1.0,1.0,1.0,1.0,1.0,1,1,1,,,,,2.0,2,2.0,2.0,2.0,1,2,2,2,2.0,1,,1.0,1,2,2,0,3,,2,,2.0,,,,,,,,,,,,,,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,3.0,2,2,2,,,2,2,2,2,1,2,2,2,1,2,1,2,1,1,1,1,1,,,,1,1,,,,,,,,,,1.0,1.0,1.0,2,1,1,,2,1,1,,2,2,2,2,2,2.0,,,,2.0,1.0,1,,1,1,,,,2,,,1.0,1.0,2,1,2,1,1.0,1.0,1,1.0,1.0,1.0,1,4,,,1,1,1,10,3880.518,2303.861,H039054,2.49
26412,1,14,5,0,100000,0,34,131,2,1,1,,2.0,2,,1,3,2,,,,,,,,,,,,,,,,,1,1,1,1,1,,3.0,,2,,,,3,4,4,208,66,2,2,0.0,3,3,0,5.0,,,2,1.0,,,,,,,,,1.0,,,,,,,3,3,2,1,3,3,1,3,,,,,,,4,4.0,3,3,2,1,3,3,0,2,,,,,,,,,,77.0,,,,,,,5.0,,,,,,,,,,,,,,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,3,2,2,2019,1,85,,1,,1,5,,,,,,,,3,3,3,,,,2,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1,1,,,3.0,2,1,1.0,2.0,2.0,2.0,2.0,1,,2.0,2.0,1,2,,2,,2,,,2,,2,,,,,2,,,,,,,1.0,1.0,2.0,3.0,3.0,3.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,4.0,1,1,1,1,1,1,1,1,1,1,1,4,4,1,2,2,2,,2,,1.0,1,2,1.0,1,2,1,1,1,2.0,2.0,,2.0,1.0,,,,2.0,,2.0,1.0,3.0,1,,,2019.0,10.0,1,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,1.0,5.0,1.0,,,,,,,2.0,,,,,,,2.0,,,1.0,2.0,6.0,1.0,1.0,1,1,1,,,,2.0,1.0,2,2.0,2.0,2.0,1,2,2,1,1.0,1,,1.0,1,1,1,1,3,,2,,2.0,,,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,3.0,1,2,2,,,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,1.0,2.0,3.0,2,3,4.0,,4.0,4.0,,4.0,2.0,2.0,1.0,,,,1,2,3,,2,1,1,,1,2,2,2,2,,,,,2.0,2.0,2,,1,1,,,,2,,1.0,2.0,1.0,2,1,1,2,1.0,1.0,1,1.0,1.0,1.0,1,4,,,1,1,4,10,8666.611,8456.64,H003027,5.09


In [12]:
# Dropping of columns not needed


required_columns19 = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A','AGEP_A','SKNNMCAN_A','OTHERCANP_A',
                      'SKNDCAN_A','MELANCAN_A','WEIGHTLBTC_A','LUNGCAN_A','LYMPHCAN_A','COLONCAN_A','HEIGHTTC_A',
                      'BLADDCAN_A','PANCRCAN_A','THROACAN_A','ESOPHCAN_A','GALLBCAN_A','BONECAN_A','LEUKECAN_A',
                      'MOUTHCAN_A','LIVERCAN_A','DEPFREQ_A','LARYNCAN_A','BREASCAN_A','REGION','ANXFREQ_A','RACEALLP',
                      'HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A','CHDEV_A','PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = prostate_cancer_data19[required_columns18]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data19 = columns_filtered.drop(unwanted_columns, axis=1)

In [13]:
prostate_cancer_data20 = pd.read_csv('adult_20.csv')
prostate_cancer_data20.to_csv('prostate_cancer_data20csv', index=False)
prostate_cancer_data20.shape

(31568, 617)

In [14]:
prostate_cancer_data20 = prostate_cancer_data20[prostate_cancer_data20.PROSTCAN_A.notna()]
prostate_cancer_data20.shape

(1732, 617)

In [15]:
# Dropping of columns not needed


required_columns20 = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A','AGEP_A','SKNNMCAN_A','OTHERCANP_A',
                      'SKNDCAN_A','MELANCAN_A','WEIGHTLBTC_A','LUNGCAN_A','LYMPHCAN_A','COLONCAN_A','HEIGHTTC_A',
                      'BLADDCAN_A','PANCRCAN_A','THROACAN_A','ESOPHCAN_A','GALLBCAN_A','BONECAN_A','LEUKECAN_A',
                      'MOUTHCAN_A','LIVERCAN_A','DEPFREQ_A','LARYNCAN_A','BREASCAN_A','REGION','ANXFREQ_A','RACEALLP',
                      'HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A','CHDEV_A','PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = prostate_cancer_data19[required_columns20]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data20 = columns_filtered.drop(unwanted_columns, axis=1)

In [16]:
prostate_cancer_data21 = pd.read_csv('adult_21.csv')
prostate_cancer_data21.to_csv('prostate_cancer_data21.csv', index=False)

In [17]:
prostate_cancer_data21 = prostate_cancer_data21[prostate_cancer_data21.PROSTCAN_A.notna()]
prostate_cancer_data21.shape

(1546, 622)

In [18]:
#REPLACING OF COLUMN NAME
prostate_cancer_data21.rename(columns={'RACEALLP_A':'RACEALLP', 'SKNDKCAN_A':'SKNDCAN_A' }, inplace= True)

In [19]:
# Dropping of columns not needed


required_columns21 = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A','AGEP_A','SKNNMCAN_A','OTHERCANP_A',
                      'SKNDCAN_A','MELANCAN_A','WEIGHTLBTC_A','LUNGCAN_A','LYMPHCAN_A','COLONCAN_A','HEIGHTTC_A',
                      'BLADDCAN_A','PANCRCAN_A','THROACAN_A','ESOPHCAN_A','GALLBCAN_A','BONECAN_A','LEUKECAN_A',
                      'MOUTHCAN_A','LIVERCAN_A','DEPFREQ_A','LARYNCAN_A','BREASCAN_A','REGION','ANXFREQ_A','RACEALLP',
                      'HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A','CHDEV_A','PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = prostate_cancer_data21[required_columns21]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data21 = columns_filtered.drop(unwanted_columns, axis=1)

In [20]:
prostate_cancer_data22 = pd.read_csv('adult_22.csv')
prostate_cancer_data22.to_csv('prostate_cancer_data22.csv', index=False)

In [21]:
prostate_cancer_data22 = prostate_cancer_data21[prostate_cancer_data21.PROSTCAN_A.notna()]
prostate_cancer_data22.shape

(1546, 622)

In [22]:
#REPLACING OF COLUMN NAME
prostate_cancer_data22.rename(columns={'RACEALLP_A':'RACEALLP', 'SKNDKCAN_A':'SKNDCAN_A'}, inplace= True)

In [23]:
# Dropping of columns not needed


required_columns22 = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A','AGEP_A','SKNNMCAN_A','OTHERCANP_A',
                      'SKNDCAN_A','MELANCAN_A','WEIGHTLBTC_A','LUNGCAN_A','LYMPHCAN_A','COLONCAN_A','HEIGHTTC_A',
                      'BLADDCAN_A','PANCRCAN_A','THROACAN_A','ESOPHCAN_A','GALLBCAN_A','BONECAN_A','LEUKECAN_A',
                      'MOUTHCAN_A','LIVERCAN_A','DEPFREQ_A','LARYNCAN_A','BREASCAN_A','REGION','ANXFREQ_A','RACEALLP',
                      'HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A','CHDEV_A','PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = prostate_cancer_data22[required_columns22]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data22 = columns_filtered.drop(unwanted_columns, axis=1)

## Merging the csv files to create another Dataframe

In [24]:
# Use this if all DataFrames now have the same columns
final_prostate_data = pd.concat([required_data18,required_data19,required_data20,required_data21,required_data22], ignore_index=True)


In [25]:
final_prostate_data.to_csv('final_prostate_data.csv', index=False)
final_prostate_data.sample(5)

Unnamed: 0,BMICAT_A,DIBEV_A,SMKCIGST_A,ASEV_A,COPDEV_A,HYPEV_A,AGEP_A,SKNNMCAN_A,OTHERCANP_A,SKNDCAN_A,MELANCAN_A,WEIGHTLBTC_A,LUNGCAN_A,LYMPHCAN_A,COLONCAN_A,HEIGHTTC_A,BLADDCAN_A,PANCRCAN_A,THROACAN_A,ESOPHCAN_A,GALLBCAN_A,BONECAN_A,LEUKECAN_A,MOUTHCAN_A,LIVERCAN_A,DEPFREQ_A,LARYNCAN_A,BREASCAN_A,REGION,ANXFREQ_A,RACEALLP,HOUYRSLIV_A,SMOKELSEV_A,PROSTCAN_A,CHDEV_A,PSATEST_A,LEGMSTAT_A,ORIENT_A,ECIGEV_A
5372,4,2,3,2,2,1,68,1.0,2.0,2.0,2.0,225,2.0,2.0,2.0,69,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5,2.0,2.0,4,5.0,1,1,2,1.0,2,1.0,4,1.0,2
4863,3,1,4,2,2,1,71,2.0,2.0,2.0,2.0,220,2.0,2.0,2.0,76,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,4,2.0,2.0,3,4.0,2,5,2,1.0,1,1.0,3,2.0,2
5479,2,2,4,2,2,2,43,2.0,2.0,2.0,2.0,178,2.0,2.0,2.0,72,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,4,2.0,2.0,2,2.0,1,2,2,2.0,2,2.0,3,2.0,2
6943,4,1,3,1,2,1,75,2.0,2.0,2.0,2.0,219,2.0,2.0,2.0,68,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5,2.0,2.0,3,5.0,2,3,2,1.0,2,1.0,3,2.0,2
1784,3,2,4,2,2,2,77,2.0,2.0,2.0,2.0,205,2.0,2.0,2.0,73,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5,2.0,2.0,3,5.0,1,5,2,1.0,2,1.0,3,2.0,2


In [26]:
missing_pct = final_prostate_data.isna().mean()
low_missingness_columns = missing_pct[missing_pct == 0].index
cancer_reduced_missing = final_prostate_data[low_missingness_columns]
cancer_reduced_missing.shape

(7455, 36)

In [27]:
# Calculate the percentage of missing values for each column.
missing_percentage = final_prostate_data.isna().mean()
missing_pct = 0.2

# Filter columns with less than 30% missing values.
columns_with_less_than_30_percent_missing = missing_percentage[missing_percentage < missing_pct].index
len_missing = len(columns_with_less_than_30_percent_missing)
print(f"number of columns with less than {missing_pct * 100}% missing: {len_missing} \n")
# List the variable names with less than 30% missing values.
for column_name in columns_with_less_than_30_percent_missing:
    print(column_name)

number of columns with less than 20.0% missing: 39 

BMICAT_A
DIBEV_A
SMKCIGST_A
ASEV_A
COPDEV_A
HYPEV_A
AGEP_A
SKNNMCAN_A
OTHERCANP_A
SKNDCAN_A
MELANCAN_A
WEIGHTLBTC_A
LUNGCAN_A
LYMPHCAN_A
COLONCAN_A
HEIGHTTC_A
BLADDCAN_A
PANCRCAN_A
THROACAN_A
ESOPHCAN_A
GALLBCAN_A
BONECAN_A
LEUKECAN_A
MOUTHCAN_A
LIVERCAN_A
DEPFREQ_A
LARYNCAN_A
BREASCAN_A
REGION
ANXFREQ_A
RACEALLP
HOUYRSLIV_A
SMOKELSEV_A
PROSTCAN_A
CHDEV_A
PSATEST_A
LEGMSTAT_A
ORIENT_A
ECIGEV_A


In [30]:
features = cancer_reduced_missing.drop(["PROSTCAN_A"], axis="columns")
print("Shape of data with only independent variables",features.shape)

Shape of data with only independent variables (7455, 35)


In [31]:
selector = VarianceThreshold()
selector.fit(features)

variances = pd.DataFrame({"features": features.columns, "variance":selector.variances_})
variances.sort_values(by=["variance"])

Unnamed: 0,features,variance
4,COPDEV_A,0.118487
3,ASEV_A,0.155151
1,DIBEV_A,0.199999
5,HYPEV_A,0.253057
19,GALLBCAN_A,0.334666
25,LARYNCAN_A,0.336137
26,BREASCAN_A,0.337899
16,PANCRCAN_A,0.338631
22,MOUTHCAN_A,0.340677
23,LIVERCAN_A,0.341988


In [32]:
non_zero_var_columns = variances[variances.variance > 0 ]['features'].tolist()
feature_data = features[non_zero_var_columns]
target_variable = final_prostate_data["PROSTCAN_A"]

In [33]:
def convert_to_categorical(df):
    """
    Convert selected columns in a DataFrame to categorical columns if they meet the following criteria:
    - Have 14 or fewer unique numbers.
    - All non-NaN values are whole numbers without any decimals.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.DataFrame: The DataFrame with selected columns converted to categorical.
    """
    for column in df.columns:
        unique_values = df[column].dropna().unique()  # Exclude NaN values
        
        # Check if the column has 14 or fewer unique values
        if len(unique_values) <= 14:
            
            # Check if all non-NaN values are whole numbers without decimals
            if all(value == int(value) and value % 1 == 0 for value in unique_values):
                
                # Convert the column to categorical
                df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
                df[column] = df[column].astype('category')
    
    return df

def filter_non_categorical_columns(df):
    """
    Filters out all non-categorical columns from a pandas DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: A new DataFrame containing only the non-categorical columns.
    """
    non_categorical_columns = df.select_dtypes(exclude='category').columns
    return df[non_categorical_columns]


In [34]:
X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.2, random_state=42, stratify=target_variable)

In [35]:
y_train.value_counts()/len(y_train)

2.0    0.702549
1.0    0.290074
9.0    0.005701
7.0    0.001677
Name: PROSTCAN_A, dtype: float64

In [36]:
y_test.value_counts()/len(y_test)

2.0    0.702213
1.0    0.290409
9.0    0.006036
7.0    0.001341
Name: PROSTCAN_A, dtype: float64

In [37]:
print("Model performance without considering class imbalance")
rf = RandomForestClassifier(random_state=42)

####
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))
####

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance without considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.88      0.91      0.89       433
         2.0       0.96      0.95      0.95      1047
         7.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00         9

    accuracy                           0.94      1491
   macro avg       0.96      0.96      0.96      1491
weighted avg       0.94      0.94      0.94     

In [38]:
print("Model performance after considering class imbalance")
rf = RandomForestClassifier(random_state=42, class_weight={1:4,2:1,9:206,7:618})
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance after considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.89      0.89      0.89       433
         2.0       0.95      0.96      0.96      1047
         7.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00         9

    accuracy                           0.94      1491
   macro avg       0.96      0.96      0.96      1491
weighted avg       0.94      0.94      0.94      1

In [39]:
print("Model performance after considering class imbalance")
rf = RandomForestClassifier(random_state=42, class_weight={1:4,2:1,9:20,7:61})
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance after considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.89      0.89      0.89       433
         2.0       0.95      0.95      0.95      1047
         7.0       1.00      1.00      1.00         2
         9.0       1.00      1.00      1.00         9

    accuracy                           0.93      1491
   macro avg       0.96      0.96      0.96      1491
weighted avg       0.93      0.93      0.93      1

## CROSS VALIDATION

In [40]:
 # Dropping of columns not needed


required_columns = ['BMICAT_A','DIBEV_A','SMKCIGST_A','ASEV_A','COPDEV_A','HYPEV_A','AGEP_A','WEIGHTLBTC_A','HEIGHTTC_A',
                      'DEPFREQ_A','REGION','ANXFREQ_A','RACEALLP','HOUYRSLIV_A','SMOKELSEV_A','PROSTCAN_A','CHDEV_A',
                      'PSATEST_A','LEGMSTAT_A','ORIENT_A','ECIGEV_A']
            
#  Keep only the specified columns
columns_filtered = final_prostate_data[required_columns]

# Drop any unnamed columns
# Unnamed columns typically have names like 'Unnamed: 0'
unwanted_columns = [col for col in columns_filtered.columns if 'Unnamed:' in str(col)]
required_data = columns_filtered.drop(unwanted_columns, axis=1)

In [41]:
required_data.shape
missing_pct = required_data.isna().mean()
low_missingness_columns = missing_pct[missing_pct == 0].index
cancer_reduced_missing = required_data[low_missingness_columns]
cancer_reduced_missing.shape

(7455, 19)

In [42]:
# Calculate the percentage of missing values for each column.
missing_percentage = required_data.isna().mean()
missing_pct = 0.2

# Filter columns with less than 30% missing values.
columns_with_less_than_30_percent_missing = missing_percentage[missing_percentage < missing_pct].index
len_missing = len(columns_with_less_than_30_percent_missing)
print(f"number of columns with less than {missing_pct * 100}% missing: {len_missing} \n")
# List the variable names with less than 30% missing values.
for column_name in columns_with_less_than_30_percent_missing:
    print(column_name)

number of columns with less than 20.0% missing: 21 

BMICAT_A
DIBEV_A
SMKCIGST_A
ASEV_A
COPDEV_A
HYPEV_A
AGEP_A
WEIGHTLBTC_A
HEIGHTTC_A
DEPFREQ_A
REGION
ANXFREQ_A
RACEALLP
HOUYRSLIV_A
SMOKELSEV_A
PROSTCAN_A
CHDEV_A
PSATEST_A
LEGMSTAT_A
ORIENT_A
ECIGEV_A


In [43]:
features = cancer_reduced_missing.drop(["PROSTCAN_A"], axis="columns")
selector = VarianceThreshold()
selector.fit(features)

variances = pd.DataFrame({"features": features.columns, "variance":selector.variances_})
variances.sort_values(by=["variance"])

Unnamed: 0,features,variance
4,COPDEV_A,0.118487
3,ASEV_A,0.155151
1,DIBEV_A,0.199999
5,HYPEV_A,0.253057
14,CHDEV_A,0.348582
17,ECIGEV_A,0.650416
13,SMOKELSEV_A,0.741902
16,ORIENT_A,0.840126
11,RACEALLP,0.985294
10,REGION,1.020299


In [44]:
non_zero_var_columns = variances[variances.variance > 0 ]['features'].tolist()
feature_data = features[non_zero_var_columns]
target_variable = required_data["PROSTCAN_A"]

In [45]:
def convert_to_categorical(df):
    """
    Convert selected columns in a DataFrame to categorical columns if they meet the following criteria:
    - Have 14 or fewer unique numbers.
    - All non-NaN values are whole numbers without any decimals.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.DataFrame: The DataFrame with selected columns converted to categorical.
    """
    for column in df.columns:
        unique_values = df[column].dropna().unique()  # Exclude NaN values
        
        # Check if the column has 14 or fewer unique values
        if len(unique_values) <= 14:
            
            # Check if all non-NaN values are whole numbers without decimals
            if all(value == int(value) and value % 1 == 0 for value in unique_values):
                
                # Convert the column to categorical
                df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
                df[column] = df[column].astype('category')
    
    return df

def filter_non_categorical_columns(df):
    """
    Filters out all non-categorical columns from a pandas DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: A new DataFrame containing only the non-categorical columns.
    """
    non_categorical_columns = df.select_dtypes(exclude='category').columns
    return df[non_categorical_columns]


In [46]:
feature_data = convert_to_categorical(feature_data)
non_cat_data = filter_non_categorical_columns(feature_data)
non_cat_data.shape

(7455, 4)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(feature_data, target_variable, test_size=0.2, random_state=42, stratify=target_variable)

In [48]:
print("Model performance without considering class imbalance")
rf = RandomForestClassifier(random_state=42)

####
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance without considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.89      0.78      0.83       433
         2.0       0.91      0.96      0.93      1047
         7.0       1.00      0.50      0.67         2
         9.0       1.00      0.56      0.71         9

    accuracy                           0.90      1491
   macro avg       0.95      0.70      0.79      1491
weighted avg       0.90      0.90      0.90     

In [49]:
print("Model performance after considering class imbalance")
rf = RandomForestClassifier(random_state=42, class_weight={1:4,2:1,9:206,7:618})
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance after considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.90      0.76      0.83       433
         2.0       0.90      0.97      0.93      1047
         7.0       1.00      0.50      0.67         2
         9.0       1.00      0.56      0.71         9

    accuracy                           0.90      1491
   macro avg       0.95      0.70      0.79      1491
weighted avg       0.90      0.90      0.90      1

In [50]:
print("Model performance after considering class imbalance")
rf = RandomForestClassifier(random_state=42, class_weight={1:4,2:1,9:20,7:61})
rf.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf.predict(X_train)

# Evaluate the model using classification report
print("Classification report on the training set")
print(classification_report(y_train, y_train_pred))

# Make predictions on the testing set
y_test_pred = rf.predict(X_test)

# Evaluate the model using classification report
print("Classification report on the testing set")
print(classification_report(y_test, y_test_pred))

Model performance after considering class imbalance
Classification report on the training set
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      1730
         2.0       1.00      1.00      1.00      4190
         7.0       1.00      1.00      1.00        10
         9.0       1.00      1.00      1.00        34

    accuracy                           1.00      5964
   macro avg       1.00      1.00      1.00      5964
weighted avg       1.00      1.00      1.00      5964

Classification report on the testing set
              precision    recall  f1-score   support

         1.0       0.92      0.76      0.83       433
         2.0       0.90      0.97      0.94      1047
         7.0       1.00      0.50      0.67         2
         9.0       1.00      0.56      0.71         9

    accuracy                           0.91      1491
   macro avg       0.96      0.70      0.79      1491
weighted avg       0.91      0.91      0.91      1