## **DS 5100 Project: A Data-Driven Investigation of the Physiological Factors that Predict the Presence of Cardiovascular Disease**

### Group 1: Nick Landi, Burke Lawlor, Cecily Wolfe

### **Data Preprocessing Part 2: Dropping Additional Columns with Missing Data - includes custom Class, associated Methods, and Unit Tests**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the project CSV (relative path, so the file must sit in the notebook's working directory)
# and display the resulting frame.
heart_df = pd.read_csv("combined copy.csv")
heart_df

Unnamed: 0.1,Unnamed: 0,id,ssn,age,sex,painloc,painexer,relrest,pncaden,cp,...,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name,loc
0,0,3001,0,65,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,75.00,,name,swit
1,1,3002,0,32,1,0.0,0.0,0.0,,1,...,1.0,1.0,1.0,1.0,5.0,1.0,63.00,,name,swit
2,2,3003,0,61,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,67.00,,name,swit
3,3,3004,0,50,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,5.0,4.0,36.00,,name,swit
4,4,3005,0,57,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,60.00,,name,swit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,195,200,0,54,0,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,0.76,5.6,name,long
895,196,201,0,62,1,0.0,0.0,0.0,,1,...,1.0,1.0,1.0,1.0,1.0,2.0,0.62,3.5,name,long
896,197,202,0,55,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,0.69,5.6,name,long
897,198,116,0,58,1,1.0,1.0,1.0,,4,...,1.0,1.0,1.0,1.0,1.0,1.0,0.81,6.0,name,long


In [3]:
# Remove columns that carry no usable information:
#   Unnamed: 0                            - index column added when the pandas df was written to csv
#   ssn                                   - documentation: replaced by dummy value of 0
#   dummy                                 - same value as trestbps
#                                           (see https://www.sciencedirect.com/science/article/pii/S2352914820300125)
#   restckm, exerckm                      - documentation: irrelevant
#   thalsev, thalpul, earlobe             - documentation: not used
#   lvx1, lvx2, lvx3, lvx4, lvf, cathef   - documentation: not used
#   junk
#   name                                  - documentation: last name of patient replaced by dummy string "name"
heart_df = heart_df.drop(
    columns=[
        "Unnamed: 0",
        "ssn",
        "dummy",
        "restckm",
        "exerckm",
        "thalsev",
        "thalpul",
        "earlobe",
        "lvx1",
        "lvx2",
        "lvx3",
        "lvx4",
        "lvf",
        "cathef",
        "junk",
        "name",
    ]
)

In [4]:
# heart_df

In [5]:
# Second pass: remove further columns judged irrelevant to the analysis
# (record id, EKG / cath date parts, slope, and the per-vessel columns listed below).
heart_df = heart_df.drop(
    columns=[
        "id",
        "ekgmo",
        "ekgday",
        "ekgyr",
        "slope",
        "cmo",
        "cday",
        "cyr",
        "lmt",
        "ladprox",
        "laddist",
        "cxmain",
        "om1",
        "rcaprox",
        "rcadist",
    ]
)

In [6]:
# Distribution of the raw diagnosis codes (0-4) before they are collapsed to 0/1 below.
heart_df["num"].value_counts()

0    404
1    191
3    132
2    130
4     42
Name: num, dtype: int64

**Note: num = diagnosis of heart disease (angiographic disease status)**
* Originally 0, 1, 2, 3, 4 with 0 indicating no heart disease (< 50% artery (?) diameter narrowing) and 1, 2, 3, 4 indicating various levels of artery diameter narrowing (> 50% diameter narrowing)
* Combine 1, 2, 3, 4 into the value 1 (i.e. clip col at 1) --> **update: 0 = no heart disease, 1 = heart disease**

**Source for the rationale behind combining the varying levels of (artery?) diameter narrowing — each level indicates a different severity of heart disease, but all of them indicate its presence: https://www.sciencedirect.com/science/article/pii/S2352914820300125**

In [7]:
# Collapse the diagnosis codes 1-4 into 1 so that num is binary (0 = no heart disease, 1 = heart disease).
# The clipped column is assigned back explicitly: calling clip(..., inplace=True) on the Series obtained
# via heart_df.num is a chained in-place modification, which is deprecated in recent pandas and does not
# update heart_df at all once Copy-on-Write is enabled.
heart_df["num"] = heart_df["num"].clip(lower=0, upper=1)

In [8]:
# Check the class balance of num after clipping (only the values 0 and 1 should remain).
heart_df.num.value_counts()

1    495
0    404
Name: num, dtype: int64

In [9]:
# Number of records for each value of the sex column.
heart_df.sex.value_counts()

1    711
0    188
Name: sex, dtype: int64

In [10]:
# Recreate the pncaden column --> documentation: the sum of cols 5-7 (here painloc, painexer, relrest).
# Plain column addition is vectorised and gives the same result as a row-wise apply:
# a missing value in any of the three inputs still yields NaN for that row.
heart_df["pncaden"] = heart_df["painloc"] + heart_df["painexer"] + heart_df["relrest"]
heart_df

Unnamed: 0,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,...,restef,restwm,exeref,exerwm,thal,num,diag,ramus,om2,loc
0,65,1,1.0,1.0,1.0,3.0,4,115.0,0.0,0.0,...,,,,,7.0,1,1.0,1.0,1.0,swit
1,32,1,0.0,0.0,0.0,0.0,1,95.0,1.0,0.0,...,,,,,,1,1.0,1.0,1.0,swit
2,61,1,1.0,1.0,1.0,3.0,4,105.0,0.0,0.0,...,,,,,,1,1.0,1.0,1.0,swit
3,50,1,1.0,1.0,1.0,3.0,4,145.0,0.0,0.0,...,,,,,,1,1.0,1.0,1.0,swit
4,57,1,1.0,1.0,1.0,3.0,4,110.0,0.0,0.0,...,,,,,,1,1.0,2.0,1.0,swit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,54,0,1.0,1.0,1.0,3.0,4,127.0,0.0,333.0,...,,,,,,1,1.0,1.0,1.0,long
895,62,1,0.0,0.0,0.0,0.0,1,,0.0,139.0,...,0.41,1.0,,,,0,1.0,1.0,1.0,long
896,55,1,1.0,1.0,1.0,3.0,4,122.0,1.0,223.0,...,0.39,3.0,,,6.0,1,1.0,1.0,1.0,long
897,58,1,1.0,1.0,1.0,3.0,4,,0.0,385.0,...,,,,,,0,1.0,1.0,1.0,long


In [11]:
# Conditional replacement with .loc, i.e. df.loc[condition, "column to update"] = new_value
# (reference: https://www.kite.com/python/answers/how-to-change-values-in-a-pandas-dataframe-column-based-on-a-condition-in-python)

# Rationale: some rows report 0 in the cholesterol column, which is not possible, so treat those as missing (NaN).
# The problem is documented by researchers who used this dataset in the publication
# "Classification models for heart disease prediction using feature selection and PCA".
heart_df.loc[heart_df["chol"].eq(0), "chol"] = np.nan

In [12]:
# create an independent copy of the df so that the additional changes made below
# (via RevisedDataFrames) are applied to the copy rather than to heart_df itself
heart_df_copy = heart_df.copy()

In [13]:
# Display the copy; at this point it still has the same content as heart_df.
heart_df_copy

Unnamed: 0,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,...,restef,restwm,exeref,exerwm,thal,num,diag,ramus,om2,loc
0,65,1,1.0,1.0,1.0,3.0,4,115.0,0.0,,...,,,,,7.0,1,1.0,1.0,1.0,swit
1,32,1,0.0,0.0,0.0,0.0,1,95.0,1.0,,...,,,,,,1,1.0,1.0,1.0,swit
2,61,1,1.0,1.0,1.0,3.0,4,105.0,0.0,,...,,,,,,1,1.0,1.0,1.0,swit
3,50,1,1.0,1.0,1.0,3.0,4,145.0,0.0,,...,,,,,,1,1.0,1.0,1.0,swit
4,57,1,1.0,1.0,1.0,3.0,4,110.0,0.0,,...,,,,,,1,1.0,2.0,1.0,swit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,54,0,1.0,1.0,1.0,3.0,4,127.0,0.0,333.0,...,,,,,,1,1.0,1.0,1.0,long
895,62,1,0.0,0.0,0.0,0.0,1,,0.0,139.0,...,0.41,1.0,,,,0,1.0,1.0,1.0,long
896,55,1,1.0,1.0,1.0,3.0,4,122.0,1.0,223.0,...,0.39,3.0,,,6.0,1,1.0,1.0,1.0,long
897,58,1,1.0,1.0,1.0,3.0,4,,0.0,385.0,...,,,,,,0,1.0,1.0,1.0,long


In [14]:
class RevisedDataFrames:
    """
    Wrapper around a dataframe that can drop sparsely populated columns and list strongly correlated column pairs.

    ATTRIBUTES:
    df: the wrapped dataframe (remove_missing() drops columns from this very object, in place)
    threshold: number of missing values at and above which remove_missing() drops a column
    high_corr: list of (name of column 1, name of column 2, correlation coefficient) tuples filled by find_high_corr()
    """

    def __init__(self, df, threshold, high_corr=None):
        self.df = df
        self.threshold = threshold

        # identity check: `== None` would be evaluated element-wise for array-like arguments
        self.high_corr = [] if high_corr is None else high_corr

    def check_na(self):
        """
        PURPOSE: report the number of missing values for each column in a dataframe to help determine if these columns can and should be retained in the dataframe

        INPUT (implied using attributes of self):
        self.df: a dataframe

        OUTPUT:
        col_sums: a list of tuples, where each tuple contains (column id, column name, number of missing values in that column)
        """
        # one vectorised pass; counts are matched to columns by position, so duplicate labels are handled too
        na_counts = self.df.isna().sum().to_numpy()

        return [
            (position, name, n_missing)
            for position, (name, n_missing) in enumerate(zip(self.df.columns, na_counts))
        ]

    def remove_missing(self):
        """
        PURPOSE: take the list of tuples generated by check_na() and drop columns with more than the number of missing values deemed acceptable

        INPUT (implied using attributes of self):
        self.df: dataframe
        self.threshold: the maximum number (exclusive) of missing values deemed acceptable

        OUTPUT:
        self.df: dataframe updated to drop the columns where the sum of missing values >= self.threshold

        NOTE: the columns are dropped in place, so the dataframe that was passed to the constructor is modified
        and the returned object is that same dataframe.
        """
        # dict.fromkeys removes repeated labels while keeping the column order
        sparse_cols = list(dict.fromkeys(
            name for _, name, n_missing in self.check_na() if n_missing >= self.threshold
        ))

        if sparse_cols:
            self.df.drop(columns=sparse_cols, inplace=True)

        return self.df

    def find_high_corr(self, corr_min = 0.4):
        """
        PURPOSE: pinpoint variables in a dataframe that report a strong correlation coefficient

        INPUT (implied in part by attributes of self):
        self.df: dataframe
        corr_min: a threshold at and above which the absolute correlation of two variables is deemed high enough to be considered of interest

        OUTPUT:
        self.high_corr: a list of tuples, where each tuple contains (name of column 1, name of column 2, correlation coefficient between them)

        NOTE: each pair of columns is reported once, with the column that appears first in the dataframe listed first.
        Pairs already present in self.high_corr (in either order) are not appended again, so calling the method
        repeatedly does not create duplicate entries.
        """
        # restrict to numeric / boolean columns explicitly: recent pandas versions no longer skip
        # non-numeric columns (e.g. strings) silently in DataFrame.corr() and raise instead
        corr_df = self.df.select_dtypes(include=["number", "bool"]).corr()
        names = corr_df.columns

        already_recorded = {(entry[0], entry[1]) for entry in self.high_corr}

        # the correlation matrix is symmetric, so only the upper triangle (i < j) needs to be visited
        for i in range(len(names)):
            for j in range(i + 1, len(names)):
                first, second = names[i], names[j]
                coefficient = corr_df.iat[i, j]

                # a NaN coefficient fails the >= comparison and is therefore never reported
                if not abs(coefficient) >= corr_min:
                    continue
                if (first, second) in already_recorded or (second, first) in already_recorded:
                    continue

                self.high_corr.append((first, second, coefficient))
                already_recorded.add((first, second))

        return self.high_corr

In [15]:
# threshold = 540: columns with 540 or more missing values (roughly 60% of the 899 rows) will be dropped
revised_df = RevisedDataFrames(heart_df_copy, 540)

In [16]:
# revised_df.check_na()

In [17]:
# remove_missing() drops the sparse columns from heart_df_copy in place and returns that same object,
# so heart_df_new and heart_df_copy refer to one and the same dataframe from here on
heart_df_new = revised_df.remove_missing()

In [18]:
# heart_df_new

In [19]:
# heart_df

In [20]:
# column pairs whose absolute correlation is at least 0.4 (the default corr_min)
heart_high_corr_list = revised_df.find_high_corr()

In [21]:
# unit test to see if dataframe returned as expected using pandas.testing.assert_frame_equal:
# pandas.testing.assert_frame_equal documentation: https://pandas.pydata.org/docs/reference/api/pandas.testing.assert_frame_equal.html
# example of using pd.testing.assert_frame_equal in a unit test: https://stackoverflow.com/questions/38839402/how-to-use-assert-frame-equal-in-unittest

import unittest
class RevisedDataFrameTestCase(unittest.TestCase):
    """
    Unit tests for the RevisedDataFrames methods check_na(), remove_missing() and find_high_corr().

    Every test builds its own fixtures through the helper methods below, so the tests do not depend on
    the order in which they are run. The heart-disease fixture wraps a copy of heart_df because
    remove_missing() drops columns in place and would otherwise modify the notebook's heart_df.
    """

    @staticmethod
    def _heart_fixture():
        """Fresh object over a copy of heart_df, using the same threshold (540) as the real analysis."""
        return RevisedDataFrames(heart_df.copy(), 540)

    @staticmethod
    def _complete_fixture():
        """Fresh 5 x 4 dataframe without any missing values (threshold 3)."""
        return RevisedDataFrames(pd.DataFrame(np.arange(20).reshape((5, 4))), 3)

    @staticmethod
    def _missing_fixture():
        """Fresh 4 x 3 dataframe whose third column has two NaNs, i.e. meets the threshold of 2 for being dropped."""
        return RevisedDataFrames(pd.DataFrame(np.array([[1, 5, np.nan], [10, -8, 6.5], [0, -6.2, np.nan], [np.nan, 12, 23]])), 2)

    def _assert_corr_list(self, actual, expected):
        """Compare two find_high_corr() results: column pairs exactly, coefficients up to floating point rounding."""
        self.assertEqual([(a, b) for a, b, _ in actual], [(a, b) for a, b, _ in expected])
        for (_, _, got), (_, _, wanted) in zip(actual, expected):
            self.assertAlmostEqual(got, wanted, places=9)

    def test_1_is_remove_missing_correct(self):

        """
        PURPOSE: test to see if the method remove_missing() (and, by extension, the check_na() method) defined for the RevisedDataFrames class functions correctly
        """

        # test using the actual data of interest (checking if results as expected when transforming heart_df to heart_df_new)
        heart = self._heart_fixture()
        heart.check_na()
        heart.remove_missing()

        expected1 = heart_df_new
        pd.testing.assert_frame_equal(heart.df, expected1)

        # test using a generic array with missing values --> third col dropped since it has two NaNs and the threshold is 2
        sparse = self._missing_fixture()
        sparse.remove_missing()

        exp_df = pd.DataFrame(np.array([[1, 5], [10, -8], [0, -6.2], [np.nan, 12]]))
        pd.testing.assert_frame_equal(sparse.df, exp_df)

        # test using an array without any missing values, which should be unchanged by the methods tested here
        complete = self._complete_fixture()
        complete.remove_missing()

        expected2 = pd.DataFrame(np.arange(20).reshape((5, 4)))
        pd.testing.assert_frame_equal(complete.df, expected2)


    def test_2_find_high_corr_correct(self):

        """
        PURPOSE: test to see if the method find_high_corr() defined for the RevisedDataFrames class functions correctly
        """

        # test using the actual data of interest (i.e. heart disease data);
        # heart_high_corr_list was computed after remove_missing(), so the same step is applied here first
        heart = self._heart_fixture()
        heart.remove_missing()
        heart.find_high_corr()

        expected3 = heart_high_corr_list
        self.assertEqual(heart.high_corr, expected3)

        # test using a generic array with missing values and a specified threshold (rather than the default 0.4)
        sparse = self._missing_fixture()
        sparse.remove_missing()
        sparse.find_high_corr(0.5)

        exp_corr = [(0, 1, -0.5318756671564486)]
        self._assert_corr_list(sparse.high_corr, exp_corr)

        # test using a generic array without missing values (all columns are perfectly correlated)
        complete = self._complete_fixture()
        complete.find_high_corr(0.5)

        expected4 = [(0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), (1, 3, 1.0), (2, 3, 1.0)]
        self._assert_corr_list(complete.high_corr, expected4)

In [22]:
# instantiate the test case so that its test methods can be called directly in the following cells
tc = RevisedDataFrameTestCase()

In [23]:
# run the remove_missing() test; a failed assertion would raise here
tc.test_1_is_remove_missing_correct()

In [24]:
# run the find_high_corr() test; a failed assertion would raise here
tc.test_2_find_high_corr_correct()

In [25]:
# my_df = pd.DataFrame(np.arange(20).reshape((5, 4)))
# my_df.corr()
# expected_new = [(0, 1, 1.0), (0, 2, 1.0), (0, 3, 1.0), (1, 2, 1.0), (1, 3, 1.0), (2, 3, 1.0)]

**For comparison with one unit test example case outlined above**

In [26]:
# 4 x 3 example frame with missing values: one NaN in the first column, two in the third
missing_test_df = pd.DataFrame(
    np.array(
        [
            [1, 5, np.nan],
            [10, -8, 6.5],
            [0, -6.2, np.nan],
            [np.nan, 12, 23],
        ]
    )
)
missing_test_df

Unnamed: 0,0,1,2
0,1.0,5.0,
1,10.0,-8.0,6.5
2,0.0,-6.2,
3,,12.0,23.0


In [27]:
# expected outcome after check_na() and remove_missing() methods: the same example data without its third column
dropped_col_df = pd.DataFrame(
    np.array(
        [
            [1, 5],
            [10, -8],
            [0, -6.2],
            [np.nan, 12],
        ]
    )
)
dropped_col_df

Unnamed: 0,0,1
0,1.0,5.0
1,10.0,-8.0
2,0.0,-6.2
3,,12.0


In [28]:
# correlation matrix of the two remaining columns, shown for comparison with the unit test example case
dropped_col_df.corr()
# expected outcome after find_high_corr() method: [(0, 1, -0.5318756671564486)]

Unnamed: 0,0,1
0,1.0,-0.531876
1,-0.531876,1.0
