In [1]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('framingham_3000.csv')

In [4]:
df['TenYearCHD'].value_counts()

TenYearCHD
0    2541
1     459
Name: count, dtype: int64

In [5]:
negative_positions = (df < 0) #no negative values
rows, cols = negative_positions.values.nonzero()
for row, col in zip(rows, cols):
    print(f"Negative value at row {df.index[row]}, column {df.columns[col]}: {df.iat[row, col]}")

In [6]:
duplicates = df[df.duplicated()] #no duplicates
print(duplicates)

Empty DataFrame
Columns: [male, age, education, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose, TenYearCHD]
Index: []


In [7]:
df.isnull().sum()

male                 0
age                  0
education           75
currentSmoker        0
cigsPerDay          15
BPMeds              40
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             38
sysBP                0
diaBP                0
BMI                 17
heartRate            1
glucose            282
TenYearCHD           0
dtype: int64

In [8]:
df_imputed = df.copy() #imputing null values
# When selecting columns to impute
columns_to_impute = ['education', 'cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'glucose', 'heartRate']

# Apply median imputation ONLY to these columns
for column in columns_to_impute:
    median_value = df_imputed[column].median()
    df_imputed[column].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[column].fillna(median_value, inplace=True)


In [9]:
df_imputed['TenYearCHD'].value_counts()

TenYearCHD
0    2541
1     459
Name: count, dtype: int64

In [25]:
df_outliers_handled['TenYearCHD'].value_counts()

TenYearCHD
0    2541
1     459
Name: count, dtype: int64

In [31]:
from sklearn.feature_selection import SelectKBest, f_classif

# Assuming df is your DataFrame and 'target' is the categorical column
X = df_outliers_handled.drop(columns=['TenYearCHD'])  # Features (continuous variables)
y = df_outliers_handled['TenYearCHD']  # Target (categorical variable)

# Perform ANOVA F-test
selector = SelectKBest(score_func=f_classif, k='all')  # k can be set to 'all' or a specific number of top features
selector.fit(X, y)

# Get the p-values of the features
p_values = selector.pvalues_

# Create a DataFrame with features and their corresponding p-values
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'P-Value': p_values
})

# Sort features by p-value (lowest p-value first)
sorted_features = feature_scores.sort_values(by='P-Value', ascending=True)

# Optional: Select features with p-value below a threshold (e.g., 0.05)
significant_features = sorted_features[sorted_features['P-Value'] < 0.05]
print(f"Significant features: \n{significant_features}")


Significant features: 
            Feature       P-Value
1               age  5.928346e-35
10            sysBP  5.365650e-32
7      prevalentHyp  2.110050e-20
14          glucose  7.178498e-15
11            diaBP  1.266971e-14
8          diabetes  2.080084e-10
0              male  5.501063e-06
9           totChol  6.150936e-06
12              BMI  9.997507e-06
4        cigsPerDay  2.405940e-04
6   prevalentStroke  2.974041e-04
2         education  7.623249e-04
5            BPMeds  1.229069e-03


In [32]:
df_outliers_handled.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [34]:
# Get the list of significant feature names
selected_columns = significant_features['Feature'].tolist()

# Include the target column
selected_columns.append('TenYearCHD')

# Create a new DataFrame with only the selected features and target
df_selected = df_outliers_handled[selected_columns]

In [35]:
df_selected.columns

Index(['age', 'sysBP', 'prevalentHyp', 'glucose', 'diaBP', 'diabetes', 'male',
       'totChol', 'BMI', 'cigsPerDay', 'prevalentStroke', 'education',
       'BPMeds', 'TenYearCHD'],
      dtype='object')

In [36]:
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df_selected.drop(columns=['TenYearCHD'])
y = df_selected['TenYearCHD']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine back into a single DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['TenYearCHD'] = y_resampled

In [41]:
df_balanced.columns

Index(['age', 'sysBP', 'prevalentHyp', 'glucose', 'diaBP', 'diabetes', 'male',
       'totChol', 'BMI', 'cigsPerDay', 'prevalentStroke', 'education',
       'BPMeds', 'TenYearCHD'],
      dtype='object')

In [42]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
0    2541
1    2541
Name: count, dtype: int64

In [45]:
df_balanced.to_csv('data.csv', index=False)