In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
#from sklearn.impute import IterativeImputer

import util

In [2]:
df = pd.read_table("../data/drugs.tsv")
df

Unnamed: 0,CASEID,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
0,1,48694667,1,99,99,19,2012,7,1,1,...,1,4,1,1,2,2,2,4398.40,30017,1
1,2,88530883,1,99,99,14,9999,99,2,93,...,1,4,1,1,1,1,2,1419.19,30052,2
2,3,33251077,1,99,99,14,9999,99,1,2,...,1,99,9,9,3,3,2,14052.62,30028,1
3,4,37814127,1,99,99,16,9999,99,4,93,...,1,4,1,1,1,1,2,10848.18,30055,2
4,5,18762590,1,99,99,14,9999,99,4,93,...,1,1,1,1,2,2,2,5651.73,30013,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55155,55156,13675473,2,99,99,991,9991,91,91,91,...,1,2,1,1,3,3,2,679.36,30003,1
55156,55157,49609908,1,99,99,16,9999,99,4,93,...,1,1,1,1,2,2,2,2296.28,30057,2
55157,55158,81795924,2,99,99,991,9991,91,91,91,...,1,4,1,1,3,3,2,17180.64,30026,1
55158,55159,17198338,2,4,4,991,9991,91,91,91,...,9,99,9,9,1,1,2,3104.06,30051,2


Before we alter any columns, let us see if there are any duplicate entries.
In the cell below, we see the length is the same, therefore we verify each 
Question ID is unique (this is important for ensuring no duplicate entries)

In [3]:
df['QUESTID2'].value_counts().sum() 

55160

## Wrangling and Cleaning

First we will only look at the data where the age of the respondant is between [12, 25]. Then we will assess the missingness of each column by first changing all *'MISSING'* *'SKIPPED'* *'CHOSE NOT TO RESPOND'*, etc values denoted by the Codebook documentation. We will also use vectorized functions to make use of efficient algorithms. 

In [4]:
def check_missing(x):
    """
    Checks whether or not a value in a series ends with a certain number

    Parameters
    ----------
    x : int
        value from series
    
    Returns
    -------
    np.Nan or x : True if it does end with the number otherwise false
    """
    str_x = str(x)
    if str_x.endswith(('91', '93', '94', '97', '98', '99', '.0')):
        return pd.NA
    else: 
        return x

In [5]:
# only want ages 12 - 25
df = df[(df['CATAGE'] == 1) | (df['CATAGE'] == 2)]
df = df.set_index('CASEID')

Lets also see if we can partition the operations to make the function faster, since we are dealing with a dataset of about a billion observations.
We will attempt to do this through Dask, a distributed systems library, and if it is faster we will apply this later on in the future.

In [6]:
#dask_df = dd.from_pandas(df, npartitions=10)  # You can adjust the number of partitions
# Apply function to each partition
#dask_df = dask_df.map_partitions(lambda df: 
#                                 df.applymap(check_missing)).compute(scheduler='processes')

# Convert back to pandas DataFrame (optional)
#dask_df

Using the % timeit function, we see that there is no significant difference in using Dask. This is probably because our Dataset is large enough to take a while to run on, but not large enough to where it would warrant memory issues. It is in a middle ground where it is large enough to where it would take a while on a local system, but not large enough to benefit from parallizing. 

Moving on, we will use applymap as normal

In [7]:
# make all missing values as pd.NA
df = df.applymap(check_missing)
df

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CG30EST,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


Now, since there are many columns and a lot of NA values, we will drop all columns with over 95% missingness as the amount of data in these columns is not enough in comparison to the rest of the data and may not be a good representation of the overall distribution of these specific features.

In [39]:
df_subset = df.loc[:, df.isna().mean() < 0.95]
df_subset

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,1,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


Lets see the proportion of NA values in each column

In [40]:
columns_nas = df_subset.isna().sum()/df_subset.shape[0]
columns_nas

QUESTID2    0.062880
CIGEVER     0.000000
CIGOFRSM    0.590585
CIGWILYR    0.590641
CIGTRY      0.619878
              ...   
COUTYP2     0.000000
MAIIN002    0.000000
ANALWT_C    0.070210
VESTR       0.000000
VEREP       0.000000
Length: 1912, dtype: float64

Lets Verify that All Columns of 95% Missingness Have Been Dropped

In [41]:
columns_nas95 = columns_nas[columns_nas > 0.95]
columns_nas95 

Series([], dtype: float64)

Lets get a list of the 'bad' columns. These are columns representing hard drugs (anything other than Tobacco, Alcohol, and Marijuana)

In [42]:
bad_columns = util.get_bad_columns(filepath='../data/Data-Codebook.pdf')

In [43]:
#bad_columns

In [44]:
df_subset = df_subset.drop(columns=bad_columns, errors='ignore')
df_subset

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,1,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


HLTINDRG: represents drug abuse

Lets see the value counts for it so we can get an idea of the missingness in the response vector.

In [52]:
y = df_subset['HLTINDRG']
print(y.value_counts())
print('Number of NA values: ', len(y) - y.value_counts().iloc[0] - y.value_counts().iloc[1])

1    8361
2    2584
Name: HLTINDRG, dtype: int64
Number of NA values:  24933


Since we have a large proportion of missing values, we must assess how to handle them.

Data Exclusion: This strategy, also known as listwise or complete-case deletion, involves removing all cases (rows) with missing values. This is the simplest approach and can be used when the number of cases with missing response values is small and are Missing Completely at Random (MCAR). However, you can lose a lot of data if the number of missing values is substantial.

Imputation: You can still use imputation techniques, but need to choose ones that are suitable for binary variables. Common techniques include:

- Mode imputation: Replace the missing values with the mode (most common value) of the non-missing values. This is a simple and fast method, but it can introduce bias if the missing data is not MCAR.

- Predictive mean matching (PMM): This involves using a model to predict the missing values from the other data, and then taking a random draw from the observed values with similar predicted values. This is more robust than simple model-based imputation.

- Logistic regression imputation: Since the response variable is binary, you can fit a logistic regression model using the other variables, and then predict the missing values.

- Multiple Imputation: Similar to the continuous case, multiple imputation can be used. It acknowledges the uncertainty about the imputation by creating several different plausible imputed datasets and combining results obtained from each of them.

Remember, the way you handle missing data in the response variable should depend on the reason why the data is missing. For example, if a response is missing not at random, meaning that the missingness is related to the underlying, unobserved response, excluding these cases or filling in with imputed values can lead to bias in your results. In such cases, more advanced techniques might be necessary, possibly involving using models that can handle missing data directly.

## Imputation

Since we have a lot of missing values, we are going to assess handling them in two different ways:

- listwise deletion: dropping all rows where the response vector value is NA, then sampling from the distribution and imputing to preserve the shape and nature. 
- probabilistic: sampling from the distribution and imputing to preserve the shape and nature in both X and y. 

Then we will compare the distributions between these two, as well as run them through our model to see if there are significant differences in either. 

Our hypothesis is that probabisitic will be a better way of imputing values in our design matrix, X, and that we can just drop rows after this imputation step where the value in our response vector, y are <NA>

Now that we have are data cleaned, imputed, and subsetted to only the columns and rows relevant to our analysis, we can prepare it for PCA. By doing PCA, we want to find the most signifcant feature vectors that explain the variance in the dataset, and ultimately contribute to drug abuse so that we can discover where to prevent drug usage among teens.

In [53]:
X = df_subset.iloc[:, 1:]
X

Unnamed: 0_level_0,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,CIG30BR2,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,,,19,2012,7,1,1,2,,...,1,4,1,1,2,2,2,4398.4,30017,1
2,1,,,14,,,2,,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,1,,,14,,,1,2,1,112,...,1,,9,9,3,3,2,14052.62,30028,1
8,1,,,22,,,2,,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,1,,,16,,,1,20,3,112,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,2,,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,1,,,18,,,3,,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,2,,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,1,,,16,,,4,,,,...,1,1,1,1,2,2,2,2296.28,30057,2


In [58]:
def impute_na_with_random(series):
    """
    Replaces NA/NaN values in a pandas Series with samples drawn from the 
    non-missing values of that series.

    Parameters
    ----------
    series : pandas Series
        row from X

    Returns
    -------
    pandas Series : The input series with NA/NaN values replaced 
                    by sampled non-missing values.
    """
    mask = series.isna()
    samples = series[~mask].sample(mask.sum(), replace=True)
    samples.index = series[mask].index
    series[mask] = samples
    return series

Dropping all rows where the response vector value is NA, then sampling to impute missing values in X

In [57]:
X_listwise_deletion = df_subset.drop(df_subset[(df_subset['HLTINDRG'] == 94) | (df_subset['HLTINDRG'].isna())].index)
X_listwise_deletion = X_listwise_deletion.apply(impute_na_with_random)
X_listwise_deletion # len should be 8361 + 2584 based on y.value_counts()

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,3,4,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,4,4,14,2013,7,2,1,3,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,4,4,14,2013,12,1,2,1,...,1,3,9,9,3,3,2,14052.62,30028,1
8,35106150,1,4,4,22,2011,89,2,11,2,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,4,4,16,2012,10,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,4,4,16,2012,7,1,1,5,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,4,3,18,2012,1,3,30,3,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,4,4,16,9985,7,1,15,3,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,4,4,16,2012,1,4,4,3,...,1,1,1,1,2,2,2,2296.28,30057,2


Imputing All NA values with values that are from the distribution (this step is harder and takes a longer amount of time)

In [59]:
# Apply the function on each column of df_subset
X_probabilistic = df_subset.apply(impute_na_with_random)
X_probabilistic

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,3,4,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,4,4,14,2013,7,2,1,3,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,4,4,14,2013,12,1,2,1,...,1,3,9,9,3,3,2,14052.62,30028,1
8,35106150,1,4,4,22,2011,89,2,11,2,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,4,4,16,2012,10,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,4,4,16,2012,7,1,1,5,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,4,3,18,2012,1,3,30,3,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,4,4,16,9985,7,1,15,3,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,4,4,16,2012,1,4,4,3,...,1,1,1,1,2,2,2,2296.28,30057,2


In [19]:
imputer = SimpleImputer(strategy='mean')
imputed_data_probabilistic = imputer.fit_transform(X_probabilistic)

In [20]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(imputed_data_probabilistic)

In [21]:
scaled_data

array([[-0.20611092, -1.27195782, -2.07751832, ...,  1.67099356,
        -0.76247569, -1.02129816],
       [ 1.3587172 , -1.27195782,  0.40044718, ..., -0.15089467,
         1.25156655,  0.97914599],
       [-0.812759  , -1.27195782,  0.40044718, ...,  7.57487739,
        -0.12949098, -1.02129816],
       ...,
       [-1.58171898,  0.78618959,  0.40044718, ..., -0.60332587,
        -1.56809258, -1.02129816],
       [-0.17015884, -1.27195782, -2.07751832, ...,  0.38547569,
         1.53928687,  0.97914599],
       [-1.4433354 ,  0.78618959,  0.40044718, ...,  0.87946063,
         1.19402248,  0.97914599]])

In [22]:
pca = PCA()
pca.fit(scaled_data)

PCA()

In [23]:
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

variance_threshold = 0.95  # Set your desired threshold here
n_components = np.argmax(cumulative_variance_ratio >= variance_threshold) + 1

In [24]:
n_components

849

only 889 components contribute to the variance. This variance could be skewed due to the 99,999, or 9999 values that aren't considered missing

In [25]:
loadings = pca.components_
absolute_loadings = np.abs(loadings)
feature_importance = np.mean(absolute_loadings, axis=0)
sorted_indices = np.argsort(feature_importance)[::-1]
feature_names = list(X_probabilistic.columns)
for i in sorted_indices:
    print(f"{feature_names[i]}: {feature_importance[i]}")

STMMON: 0.01541173921043977
CPNSTMMN: 0.015292414418632928
SPILANAL: 0.01506784830309735
HALCION2: 0.014979560375467684
DRGTXER: 0.014884661751194031
PHENOBR2: 0.014852493118977352
ABUSESTM: 0.014788031841780768
MTHLTSC4: 0.014743316126571785
ABUSEANL: 0.014708255569975986
NEMBBAR2: 0.014704911542607233
EQUANIL2: 0.014690613236822483
DEPNDHAL: 0.014687165964812588
TRQMON: 0.014660645340257928
METHAQ2: 0.014638996201416043
SEDMON: 0.014617144079935446
OXYYR: 0.014582217566414138
HERYR: 0.014580067139044455
MEPROB2: 0.014576876418249592
IIMTHRC: 0.01456829578610593
CRKMON: 0.014559778047851352
ALCTXER: 0.014553680307858394
BUSPAR2: 0.014538227597631707
IIHALRC: 0.01452068885374839
TALACEN2: 0.014512638745315339
II2HALRC: 0.014500213838927991
MILTOWN2: 0.014480753919231672
II2MTHRC: 0.014443036244316953
VISTAR2: 0.014441650942923127
DALMANE2: 0.014413310191587122
ABUSEHER: 0.014393876614889733
IIECSRC: 0.014383224083890237
LIBRIUM2: 0.014376995164059053
BUTISOL2: 0.01437328207999183
DEPND

In [26]:
X_probabilistic['SUMFLAG']

CASEID
1        0
2        1
3        0
8        0
9        1
        ..
55153    0
55155    1
55156    0
55157    1
55159    0
Name: SUMFLAG, Length: 35878, dtype: int64

In [27]:
correlation_matrix = X_probabilistic.corr()
correlation_with_sumflag = correlation_matrix["SUMFLAG"]
correlation_with_sumflag

CIGEVER    -0.562219
IRCIGRC    -0.566895
IICIGRC     0.028558
II2CIGRC    0.029182
IRCGRRC    -0.443111
              ...   
PDEN00     -0.007015
COUTYP2    -0.010971
MAIIN002   -0.017569
VESTR      -0.002928
VEREP       0.001787
Name: SUMFLAG, Length: 1158, dtype: float64

In [28]:
most_correlated_features = abs(correlation_with_sumflag).drop(["SUMFLAG"]).sort_values(ascending = False)
most_correlated_features

FUSUM21     0.967264
MRJFLAG     0.879985
IIMJAGE     0.879033
IRMJRC      0.870129
IIMJYFU     0.869983
              ...   
IIPINC3     0.000673
FDOCSCHL    0.000669
IIFAMSZ2    0.000501
PAROL       0.000429
IIFSTAMP    0.000311
Name: SUMFLAG, Length: 1157, dtype: float64