In [27]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
#from sklearn.impute import IterativeImputer

import util
from imputation import check_missing, impute_na_with_random, convert_numeric
from pca import pca, create_correlation_matrix
from boosting import calculate_feature_importances
from nn import create_model

In [2]:
# Load the raw tab-separated survey extract into a DataFrame and preview it.
df = pd.read_csv("../data/drugs.tsv", sep="\t")
df

Unnamed: 0,CASEID,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
0,1,48694667,1,99,99,19,2012,7,1,1,...,1,4,1,1,2,2,2,4398.40,30017,1
1,2,88530883,1,99,99,14,9999,99,2,93,...,1,4,1,1,1,1,2,1419.19,30052,2
2,3,33251077,1,99,99,14,9999,99,1,2,...,1,99,9,9,3,3,2,14052.62,30028,1
3,4,37814127,1,99,99,16,9999,99,4,93,...,1,4,1,1,1,1,2,10848.18,30055,2
4,5,18762590,1,99,99,14,9999,99,4,93,...,1,1,1,1,2,2,2,5651.73,30013,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55155,55156,13675473,2,99,99,991,9991,91,91,91,...,1,2,1,1,3,3,2,679.36,30003,1
55156,55157,49609908,1,99,99,16,9999,99,4,93,...,1,1,1,1,2,2,2,2296.28,30057,2
55157,55158,81795924,2,99,99,991,9991,91,91,91,...,1,4,1,1,3,3,2,17180.64,30026,1
55158,55159,17198338,2,4,4,991,9991,91,91,91,...,9,99,9,9,1,1,2,3104.06,30051,2


Before we alter any columns, let us see if there are any duplicate entries.
In the cell below, we see the length is the same, therefore we verify each 
Question ID is unique (this is important for ensuring no duplicate entries)

In [3]:
# Count the distinct question IDs. This matches the number of rows only when no
# ID is repeated; value_counts().sum() merely counts the non-null rows, so it
# would equal the row count even if duplicate IDs were present.
df['QUESTID2'].nunique()

55160

## Wrangling and Cleaning

First, we will keep only the rows where the respondent's age is between 12 and 25. Then we will assess the missingness of each column by first converting every *'MISSING'*, *'SKIPPED'*, *'CHOSE NOT TO RESPOND'*, etc. code defined in the Codebook documentation to NA. We will also use vectorized functions to make use of efficient algorithms.

In [4]:
# Keep only age categories 1 and 2 (ages 12 - 25), then key the frame by CASEID.
df = df.loc[df['CATAGE'].isin([1, 2])].set_index('CASEID')

Let's also see if we can partition the operations to make the function faster, since we are dealing with a dataset of about a billion observations.
We will attempt to do this with Dask, a distributed systems library, and if it is faster we will use it for later steps as well.

In [5]:
#dask_df = dd.from_pandas(df, npartitions=10)  # You can adjust the number of partitions
# Apply function to each partition
#dask_df = dask_df.map_partitions(lambda df: 
#                                 df.applymap(check_missing)).compute(scheduler='processes')

# Convert back to pandas DataFrame (optional)
#dask_df

Using the `%timeit` magic, we see that there is no significant difference when using Dask. This is probably because our dataset sits in a middle ground: it is large enough to take a while to process on a local system, but not large enough to cause memory issues or to benefit from parallelizing.

Moving on, we will use applymap as normal

In [6]:
# make all missing values as pd.NA
# check_missing (imported from the local imputation module) is called once per cell of the
# frame, and the result replaces df, so re-running this cell re-applies it to the cleaned data.
# NOTE(review): DataFrame.applymap is deprecated as of pandas 2.1 in favour of DataFrame.map —
# switch once the pandas version used for this notebook is confirmed.
df = df.applymap(check_missing)
df

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CG30EST,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


Now, since there are many columns and a lot of NA values, we will drop all columns with over 95% missingness as the amount of data in these columns is not enough in comparison to the rest of the data and may not be a good representation of the overall distribution of these specific features.

In [7]:
# Retain only the columns whose share of missing cells is strictly below 95%.
df_subset = df.loc[:, df.isna().mean().lt(0.95)]
df_subset

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,1,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


Let's see the proportion of NA values in each column.

In [8]:
# Per-column fraction of missing cells: NA count divided by the number of rows.
columns_nas = df_subset.isna().sum().div(len(df_subset))
columns_nas

QUESTID2    0.062880
CIGEVER     0.000000
CIGOFRSM    0.590585
CIGWILYR    0.590641
CIGTRY      0.619878
              ...   
COUTYP2     0.000000
MAIIN002    0.000000
ANALWT_C    0.070210
VESTR       0.000000
VEREP       0.000000
Length: 1912, dtype: float64

Let's verify that all columns with over 95% missingness have been dropped.

In [9]:
# Any column listed here would still exceed the 95% missingness cutoff.
columns_nas95 = columns_nas.loc[columns_nas.gt(0.95)]
columns_nas95

Series([], dtype: float64)

Let's get a list of the 'bad' columns. These are columns representing hard drugs (anything other than Tobacco, Alcohol, and Marijuana).

In [10]:
# Column names to exclude, as returned by the project-local util helper for the given
# codebook PDF path (the notebook text describes these as the hard-drug columns).
bad_columns = util.get_bad_columns(filepath='../data/Data-Codebook.pdf')

In [11]:
#bad_columns

In [12]:
# Remove the excluded columns; names that are not present in the frame are skipped silently.
df_subset = df_subset.drop(bad_columns, axis="columns", errors="ignore")
df_subset

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,,,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,,,14,,,2,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,,,14,,,1,2,1,...,1,,9,9,3,3,2,14052.62,30028,1
8,35106150,1,,,22,,,2,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,,,16,,,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,,,18,,,3,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,,,16,,,4,,,...,1,1,1,1,2,2,2,2296.28,30057,2


HLTINDRG: represents drug abuse

Let's see its value counts so we can get an idea of the missingness in the response vector.

In [13]:
# Shift the response down by one so the two observed classes print as 0 and 1;
# missing entries stay missing.
y = df_subset['HLTINDRG'] - 1
print(y.value_counts())
# Count the missing entries directly instead of subtracting the two largest class
# counts from the length: that arithmetic miscounts as soon as a third code is
# present and raises IndexError when fewer than two classes are observed.
print('Number of NA values: ', y.isna().sum())

0    8361
1    2584
Name: HLTINDRG, dtype: int64
Number of NA values:  24933


Since we have a large proportion of missing values, we must assess how to handle them.

Data Exclusion: This strategy, also known as listwise or complete-case deletion, involves removing all cases (rows) with missing values. This is the simplest approach and can be used when the number of cases with missing response values is small and are Missing Completely at Random (MCAR). However, you can lose a lot of data if the number of missing values is substantial.

Imputation: You can still use imputation techniques, but need to choose ones that are suitable for binary variables. Common techniques include:

- Mode imputation: Replace the missing values with the mode (most common value) of the non-missing values. This is a simple and fast method, but it can introduce bias if the missing data is not MCAR.

- Predictive mean matching (PMM): This involves using a model to predict the missing values from the other data, and then taking a random draw from the observed values with similar predicted values. This is more robust than simple model-based imputation.

- Logistic regression imputation: Since the response variable is binary, you can fit a logistic regression model using the other variables, and then predict the missing values.

- Multiple Imputation: Similar to the continuous case, multiple imputation can be used. It acknowledges the uncertainty about the imputation by creating several different plausible imputed datasets and combining results obtained from each of them.

Remember, the way you handle missing data in the response variable should depend on the reason why the data is missing. For example, if a response is missing not at random, meaning that the missingness is related to the underlying, unobserved response, excluding these cases or filling in with imputed values can lead to bias in your results. In such cases, more advanced techniques might be necessary, possibly involving using models that can handle missing data directly.

## Imputation

Since we have a lot of missing values, we are going to assess handling them in two different ways:

- listwise deletion: dropping all rows where the response vector value is NA, then sampling from the distribution and imputing to preserve the shape and nature. 
- probabilistic: sampling from the distribution and imputing to preserve the shape and nature in both X and y. 

Then we will compare the distributions between these two, as well as run them through our model to see if there are significant differences in either. 

Our hypothesis is that probabilistic imputation will be a better way of filling in values in our design matrix, X, and that we can just drop rows after this imputation step where the value in our response vector, y, is `<NA>`.

Now that we have our data cleaned, imputed, and subsetted to only the columns and rows relevant to our analysis, we can prepare it for PCA. By doing PCA, we want to find the most significant feature vectors that explain the variance in the dataset, and ultimately contribute to drug abuse, so that we can discover where to prevent drug usage among teens.

In [14]:
# Drop the QUESTID2 identifier column by name rather than by position, so the
# intended column is removed even if the column order changes upstream (a
# positional slice would silently discard whichever column happens to be first).
X = df_subset.drop(columns='QUESTID2')
X

Unnamed: 0_level_0,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,CIG30BR2,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,,,19,2012,7,1,1,2,,...,1,4,1,1,2,2,2,4398.4,30017,1
2,1,,,14,,,2,,,,...,1,4,1,1,1,1,2,1419.19,30052,2
3,1,,,14,,,1,2,1,112,...,1,,9,9,3,3,2,14052.62,30028,1
8,1,,,22,,,2,,,,...,1,1,1,1,2,2,2,3211.24,30024,1
9,1,,,16,,,1,20,3,112,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,2,,,,,,,,,,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,1,,,18,,,3,,,,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,2,,,,,,,,,,...,1,2,1,1,3,3,2,679.36,30003,1
55157,1,,,16,,,4,,,,...,1,1,1,1,2,2,2,2296.28,30057,2


Dropping all rows where the response vector value is NA, then sampling to impute missing values in X

In [15]:
# Listwise deletion: discard every row whose HLTINDRG response is coded 94 or is missing.
X_listwise_deletion = df_subset.drop(
    index=df_subset.loc[
        df_subset['HLTINDRG'].eq(94) | df_subset['HLTINDRG'].isna()
    ].index
)
#X_listwise_deletion = X_listwise_deletion.apply(lambda column: column.where(column.notna(), np.random.choice(column.dropna().values, size=len(column))))
X_listwise_deletion # len should be 8361 + 2584 based on y.value_counts()

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,51195804,2,4,4,,,,,,,...,1,,9,9,2,3,2,241.7,30016,2
14,71728934,2,,,,,,,,,...,1,2,1,1,1,1,2,662.83,30028,1
18,68794044,2,4,4,,,,,,,...,1,,9,9,2,2,2,369.95,30034,2
19,67154315,2,3,3,,,,,,,...,1,,9,9,1,1,2,1245.3,30024,2
21,41568573,1,,,20,,,1,21,3,...,1,4,1,1,2,3,2,,30030,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55137,29379780,2,4,3,,,,,,,...,1,,9,9,1,1,2,1141.87,30038,2
55143,27846250,2,3,3,,,,,,,...,1,,9,9,2,2,2,1429.14,30050,2
55144,10834015,2,4,4,,,,,,,...,9,,9,9,1,1,2,738.54,30046,1
55151,44065933,2,4,4,,,,,,,...,9,,9,9,2,2,2,931.12,30030,2


Imputing All NA values with values that are from the distribution (this step is harder and takes a longer amount of time)

In [16]:
# Apply the function on each column of df_subset
# impute_na_with_random comes from the local imputation module and receives one column at a time.
# NOTE(review): the HLTINDRG response column is still part of df_subset here, so it is passed
# through the imputer together with the features — confirm that is intended.
# NOTE(review): no random seed is set in the visible cells; if the helper samples randomly, the
# imputed values will differ between runs — confirm and seed if so.
X_probabilistic = df_subset.apply(impute_na_with_random)
X_probabilistic

Unnamed: 0_level_0,QUESTID2,CIGEVER,CIGOFRSM,CIGWILYR,CIGTRY,CIGYFU,CIGMFU,CIGREC,CIG30USE,CIG30AV,...,II2EMSTY,EMPSTAT4,IIEMPST4,II2EMST4,PDEN00,COUTYP2,MAIIN002,ANALWT_C,VESTR,VEREP
CASEID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,48694667,1,4,3,19,2012,7,1,1,2,...,1,4,1,1,2,2,2,4398.4,30017,1
2,88530883,1,3,4,14,2011,1,2,30,3,...,1,4,1,1,1,1,2,1419.19,30052,2
3,33251077,1,4,4,14,2012,5,1,2,1,...,1,4,9,9,3,3,2,14052.62,30028,1
8,35106150,1,4,3,22,2011,12,2,30,3,...,1,1,1,1,2,2,2,3211.24,30024,1
9,67182690,1,4,4,16,2012,1,1,20,3,...,1,4,1,1,1,1,2,6396.73,30038,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55153,31776420,2,4,4,18,2012,3,1,30,1,...,1,4,1,1,2,2,2,1649.35,30024,2
55155,10772244,1,4,4,18,2012,10,3,12,3,...,1,1,1,1,1,1,2,4594.84,30002,2
55156,13675473,2,4,4,14,2011,5,4,6,4,...,1,2,1,1,3,3,2,679.36,30003,1
55157,49609908,1,4,4,16,2013,8,4,7,1,...,1,1,1,1,2,2,2,2296.28,30057,2


In [17]:
# Standardise every column: learn the per-column statistics first, then apply them.
scaler = StandardScaler().fit(X_probabilistic)
scaled_data = scaler.transform(X_probabilistic)
scaled_data

array([[-0.20577752, -1.27195782,  0.39728661, ...,  1.66990924,
        -0.76247569, -1.02129816],
       [ 1.3569023 , -1.27195782, -2.06963084, ..., -0.15140678,
         1.25156655,  0.97914599],
       [-0.81159275, -1.27195782,  0.39728661, ...,  7.57193883,
        -0.12949098, -1.02129816],
       ...,
       [-1.57949704,  0.78618959,  0.39728661, ..., -0.60369588,
        -1.56809258, -1.02129816],
       [-0.16987479, -1.27195782,  0.39728661, ...,  0.38479513,
         1.53928687,  0.97914599],
       [-1.44130344,  0.78618959,  0.39728661, ...,  0.87862492,
         1.19402248,  0.97914599]])

## Feature Significance

Now before we choose a model, we need to assess the most important features. Since we are working with mostly categorical columns, PCA will not work well compared to Tree Based Algorithms. To start we have decided to use XGBoost to calculate feature importances.

### XGBoost

Some notes on XGBoost:

- Speed and Performance: comparatively faster than other ensemble classifiers!

- Parallelizable: Because the core XGBoost algorithm is parallelizable, it can harness the power of multicore computers. It is also parallelizable onto GPU's and across networks of computers, making it feasible to train models on very large datasets.

- Wide Variety of Tuning Parameters: XGBoost internally has parameters for cross-validation, regularization, user-defined objective functions, missing values, tree parameters, scikit-learn compatible API, etc.

- Handling Missing Values: XGBoost has an in-built routine to handle missing values. (Our cleaning should have handled this)

- Regularization: XGBoost is also well known for having built-in L1 (Lasso Regression) and L2 (Ridge Regression) regularization which prevents the model from overfitting.

In [26]:
#pca(X=X_probabilistic, scaled_data=scaled_data, thres=.95)
#correlation_matrix = create_correlation_matrix(X_probabilistic)
#correlation_matrix

#most_correlated_features = abs(correlation_matrix).drop(["SUMFLAG"]).sort_values(ascending = False)
#most_correlated_features

In [18]:
# convert response vector to 0s and 1s
# Guarded so the cell is safe to re-run: once the response has been shifted its
# minimum is 0, and subtracting 1 again would turn the labels into -1/0.
# assumes the unshifted HLTINDRG codes start at 1 — TODO confirm against the codebook
if X_probabilistic['HLTINDRG'].min() >= 1:
    X_probabilistic['HLTINDRG'] = X_probabilistic['HLTINDRG'] - 1

# ensure all column dtypes are int / float, not object
X_probabilistic = convert_numeric(X_probabilistic)

# get most important
most_important_features = calculate_feature_importances(df=X_probabilistic, response='HLTINDRG')



In [25]:
# Show only the features that received a strictly positive importance score.
most_important_features.loc[most_important_features['Importance'].gt(0)]

Unnamed: 0,Feature,Importance
1747,HLTINALC,0.028417
1748,HLTINMNT,0.014057
1760,IRPRVHLT,0.013716
1745,PRVHLTIN,0.010891
1481,YSPEC,0.003848
...,...,...
1457,YUTPFEAR,0.000230
1414,ADPBRMBR,0.000204
549,MJFRAME3,0.000169
286,TOBYR,0.000134


### Neural Networks

Neural networks, including Convolutional Neural Networks (CNNs), can be used with categorical data, but the data needs to be prepared appropriately and the right kind of network architecture must be chosen.

We should consider:

- Embedding Layers: In Neural Networks, categorical data is typically transformed into numeric form using an embedding layer, which maps each category to a dense vector of real numbers. The advantage of this approach is that the network can learn a representation that captures the relationships between categories in a way that one-hot encoding cannot. Our dataset appears to have satisfied this

- Network Architecture: CNNs might not be the best fit for categorical data, *unless* the categorical data has some spatial or sequential structure. Fully connected networks (FCNs) or Multi-layer Perceptrons (MLPs) are usually more appropriate for structured, tabular data.

- Overfitting: Neural networks, particularly deep ones, can overfit to the training data if it is not large or diverse enough. Techniques like regularization, dropout, and early stopping can mitigate this risk.

In [None]:
# Collect the feature names as a plain list, then hand them to create_model
# together with the imputed frame.
nn_feature_names = most_important_features['Feature'].tolist()
nn_model = create_model(nn_feature_names, X_probabilistic)

### Boosted Decision Trees

[Read more here](https://dsc-courses.github.io/dsc140a-2023-wi/materials/lectures/17-boosting/slides.pdf)