In [16]:
import os

import numpy as np
import pandas as pd
import plotly.express as px

In [17]:
# Load de.csv, parsing the DATE column as datetimes.
# The path defaults to the original author's local file; set the
# DATAFEST_DE_CSV environment variable to point at the CSV on another machine
# so this cell does not have to be edited to re-run the notebook.
df_de = pd.read_csv(
    os.environ.get('DATAFEST_DE_CSV', 'D:\\OneDrive\\Vassar\\DataFest\\data\\DE\\de.csv'),
    parse_dates=['DATE'],
)

In [18]:
def get_drug_names(df):
    """Return drug-name prefixes derived from the frame's 'NMU' columns.

    Every column whose name contains the substring 'NMU' is collected in
    column order, only the first 14 of them are kept, and each name is cut
    at its first underscore (e.g. 'FENT_NMU' -> 'FENT').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    list of str
        At most 14 prefixes, in the order the columns appear.
    """
    nmu_columns = list(filter(lambda name: 'NMU' in name, df.columns))
    # Only the first 14 matches are treated as drug columns.
    return [name.partition('_')[0] for name in nmu_columns[:14]]

In [19]:
def get_use_cat(df):
    """Return a copy of ``df`` with one ``<drug>_USE_CAT`` column per drug.

    For every drug name returned by ``get_drug_names(df)`` the new column is
    ``<drug>_USE + <drug>_NMU`` with missing sums replaced by 0. According to
    the original author's notes the resulting codes mean:

    * 0 -> never used
    * 1 -> used for prescription purposes
    * 2 -> used for recreational purposes

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``<drug>_USE`` and ``<drug>_NMU`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    # Work on a copy so the caller's dataset is not affected.
    new_df = df.copy()
    for drug in get_drug_names(df):
        combined = new_df[f'{drug}_USE'] + new_df[f'{drug}_NMU']
        # Assign the filled result back explicitly. Calling
        # ``new_df[col].fillna(..., inplace=True)`` operates on a temporary
        # selection and does not update ``new_df`` under pandas Copy-on-Write.
        new_df[f'{drug}_USE_CAT'] = combined.fillna(0)
    return new_df

In [20]:
def calculate_proportions(df):
    """Compute, per drug, the share of users whose use category is 2.

    The frame is first passed through ``get_use_cat``; for each drug from
    ``get_drug_names(df)`` the rows with ``<drug>_USE_CAT == 1`` and
    ``<drug>_USE_CAT == 2`` are counted and the value
    ``count(2) / (count(1) + count(2))`` is stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``get_use_cat``.

    Returns
    -------
    dict
        Maps drug name -> fraction in [0, 1] (not multiplied by 100).
        The value is NaN when a drug has no rows in category 1 or 2.
    """
    new_df = get_use_cat(df)
    proportions = {}
    for drug in get_drug_names(df):
        use_cat = new_df[f'{drug}_USE_CAT']
        # Count with boolean sums instead of value_counts().loc[...], which
        # raises KeyError when a category does not occur for this drug.
        num_pre = (use_cat == 1).sum()
        num_rec = (use_cat == 2).sum()
        total = num_pre + num_rec
        # Avoid 0/0 when nobody falls into either category.
        proportions[drug] = num_rec / total if total else float('nan')
    return proportions

In [21]:
def get_pres_predictors(df):
    """Return only the ``<drug>_USE_CAT`` columns derived from ``df``.

    The frame is expanded with ``get_use_cat`` and then narrowed to the
    use-category column of each drug listed by ``get_drug_names(df)``,
    in that order.
    """
    with_categories = get_use_cat(df)
    category_columns = [f'{name}_USE_CAT' for name in get_drug_names(df)]
    return with_categories[category_columns]

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [37]:
def decision_tree(df):
    """Fit a depth-2 decision tree on the use-category columns and print scores.

    The predictors are the ``<drug>_USE_CAT`` columns produced by
    ``get_pres_predictors(df)`` and the target is ``df['DAST_CAT']``. The data
    is split 67/33 into train and test sets (``random_state=42``). The
    function prints the mean 10-fold cross-validation score on the training
    set and the accuracy on the held-out test set, both as percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``DAST_CAT`` column plus the columns required by
        ``get_pres_predictors``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Build predictors from the frame passed in (not the notebook-global
    # df_de) so features and target always come from the same data.
    X_pres = get_pres_predictors(df)
    y = df['DAST_CAT']
    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X_pres, y, test_size=0.33, random_state=42
    )
    # train model; the estimator gets its own name so it does not shadow
    # this function's name inside the body
    model = DecisionTreeClassifier(random_state=42, max_depth=2)
    model = model.fit(X_train, y_train)
    # mean 10-fold cross-validation score on the training portion
    cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()
    print(f'CV-Score: {round(cv_score*100, 4)}%')
    # check performance in the test set: the mean of the element-wise match
    # is the accuracy, and unlike value_counts()[True] it does not raise
    # KeyError when no prediction is correct
    acc = (model.predict(X_test) == y_test.to_numpy()).mean()
    print(f'Accuracy: {round(acc*100, 4)}%')

In [38]:
# Evaluate the decision tree on df_de; the call prints the CV score and the test-set accuracy.
decision_tree(df_de)

CV-Score: 52.6673%
Accuracy: 53.4931%
