In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import minmax_scale

# Load the raw survey table (tab-separated) into df0; every later cell
# derives its data from this frame.
# NOTE(review): if pandas reports mixed-type columns while reading this file,
# consider passing explicit dtypes (or low_memory=False) so column types do
# not depend on how the file is chunked -- confirm before changing.
RAW_SURVEY_PATH = 'NSDUH_2015_Tab.tsv'
df0 = pd.read_csv(RAW_SURVEY_PATH, sep='\t')


  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
heroin_ever = df0['HEREVER']
meth_ever = df0['METHAMEVR']
crack_ever = df0['CRKEVER']

# Keep only respondents whose "ever used" flag equals 1 for at least one of
# the three drugs. The comparison is vectorized, so it neither loops in
# Python nor depends on df0 having a default 0..n-1 index. Missing values
# compare unequal to 1 and are therefore treated as "not used".
used_any = (heroin_ever == 1) | (meth_ever == 1) | (crack_ever == 1)

# Index labels of the rows that get discarded (none of the three flags is 1).
indexes_to_remove = df0.index[~used_any].tolist()

# The surviving rows keep their original index labels from df0.
df = df0.drop(index=indexes_to_remove)

In [5]:
# Round-trip df through a TSV file: it is written without its index and read
# back, so the reloaded frame gets a fresh default 0..n-1 index.
# Side effect: the intermediate file 'rfd.tsv' is left on disk.
# NOTE(review): df.reset_index(drop=True) would renumber the rows without
# touching the disk, but re-reading also re-infers column dtypes, so the two
# are not guaranteed to be interchangeable -- confirm before swapping.
INTERMEDIATE_PATH = 'rfd.tsv'
df.to_csv(INTERMEDIATE_PATH, sep='\t', index=False)
df = pd.read_csv(INTERMEDIATE_PATH, sep='\t')

In [7]:
# recency
# Weight assigned to each recency code; any code that is not listed
# (including missing values) receives DEFAULT_RECENCY_WEIGHT.
# NOTE(review): the meaning of the codes is not visible in this notebook --
# presumably 1/11 are the most recent use categories and 2/8 the next most
# recent; confirm against the survey codebook.
RECENCY_WEIGHTS = {1: 1, 11: 1, 2: 0.5, 8: 0.5}
DEFAULT_RECENCY_WEIGHT = 0.2


def _recency_scores(recency_codes):
    """Translate a sequence of recency codes into a list of recency weights.

    Parameters
    ----------
    recency_codes : iterable
        Recency codes, e.g. a DataFrame column. Values are read in order, so
        the result does not depend on the column's index labels.

    Returns
    -------
    list
        One weight per input value, in the same order: the entry from
        RECENCY_WEIGHTS when the code is listed there, otherwise
        DEFAULT_RECENCY_WEIGHT.
    """
    return [RECENCY_WEIGHTS.get(code, DEFAULT_RECENCY_WEIGHT)
            for code in recency_codes]


heroin_recency = df['HERREC']
heroin_recency_score = _recency_scores(heroin_recency)
df['heroin_recency_score'] = heroin_recency_score

meth_recency = df['METHAMREC']
meth_recency_score = _recency_scores(meth_recency)
df['meth_recency_score'] = meth_recency_score

crack_recency = df['CRAKREC']
crack_recency_score = _recency_scores(crack_recency)
df['crack_recency_score'] = crack_recency_score

In [9]:
# frequency
def _frequency_scores(days_used, out_range=(0.3, 1)):
    """Clean a days-of-use column and rescale it linearly into out_range.

    Parameters
    ----------
    days_used : pandas.Series
        Reported number of days of use. Only values equal to an integer in
        1..365 are kept; every other value (other codes, missing values) is
        replaced by 0.
    out_range : tuple of two numbers, optional
        Target interval; the smallest cleaned value maps to out_range[0] and
        the largest to out_range[1].

    Returns
    -------
    (list, numpy.ndarray)
        The cleaned values and their rescaled counterparts, both in the
        original row order.
    """
    is_valid = days_used.isin(range(1, 366))
    cleaned = days_used.where(is_valid, 0).tolist()

    lowest, highest = np.min(cleaned), np.max(cleaned)
    if lowest == highest:
        # np.interp needs increasing x-coordinates; when every value is the
        # same there is nothing to spread, so use the bottom of the range.
        mapped = np.full(len(cleaned), out_range[0], dtype=float)
    else:
        mapped = np.interp(cleaned, (lowest, highest), out_range)
    return cleaned, mapped


heroin_frequency = df['HERYRTOT']
heroin_freq_score, heroin_freq_score_mapped = _frequency_scores(heroin_frequency)
df['heroin_freq_score_mapped'] = heroin_freq_score_mapped

meth_frequency = df['METHAMYFQ']
meth_freq_score, meth_freq_score_mapped = _frequency_scores(meth_frequency)
df['meth_freq_score_mapped'] = meth_freq_score_mapped

crack_frequency = df['CRKYRTOT']
crack_freq_score, crack_freq_score_mapped = _frequency_scores(crack_frequency)
df['crack_freq_score_mapped'] = crack_freq_score_mapped

In [11]:
# duration: number of years between the year of first use and the survey year.
# The survey was conducted in 2015.
# IRHERYFU / IRMETHAMYFU / IRCRKYFU hold the year of first use for each drug.
# The month of first use is not included because few people reported it.
# NOTE(review): if these columns contain sentinel codes for "never used" or
# "unknown" (typically very large values -- confirm against the codebook),
# 2015 - code becomes a large negative duration and will dominate the
# min-max scaling below.
SURVEY_YEAR = 2015


def _duration_scores(first_use_year, survey_year=SURVEY_YEAR, out_range=(0.3, 1)):
    """Turn a year-of-first-use column into a duration score in out_range.

    Parameters
    ----------
    first_use_year : pandas.Series
        Year of first use, one value per respondent.
    survey_year : int, optional
        Year subtracted from; defaults to SURVEY_YEAR.
    out_range : tuple of two numbers, optional
        Target interval; the shortest duration maps to out_range[0] and the
        longest to out_range[1].

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, numpy.ndarray)
        The raw durations (survey_year - first_use_year), the durations after
        min-max scaling, and the scaled durations mapped into out_range.
    """
    duration = np.array(survey_year - first_use_year)
    normalized = minmax_scale(duration)

    lowest, highest = np.min(normalized), np.max(normalized)
    if lowest == highest:
        # np.interp needs increasing x-coordinates; when every duration is
        # the same there is nothing to spread, so use the bottom of the range.
        mapped = np.full(normalized.shape, out_range[0], dtype=float)
    else:
        mapped = np.interp(normalized, (lowest, highest), out_range)
    return duration, normalized, mapped


heroin_duration, heroin_duration_normalized, heroin_duration_mapped = \
    _duration_scores(df['IRHERYFU'])
df['heroin_duration_score_mapped'] = heroin_duration_mapped

meth_duration, meth_duration_normalized, meth_duration_mapped = \
    _duration_scores(df['IRMETHAMYFU'])
df['meth_duration_score_mapped'] = meth_duration_mapped

crack_duration, crack_duration_normalized, crack_duration_mapped = \
    _duration_scores(df['IRCRKYFU'])
df['crack_duration_score_mapped'] = crack_duration_mapped



In [13]:
def _rfd_scores(recency, frequency, duration):
    """Combine recency, frequency and duration scores into one RFD score per row.

    For each row, with R, F, D the three input scores:

    * if R equals 0.2 and F is at most 0.35, RFD = R * (F*6/5 - D*1/5);
    * otherwise, RFD = R * (F*6/5 + D*1/5).

    Values below 1 are kept; everything else (including NaN, which does not
    compare below 1) is set to 1.

    Parameters
    ----------
    recency, frequency, duration : array-like of numbers
        Per-row scores of equal length.

    Returns
    -------
    list of float
        The capped RFD scores, in input order.
    """
    R = np.asarray(recency, dtype=float)
    F = np.asarray(frequency, dtype=float)
    D = np.asarray(duration, dtype=float)

    frequency_term = F * 6 / 5
    duration_term = D * 1 / 5

    # With the lowest recency weight and a low frequency score, duration is
    # subtracted from the score instead of added to it.
    subtract_duration = (R == .2) & (F <= .35)
    rfd = R * np.where(subtract_duration,
                       frequency_term - duration_term,
                       frequency_term + duration_term)
    return np.where(rfd < 1, rfd, 1.0).tolist()


heroin_rfd_score = _rfd_scores(heroin_recency_score,
                               heroin_freq_score_mapped,
                               heroin_duration_mapped)
df['Heroin RFD Score'] = heroin_rfd_score

meth_rfd_score = _rfd_scores(meth_recency_score,
                             meth_freq_score_mapped,
                             meth_duration_mapped)
df['Meth RFD Score'] = meth_rfd_score

crack_rfd_score = _rfd_scores(crack_recency_score,
                              crack_freq_score_mapped,
                              crack_duration_mapped)
df['Crack RFD Score'] = crack_rfd_score

# Overall score: the sum of the three per-drug scores, capped at 1.
combined_rfd = (np.asarray(heroin_rfd_score)
                + np.asarray(meth_rfd_score)
                + np.asarray(crack_rfd_score))
final_rfd_score = np.where(combined_rfd > 1, 1.0, combined_rfd).tolist()

df['RFD Score'] = final_rfd_score

In [15]:
# Finally, keep only the columns whose names appear in the 'vars' column of
# clean_vars.xlsx and save the reduced table.
# Any column of df that is not listed there is dropped.

features = pd.read_excel('clean_vars.xlsx')
var_list = features['vars'].to_numpy()
wanted_columns = set(var_list)

axes = df.columns.to_numpy()
rm_col_index = []
for column_name in axes:
    if column_name in wanted_columns:
        continue
    rm_col_index.append(column_name)

df1 = df.drop(columns=rm_col_index)

# index=False is not passed, so the row index is written as the first
# (unnamed) column of the output file.
# NOTE(review): confirm downstream readers expect that extra column.
df1.to_csv('Final with RFD and selected features.tsv', sep='\t')