# Import

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp

In [2]:
# Load the sdd_archive data set into a DataFrame
# (pd.read_table splits on tabs by default)
data_path = '~/Desktop/Data/sdd_archive.tab'
df = pd.read_table(data_path, low_memory=False)

In [3]:
# Select required variables and assign to df1.
# A list (not a tuple) is used as the column indexer: in .loc a tuple is the
# key form for MultiIndex labels, a list is the documented way to pick several
# columns. .copy() makes df1 independent of df, so later in-place cleaning of
# df1 cannot touch (or warn about) the raw frame.
required_columns = [
    'pupilwt', 'age1115', 'sex', 'ddwbscore', 'ddwbcat',
    'dgtdcan', 'dgtdamp', 'dgtdlsd', 'dgtdecs', 'dgtdcok',
    'dgtdket', 'dgtdnox', 'dgtdleg',
    'devrstm', 'devrpsy', 'devropi', 'devrcla', 'devrps',
    'ddgany',
]
df1 = df.loc[:, required_columns].copy()

# Clean Data

In [6]:
# Create functions for cleaning missing values

def CleanData(df1):
    """Replace the missing-value codes -1, -8 and -9 with NaN, in place.

    Applies to the sex column, the individual drug columns (dgtd*), the drug
    group columns (devr*) and ddgany. Other columns are left untouched.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing all of the columns listed in ``columns`` below.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is not present in ``df1``.
    """
    nan_values = [-1, -8, -9]  # These variables have values missing at -1,-8,-9
    columns = (
        'sex',
        'dgtdcan', 'dgtdamp', 'dgtdlsd', 'dgtdecs',
        'dgtdcok', 'dgtdket', 'dgtdnox', 'dgtdleg',
        'devrstm', 'devrpsy', 'devropi', 'devrcla', 'devrps',
        'ddgany',
    )
    for column in columns:
        # Assign the cleaned column back to the frame instead of calling
        # replace(..., inplace=True) on df1.<column>: the in-place form works
        # on an intermediate Series and is not guaranteed to update df1.
        df1[column] = df1[column].replace(nan_values, np.nan)
    
def CleanWell(df1):
    """Replace the missing-value codes -8, -9 and -98 with NaN, in place.

    Applies only to the wellbeing columns ddwbscore and ddwbcat; note that
    -1 is not treated as missing here, unlike in CleanData.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing the columns ``ddwbscore`` and ``ddwbcat``.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``ddwbscore`` or ``ddwbcat`` is not present in ``df1``.
    """
    nan_values = [-8, -9, -98]  # These variables have values missing at -8,-9,-98
    for column in ('ddwbscore', 'ddwbcat'):
        # Assign back to the frame rather than using replace(..., inplace=True)
        # on df1.<column>, which is not guaranteed to update df1 itself.
        df1[column] = df1[column].replace(nan_values, np.nan)

In [7]:
# Run functions
# Both calls work on df1 itself (nothing is returned): the missing-value
# codes listed inside each function are turned into NaN.

CleanData(df1)  # sex, drug and drug-group columns: codes -1, -8, -9
CleanWell(df1)  # wellbeing columns: codes -8, -9, -98

In [9]:
# Replace each NaN with the mean of its column.
# numeric_only=True restricts the means to numeric columns, so this cell still
# runs if it is re-executed after text labels have been written into df1.
# NOTE(review): the coded columns (e.g. sex = 1/2) receive non-integer means
# here; a mode-based fill may be more appropriate for them - confirm.
df1 = df1.fillna(df1.mean(numeric_only=True))
# Only the last expression of a cell is rendered, so this preview is not shown.
df1.head()

# Check datatype
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12051 entries, 0 to 12050
Data columns (total 19 columns):
pupilwt      12051 non-null int64
age1115      12051 non-null int64
sex          12051 non-null int64
ddwbscore    12051 non-null int64
ddwbcat      12051 non-null int64
dgtdcan      12051 non-null int64
dgtdamp      12051 non-null int64
dgtdlsd      12051 non-null int64
dgtdecs      12051 non-null int64
dgtdcok      12051 non-null int64
dgtdket      12051 non-null int64
dgtdnox      12051 non-null int64
dgtdleg      12051 non-null int64
devrstm      12051 non-null int64
devrpsy      12051 non-null int64
devropi      12051 non-null int64
devrcla      12051 non-null int64
devrps       12051 non-null int64
ddgany       12051 non-null int64
dtypes: int64(19)
memory usage: 1.7 MB


In [None]:
# Change floats to int.
# Round first: astype(int) on its own truncates toward zero, so a mean-imputed
# value between 1 and 2 (e.g. 1.9) would always become code 1 rather than the
# nearest code.
# NOTE(review): this also converts pupilwt to whole numbers; if that column
# holds fractional weights they are lost here - confirm it should be included.
df1 = df1.round().astype(int)

In [10]:
# Create functions for binary variables

def CleanBin(df1):
    """Replace the numeric codes 1 and 2 with text labels, in place.

    Labels applied:
      * sex: 1 -> 'male', 2 -> 'female'
      * ever-tried drug columns (dgtd*), drug group columns (devr*) and
        ddgany: 1 -> 'yes', 2 -> 'no'
      * ddwbcat: 1 -> 'low wellbeing', 2 -> 'not low wellbeing'

    Values other than 1 and 2 are left unchanged.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing all of the columns named above. It is modified
        in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is not present in ``df1``.
    """
    yes_no = {1: 'yes', 2: 'no'}
    yes_no_columns = (
        # Ever tried drug variables
        'dgtdcan', 'dgtdamp', 'dgtdlsd', 'dgtdecs',
        'dgtdcok', 'dgtdket', 'dgtdnox', 'dgtdleg',
        # Ever tried drug group variables
        'devrstm', 'devrpsy', 'devropi', 'devrcla', 'devrps',
        # Ever tried any drug
        'ddgany',
    )

    labels = {'sex': {1: 'male', 2: 'female'}}
    labels.update({column: yes_no for column in yes_no_columns})
    labels['ddwbcat'] = {1: 'low wellbeing', 2: 'not low wellbeing'}

    for column, mapping in labels.items():
        # Assign back to the frame rather than using replace(..., inplace=True)
        # on df1.<column>, which is not guaranteed to update df1 itself.
        df1[column] = df1[column].replace(mapping)
    
# Run function for binary variables
# (works on df1 itself: the 1/2 codes are replaced by their text labels)
CleanBin(df1)

# Check data: preview the first five rows of df1 after labelling
df1.head()

Unnamed: 0,pupilwt,age1115,sex,ddwbscore,ddwbcat,dgtdcan,dgtdamp,dgtdlsd,dgtdecs,dgtdcok,dgtdket,dgtdnox,dgtdleg,devrstm,devrpsy,devropi,devrcla,devrps,ddgany
0,0,12,female,17,not low wellbeing,no,no,no,no,no,yes,yes,no,no,yes,no,yes,no,yes
1,1,12,male,19,not low wellbeing,no,no,no,no,no,yes,no,yes,no,no,no,no,yes,yes
2,0,14,female,20,not low wellbeing,no,no,no,no,no,no,yes,no,no,no,no,no,no,no
3,0,14,male,13,not low wellbeing,no,no,no,no,no,yes,yes,no,no,yes,no,no,no,yes
4,0,13,male,19,not low wellbeing,no,no,no,no,no,no,yes,no,no,no,no,no,no,no


# Descriptive Statistics

In [11]:
# Summarise the wellbeing scores: mean and variance, printed one per line

wb_scores = df1.ddwbscore
wbmean, wbvar = np.mean(wb_scores), np.var(wb_scores)  # mean wellbeing score, variance
print(wbmean, wbvar, sep='\n')

14.64351506099079
18.143103535146043


In [12]:
# Create plot to show distribution of wellbeing score
#sns.distplot(df1.ddwbscore)
#plt.title('Histogram of Wellbeing Scores')
#plt.xlabel('Wellbeing Scores')
#plt.ylabel('Estimated Density')
#plt.xticks((0,2,4,6,8,10,12,14,16,18,20))
#plt.xlim([1,20])
#plt.savefig('wellbeing_hist.jpg')
#plt.figure()

In [None]:
# Set up plot: histogram, kernel density estimate and rug of the wellbeing scores
# NOTE: sns.distplot is deprecated in newer seaborn releases; it is kept here
# because the KDE curve is read back from the axes it returns.
ax = sns.distplot(df1.ddwbscore, rug=True,
                  kde_kws={"label": "Kernel Density", "color": 'k'},
                  hist_kws={"label": "Histogram", "color": 'c'}
                  )
# Set parameters for visualising central tendency of wellbeing scores
# NOTE(review): assumes the first line on the axes is the KDE curve - confirm
# for the installed seaborn version.
xk, yk = ax.get_lines()[0].get_data()
mm = np.mean(df1.ddwbscore)
md = np.median(df1.ddwbscore)
mo = xk[np.argmax(yk)]  # x position of the highest point of the KDE curve

# Plot central tendency of wellbeing scores - including mean, median and KDE estimated mode
fig, ax_ct = plt.subplots(figsize=(5, 2))
ax_ct.plot(xk, yk, '-k')
xx = np.ones(2)
# Marker lines run from 0 up to the KDE peak. A fixed height (previously 0.8)
# is unrelated to the density scale and flattens the curve on the shared y-axis.
yy = np.array([0, np.max(yk)])
ax_ct.plot(mm * xx, yy, '--b', label='Mean')
ax_ct.plot(md * xx, yy, '-.r', label='Median')
ax_ct.plot(mo * xx, yy, ':m', label='KDE-estimated Mode')
ax_ct.set_xlabel('Wellbeing Score')
ax_ct.set_ylabel('Estimated Density')
ax_ct.set_xlim([0, 20])
ax_ct.legend()
fig.tight_layout()
fig.savefig('wellbeing_cent_tend.jpg')
plt.show()

