# Avivo - Analysis on changes from Admission to Discharge

Importing data and loading libraries:

In [1]:
#importing libraries
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import pandas_profiling
import numpy as np
#import statsmodels.api as sm
import scipy
import re
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

#setting visualization style and the default figure size
sns.set()
# plt.figure(figsize=...) only opens a single empty figure (which is what produced
# the stray "<matplotlib.figure.Figure ...>" output); rcParams sets the size used
# by every figure created afterwards. Set after sns.set(), which resets rcParams.
plt.rcParams['figure.figsize'] = (8, 5)

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



<matplotlib.figure.Figure at 0x1bde906d160>

<matplotlib.figure.Figure at 0x1bde906d160>

In [2]:
# Read the master extract, report its dimensions and preview the first rows.
# NOTE: the path is an absolute, machine-specific location.
master = pd.read_csv(filepath_or_buffer='C:/Users/prabl/Documents/Star82/Data/master.csv')
print('Shape of dataframe: ', master.shape)
master.head(n=5)

Shape of dataframe:  (827, 120)


Unnamed: 0,Form Date,Program_admission,Staff_admission,Client Number,Age_admission,Age of first use (Primary),Age of first use (Secondary),Age of first use (Tertiary),Are any children living with someone else due to CPS court order or other action,CHSR Dimension 1_admission,...,Client_number,Program,Code,start_service_data,end_service_data,data_birth,gender,race,number_hours,year
0,12/5/2014 0:00,Treatment - Family - Outpatient,"Stewart, Lorene",111156598,,13.0,19,13,Not applicable - No children/no child protect ...,Minor problem,...,111156598,Treatment - Family - Outpatient,H2035 HH HQ U60945: IOP Family - IDD w/kids - ...,12/3/2014 0:00,12/30/2014 0:00,7/3/1979 0:00,Female,White,47.0,2014
1,11/10/2014 0:00,Treatment - IDD Men,"Salsness, Carrie",111156572,,9.0,9,0,Not applicable - No children/no child protect ...,No problem,...,111156572,Treatment - IDD Men,H2035 HH0944: IDD - Individual - Drug,11/6/2014 0:00,12/17/2014 0:00,2/19/1988 0:00,Male,Black or African American,87.0,2014
2,11/14/2014 0:00,Treatment - Relapse Men,"Salsness, Carrie",111156525,,16.0,0,0,Not applicable - No children/no child protect ...,No problem,...,111156525,Treatment - Relapse Men,H2035 HQ0945: Relapse - Group - Alcohol,11/14/2014 0:00,12/26/2014 0:00,5/7/1991 0:00,Male,White,78.0,2014
3,1/29/2015 0:00,Treatment - IDD Women,"Stewart, Lorene",111157248,,14.0,14,35,Not applicable - No children/no child protect ...,No problem,...,111157248,Treatment - IDD Women,H2035 HH0945: IDD - Individual - Alcohol,1/28/2015 0:00,5/1/2015 0:00,10/25/1968 0:00,Female,Black or African American,221.0,2015
4,1/5/2015 0:00,Treatment - IDD Men,"Stewart, Lorene",111156885,,42.0,18,19,Not applicable - No children/no child protect ...,Minor problem,...,111156885,Treatment - IDD Men,H2035 HH HQ0944: IDD - Group - Drug,1/2/2015 0:00,2/20/2015 0:00,3/15/1966 0:00,Male,White,99.0,2015


Looking at the sample we see that Age_admission has all NaN's, let's look into this:

In [3]:
# Count the missing entries in Age_admission (to compare against the row count of the frame)
master['Age_admission'].isnull().sum()

827

All the rows are Null values! Thus we can drop this column out.

In [4]:
# Age_admission holds no data at all, so remove the column from the frame
master.drop(labels=['Age_admission'], axis='columns', inplace=True)

Let us now look at the list of columns in our dataframe:

In [5]:
# Column labels as a plain Python list
master.columns.tolist()

['Form Date',
 'Program_admission',
 'Staff_admission',
 'Client Number',
 'Age of first use (Primary)',
 'Age of first use (Secondary)',
 'Age of first use (Tertiary)',
 'Are any children living with someone else due to CPS court order or other action',
 'CHSR Dimension 1_admission',
 'CHSR Dimension 2_admission',
 'CHSR Dimension 3_admission',
 'CHSR Dimension 4_admission',
 'CHSR Dimension 5_admission',
 'CHSR Dimension 6_admission',
 'Client currently involved with CPS',
 'Current CD Treatment admission',
 'Current labor status',
 'Current marital status',
 'Currently enrolled in school or job training program',
 'Currently smoke cigarettes',
 'Currently under the jurisdiction of the court or on probation/parole of',
 'Did drug court refer?',
 'Does client have children',
 'Does this involvement result from',
 "Driver's license revoked due to DWI",
 'Education',
 'For how many children has the client lost parental rights',
 'Hispanic Ethnicity',
 'History of injection drug use',
 '

Let us fix the column names, i.e remove the spaces in column names. Further, we also notice the following: 
* There are 4 Client number and 3 Program columns. Ideally these should be redundant, so let's look into these!
* There are 2 age_discharge columns as well - one from the discharge dataset and one from the demographics.
* Race_discharge and admission are also two columns and race_discharge is a column that has been broken down into Race1-Race4 in the demographics data
* There are 3 columns for the sex of the client as well
* 3 Columns for Date of birth as well

In [6]:
#Replacing all spaces, '_' and '/' in column names with '.' for uniformity
def _normalise_column(name):
    """Return a column label with outer whitespace stripped, separators
    (characters matched by the first pattern) turned into '.' and '?' removed."""
    dotted = re.sub(r'[_| |/]', r'.', name.strip())
    return re.sub(r'[?]', r'', dotted)

master.columns = [_normalise_column(label) for label in master.columns]
list(master.columns)

['Form.Date',
 'Program.admission',
 'Staff.admission',
 'Client.Number',
 'Age.of.first.use.(Primary)',
 'Age.of.first.use.(Secondary)',
 'Age.of.first.use.(Tertiary)',
 'Are.any.children.living.with.someone.else.due.to.CPS.court.order.or.other.action',
 'CHSR.Dimension.1.admission',
 'CHSR.Dimension.2.admission',
 'CHSR.Dimension.3.admission',
 'CHSR.Dimension.4.admission',
 'CHSR.Dimension.5.admission',
 'CHSR.Dimension.6.admission',
 'Client.currently.involved.with.CPS',
 'Current.CD.Treatment.admission',
 'Current.labor.status',
 'Current.marital.status',
 'Currently.enrolled.in.school.or.job.training.program',
 'Currently.smoke.cigarettes',
 'Currently.under.the.jurisdiction.of.the.court.or.on.probation.parole.of',
 'Did.drug.court.refer',
 'Does.client.have.children',
 'Does.this.involvement.result.from',
 "Driver's.license.revoked.due.to.DWI",
 'Education',
 'For.how.many.children.has.the.client.lost.parental.rights',
 'Hispanic.Ethnicity',
 'History.of.injection.drug.use',
 'H

In [7]:
# Now working to remove duplicate columns

#Checking if client number is the same across all four client-number columns.
print(master['Client.number'].equals(master['Client.Number.x']))
print(master['Client.Number.x'].equals(master['Client.Number.y']))
# 'Client.Number' is the column that is kept afterwards, so it has to be compared
# as well; otherwise the three columns being dropped are only checked against
# each other. (Series.equals is False if the dtypes differ, even for equal values.)
print(master['Client.Number'].equals(master['Client.Number.x']))

True
True


In [8]:
#The client-number copies are identical, so remove three of them
master.drop(
    labels=['Client.Number.x', 'Client.Number.y', 'Client.number'],
    axis='columns',
    inplace=True,
)

In [9]:
# Program is stored three times; compare the other two copies
# against the admission value.
for program_col in ('Program', 'Program.discharge'):
    print(master[program_col].equals(master['Program.admission']))

False
False


In [10]:
# Rows whose admission program differs from the discharge program
temp = master.loc[master['Program.admission'].ne(master['Program.discharge'])]
temp.loc[:, ['Program.admission', 'Program.discharge']]

Unnamed: 0,Program.admission,Program.discharge
4,Treatment - IDD Men,CC - Aftercare Plus
8,Treatment - Relapse Women,Treatment - IDD Women
13,Treatment - IDD Women,Treatment - Family - Outpatient
42,Assessments,Treatment - IDD Men
49,CC - WRRI,Treatment - IDD Women
56,Treatment - IDD Men,Treatment - Relapse Men
57,Treatment - IDD Women,Hennepin CSP Pending Membership
59,Assessments,Treatment - Relapse Men
64,Treatment - Family - Outpatient,CC - MARFU
74,Assessments - Referred from Outside,Treatment - IDD Women


In [11]:
# Rows whose Program value differs from the admission program
temp = master.loc[master['Program'].ne(master['Program.admission'])]
temp.loc[:, ['Program', 'Program.admission']]

Unnamed: 0,Program,Program.admission
8,Treatment - IDD Women,Treatment - Relapse Women
13,Treatment - Family - Outpatient,Treatment - IDD Women
42,Treatment - IDD Men,Assessments
49,Treatment - IDD Women,CC - WRRI
56,Treatment - Relapse Men,Treatment - IDD Men
59,Treatment - Relapse Men,Assessments
74,Treatment - IDD Women,Assessments - Referred from Outside
75,Treatment - Relapse Women,Treatment - IDD Women
77,Treatment - Relapse Men,Treatment - IDD Men
107,Treatment - IDD Men,Assessments - Referred from Outside


In [12]:
# Distinct labels found in the Program.admission column
master['Program.admission'].unique()

array(['Treatment - Family - Outpatient', 'Treatment - IDD Men',
       'Treatment - Relapse Men', 'Treatment - IDD Women',
       'Treatment - Relapse Women', 'Assessments', 'CC - WRRI',
       'Assessments - Referred from Outside', 'Family Residential'], dtype=object)

In [13]:
# Distinct labels found in the Program.discharge column
master['Program.discharge'].unique()

array(['Treatment - Family - Outpatient', 'Treatment - IDD Men',
       'Treatment - Relapse Men', 'Treatment - IDD Women',
       'CC - Aftercare Plus', 'Treatment - Relapse Women',
       'Hennepin CSP Pending Membership', 'CC - MARFU', 'Assessments',
       'Assessments - Referred from Outside', 'Family Residential'], dtype=object)

In [14]:
# Distinct labels found in the Program column
master['Program'].unique()

array(['Treatment - Family - Outpatient', 'Treatment - IDD Men',
       'Treatment - Relapse Men', 'Treatment - IDD Women',
       'Treatment - Relapse Women'], dtype=object)

In [15]:
# Retain only the rows whose discharge program matches the admission program
master = master.loc[master['Program.discharge'].eq(master['Program.admission'])]

In [16]:
# Let's now check that both columns are equal after the filtering above
print(master['Program.discharge'].equals(master['Program.admission']))

# NOTE: the drop that used to follow targeted the misspelt label 'Program.dsicharge',
# which does not exist and makes the cell fail on a fresh Restart & Run All.
# 'Program.discharge' is removed together with 'Program.admission' once the
# comparison with 'Program' is done (see the drop of both columns further down),
# so it must stay in the frame at this point.

True


In [17]:
# Compare the Program column from the demographics data with the admission program
temp = master.loc[master['Program'].ne(master['Program.admission'])]
temp.loc[:, ['Program', 'Program.admission']]

Unnamed: 0,Program,Program.admission
75,Treatment - Relapse Women,Treatment - IDD Women
537,Treatment - Family - Outpatient,Family Residential


In [18]:
#Keep only the rows where Program and Program.admission agree (removes the mismatching rows)
master = master.loc[master['Program'].eq(master['Program.admission'])]

#Confirm the two columns are now identical, then remove the redundant Program columns
print(master['Program'].equals(master['Program.admission']))
master.drop(labels=['Program.admission', 'Program.discharge'], axis='columns', inplace=True)

True


In [19]:
# (rows, columns) of the frame after the Program consistency filtering
master.shape

(793, 114)

We lost 34 rows, due to this filtering.  
Let us now look at the age at discharge columns

In [20]:
# Element-wise comparison of the two age-at-discharge columns
print(master['Age.at.Discharge'].equals(master['Age.discharge']))

False


On investigating, it was found that 'Age.discharge' was calculated relative to an arbitrary date in 2018, whereas 'Age.at.Discharge' was calculated using   
Age = Discharge_date - Birth_date. So we will drop the 'Age.discharge' column.

In [21]:
# Keep 'Age.at.Discharge' and remove the other age-at-discharge column
master.drop(labels=['Age.discharge'], axis='columns', inplace=True)

For the Race columns, we will drop the Race.discharge, Race.admission, Client.Race, Race and race columns, as these are redundant, and keep the Race1–Race4 columns from the demographics data.

In [22]:
# Remove the redundant race columns
master.drop(
    labels=['Race.discharge', 'Race.admission', 'race', 'Client.Race', 'Race'],
    axis='columns',
    inplace=True,
)

In [23]:
#Inspecting the categories of the three redundant sex/gender columns
for sex_col in ('Sex.x', 'Sex.y', 'gender'):
    print(master[sex_col].unique())

['Female' 'Male' nan]
['Female' 'Male' 'Unknown']
['Female' 'Male' 'Unknown']


In [24]:
# Treat the 'Unknown' label as a missing value in both columns
for sex_col in ('Sex.y', 'gender'):
    master.loc[master[sex_col].eq('Unknown'), sex_col] = np.nan

In [25]:
# Confirm that 'Unknown' no longer appears in either column
print(master['Sex.y'].unique())
print(master['gender'].unique())

['Female' 'Male' nan]
['Female' 'Male' nan]


In [26]:
# Check whether Sex.x matches Sex.y and gender element-wise
print(master['Sex.x'].equals(master['Sex.y']))
print(master['Sex.x'].equals(master['gender']))

True
True


Thus we can now drop both of the columns as well as the Client.gender columns as these are redundant- 

In [27]:
# Keep 'Sex.x' and remove the other sex/gender columns
master.drop(labels=['Sex.y', 'gender', 'Client.Gender'], axis='columns', inplace=True)

In [28]:
#Checking Date of Birth: compare Date.of.Birth.x with the two other birth-date columns
print(master['Date.of.Birth.x'].equals(master['Date.of.Birth.y']))
print(master['Date.of.Birth.x'].equals(master['data.birth']))

True
True


In [29]:
#The birth-date columns are identical, so keep 'Date.of.Birth.x' and remove the other two
master.drop(labels=['Date.of.Birth.y', 'data.birth'], axis='columns', inplace=True)

We notice that the data-set has values of 99/88 in place of Unknown/NA. Let's replace all 99 and 88 values with Nan:

In [30]:
# 99/88 (and the textual variants below) stand for Unknown/NA, but the replace
# is applied to every column. 'number.hours' is a measured quantity in which 99 or 88
# hours are genuine values (the raw data contains a 99.0-hour record), so keep a copy
# and restore it after the blanket replacement.
service_hours = master['number.hours'].copy()
master.replace([99,88,'99','88','n/A','N/A','Unknown'],np.nan, inplace=True)
master['number.hours'] = service_hours
# NOTE(review): other count-like columns (arrests, days used, ages) may also contain
# genuine 88/99 values - confirm which columns actually use these codes.

In [31]:
# Preview the first ten rows after the clean-up steps so far
master.head(10)

Unnamed: 0,Form.Date,Staff.admission,Client.Number,Age.of.first.use.(Primary),Age.of.first.use.(Secondary),Age.of.first.use.(Tertiary),Are.any.children.living.with.someone.else.due.to.CPS.court.order.or.other.action,CHSR.Dimension.1.admission,CHSR.Dimension.2.admission,CHSR.Dimension.3.admission,...,Race1,Race2,Race3,Race4,Program,Code,start.service.data,end.service.data,number.hours,year
0,12/5/2014 0:00,"Stewart, Lorene",111156598,13.0,19.0,13.0,Not applicable - No children/no child protect ...,Minor problem,Minor problem,Moderate problem,...,White,,,,Treatment - Family - Outpatient,H2035 HH HQ U60945: IOP Family - IDD w/kids - ...,12/3/2014 0:00,12/30/2014 0:00,47.0,2014
1,11/10/2014 0:00,"Salsness, Carrie",111156572,9.0,9.0,0.0,Not applicable - No children/no child protect ...,No problem,No problem,Moderate problem,...,Black or African American,,,,Treatment - IDD Men,H2035 HH0944: IDD - Individual - Drug,11/6/2014 0:00,12/17/2014 0:00,87.0,2014
2,11/14/2014 0:00,"Salsness, Carrie",111156525,16.0,0.0,0.0,Not applicable - No children/no child protect ...,No problem,No problem,Moderate problem,...,White,,,,Treatment - Relapse Men,H2035 HQ0945: Relapse - Group - Alcohol,11/14/2014 0:00,12/26/2014 0:00,78.0,2014
3,1/29/2015 0:00,"Stewart, Lorene",111157248,14.0,14.0,35.0,Not applicable - No children/no child protect ...,No problem,Minor problem,Minor problem,...,Black or African American,,,,Treatment - IDD Women,H2035 HH0945: IDD - Individual - Alcohol,1/28/2015 0:00,5/1/2015 0:00,221.0,2015
5,1/8/2015 0:00,"Stewart, Lorene",111156911,12.0,0.0,0.0,Yes,No problem,Moderate problem,Minor problem,...,White,,,,Treatment - Family - Outpatient,H2035 HH HQ U60944: IOP Family - IDD w/kids - ...,1/16/2015 0:00,5/7/2015 0:00,202.0,2015
6,1/9/2015 0:00,"Stewart, Lorene",111156735,40.0,23.0,21.0,Not applicable - No children/no child protect ...,Moderate problem,Moderate problem,Minor problem,...,Declined to Specify,,,,Treatment - Relapse Women,H2035 HQ0944: UBH/Medica - Hourly - Group - Re...,1/7/2015 0:00,2/4/2015 0:00,43.0,2015
7,1/14/2015 0:00,"Stewart, Lorene",111156952,18.0,14.0,14.0,Not applicable - No children/no child protect ...,No problem,Moderate problem,Moderate problem,...,Black or African American,,,,Treatment - IDD Men,H2035 HH HQ0945: IDD - Group - Alcohol,1/13/2015 0:00,3/4/2015 0:00,78.0,2015
9,1/22/2015 0:00,"Ans, Rachel",111157035,12.0,12.0,12.0,Not applicable - No children/no child protect ...,No problem,No problem,Moderate problem,...,White,,,,Treatment - IDD Men,H2035 HH HQ0944: IDD - Group - Drug,1/21/2015 0:00,4/10/2015 0:00,177.0,2015
10,1/22/2015 0:00,"Ans, Rachel",111157109,17.0,23.0,12.0,Not applicable - No children/no child protect ...,No problem,No problem,Moderate problem,...,White,,,,Treatment - Relapse Men,H2035 HQ0945: Relapse - Group - Alcohol,1/20/2015 0:00,4/10/2015 0:00,205.0,2015
11,1/26/2015 0:00,"Stewart, Lorene",111157042,,,,Not applicable - No children/no child protect ...,No problem,Minor problem,Minor problem,...,Black or African American,,,,Treatment - Relapse Men,H2035 HQ0944: Relapse - Group - Drug,1/23/2015 0:00,4/17/2015 0:00,192.0,2015


In [32]:
#Looking at some of the columns related to involvement with children:
#check whether the two children-count columns are identical
print(master['Number.of.children'].equals(master['How.many.children']))

False


In [33]:
# Show the two children-count columns side by side to see where they disagree
master[['Number.of.children','How.many.children']]

Unnamed: 0,Number.of.children,How.many.children
0,2.0,2.0
1,,
2,,1.0
3,3.0,
5,4.0,4.0
6,,4.0
7,0.0,0.0
9,0.0,0.0
10,3.0,3.0
11,2.0,3.0


Looks like a lot of values are inconsistent! **Which one do we pick?**   
Next, let's convert the other number-of-children columns from categorical to numeric.

In [34]:
# Inspect the categories before converting this column to numbers
master['How.many.of.the.children.are.living.with.someone.else.for.these.reasons'].unique()

array(['No children/no child protection involvement', 'Four', 'One',
       'Three', 'Five', nan, 'Two', 'Ten or more', 'Six', 'Seven'], dtype=object)

In [35]:
#Mapping the spelled-out counts of this one column onto numbers
conv_dict = {
    'How.many.of.the.children.are.living.with.someone.else.for.these.reasons': {
        "No children/no child protection involvement": 0,
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5,
        "Six": 6,
        "Seven": 7,
        "Ten or more": 10,
        "Unknown": np.nan,
    }
}
master.replace(to_replace=conv_dict, inplace=True)

In [36]:
# Verify the conversion: list the distinct values now present in the column
master['How.many.of.the.children.are.living.with.someone.else.for.these.reasons'].unique()

array([  0.,   4.,   1.,   3.,   5.,  nan,   2.,  10.,   6.,   7.])

In [37]:
# Inspect the categories before converting this column to numbers
master['For.how.many.children.has.the.client.lost.parental.rights'].unique()

array(['No children/no child protection involvement', 'Three', 'One',
       'Two', nan, 'Four', 'Five', 'Ten or more'], dtype=object)

In [38]:
#Mapping the spelled-out counts of this one column onto numbers
conv_dict = {
    'For.how.many.children.has.the.client.lost.parental.rights': {
        "No children/no child protection involvement": 0,
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5,
        "Six": 6,
        "Seven": 7,
        "Ten or more": 10,
        "Unknown": np.nan,
    }
}
master.replace(to_replace=conv_dict, inplace=True)

In [39]:
# Verify the conversion: list the distinct values now present in the column
master['For.how.many.children.has.the.client.lost.parental.rights'].unique()

array([  0.,   3.,   1.,   2.,  nan,   4.,   5.,  10.])

Looking at the columns - 'Hispanic.Ethnicity' and 'Client.Ethnicity'

In [40]:
# Show the two ethnicity columns side by side
master[['Hispanic.Ethnicity','Client.Ethnicity']]

Unnamed: 0,Hispanic.Ethnicity,Client.Ethnicity
0,Not of Hispanic Origin,Not Entered
1,Not of Hispanic Origin,Not Entered
2,Not of Hispanic Origin,Not Hispanic or Latino
3,Not of Hispanic Origin,
5,Not of Hispanic Origin,
6,Not of Hispanic Origin,
7,Not of Hispanic Origin,
9,Not of Hispanic Origin,
10,Not of Hispanic Origin,
11,Not of Hispanic Origin,Not Hispanic or Latino


Looking at the columns, since 'Hispanic.Ethnicity' seems to have more information than 'Client.Ethnicity' let's retain the first column

In [41]:
# Keep 'Hispanic.Ethnicity' and remove the sparser ethnicity column
master.drop(labels=['Client.Ethnicity'], axis='columns', inplace=True)

Looking at the Substance abuse problem at Admission and Discharge:

In [42]:
# Distinct substance labels: the discharge column is printed, the primary-problem
# column is shown as the cell result
print(master['Substance.abuse.problem.at.Discharge'].unique())
master['Primary.substance.abuse.problem'].unique()

['Alcohol' 'Other Opiates/Synthetics' nan 'Marijuana/Hashish' 'Crack'
 'Methamphetamine' 'Cocaine powder' 'Heroin' 'Cocaine power'
 'Benzodiazepines' 'Other Sedative/Hypnotic/Anxiolytic' 'PCP'
 'Other Stimulants' 'Other' 'Inhalants' 'Other Amphetamines']


array(['Alcohol', 'Marijuana/Hashish', 'Other Opiates/Synthetics',
       'Heroin', 'Crack', 'Methamphetamine', 'Cocaine power',
       'Non-prescription Methadone', 'Benzodiazepines', 'Cocaine powder',
       'PCP', 'Other'], dtype=object)

We can see that there are two entries - 'Cocaine pow*d*er' & 'Cocaine power' which has clearly been misspelt, Let's correct this: 

In [43]:
#Harmonising the misspelt 'Cocaine power' label in both substance columns
for substance_col in ('Substance.abuse.problem.at.Discharge', 'Primary.substance.abuse.problem'):
    misspelt = master[substance_col].eq('Cocaine power')
    master.loc[misspelt, substance_col] = 'Cocaine powder'

#Re-list the categories to confirm the fix
print(master['Substance.abuse.problem.at.Discharge'].unique())
master['Primary.substance.abuse.problem'].unique()

['Alcohol' 'Other Opiates/Synthetics' nan 'Marijuana/Hashish' 'Crack'
 'Methamphetamine' 'Cocaine powder' 'Heroin' 'Benzodiazepines'
 'Other Sedative/Hypnotic/Anxiolytic' 'PCP' 'Other Stimulants' 'Other'
 'Inhalants' 'Other Amphetamines']


array(['Alcohol', 'Marijuana/Hashish', 'Other Opiates/Synthetics',
       'Heroin', 'Crack', 'Methamphetamine', 'Cocaine powder',
       'Non-prescription Methadone', 'Benzodiazepines', 'PCP', 'Other'], dtype=object)

In [44]:
# Column labels remaining after the clean-up so far, as a plain Python list
master.columns.tolist()

['Form.Date',
 'Staff.admission',
 'Client.Number',
 'Age.of.first.use.(Primary)',
 'Age.of.first.use.(Secondary)',
 'Age.of.first.use.(Tertiary)',
 'Are.any.children.living.with.someone.else.due.to.CPS.court.order.or.other.action',
 'CHSR.Dimension.1.admission',
 'CHSR.Dimension.2.admission',
 'CHSR.Dimension.3.admission',
 'CHSR.Dimension.4.admission',
 'CHSR.Dimension.5.admission',
 'CHSR.Dimension.6.admission',
 'Client.currently.involved.with.CPS',
 'Current.CD.Treatment.admission',
 'Current.labor.status',
 'Current.marital.status',
 'Currently.enrolled.in.school.or.job.training.program',
 'Currently.smoke.cigarettes',
 'Currently.under.the.jurisdiction.of.the.court.or.on.probation.parole.of',
 'Did.drug.court.refer',
 'Does.client.have.children',
 'Does.this.involvement.result.from',
 "Driver's.license.revoked.due.to.DWI",
 'Education',
 'For.how.many.children.has.the.client.lost.parental.rights',
 'Hispanic.Ethnicity',
 'History.of.injection.drug.use',
 'How.many.children',
 'H

Looking at where client has been living for the past 30 days columns:

In [45]:
# Distinct living-situation labels in the two differently worded columns
# (stored here under the names Living_adm and Living_dis)
Living_adm = master['Where.has.client.been.living.in.past.30.days'].unique()
Living_dis = master['Where.client.has.been.living.in.past.30.days'].unique()
print(Living_adm)
print(Living_dis)

[ 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homless - no fixed address (includes shelters)'
 'Children living with their family'
 'Independent living - including on own, self supported and non-supervised group homes'
 nan]
[ 'Independent living - including on own, self supported and non-supervised group homes '
 nan
 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homeless - no fixed address (includes shelters)'
 'Children living with their family']


In [46]:
#Normalising the living-situation labels so that the two columns use the same categories

# 'Where.client.has...' column: remove stray leading/trailing whitespace
# (e.g. 'Independent living - ... group homes '). .str.strip() leaves missing values
# as NaN, so no str() conversion and no 'nan' -> NaN repair step is needed.
master['Where.client.has.been.living.in.past.30.days'] = (
    master['Where.client.has.been.living.in.past.30.days'].str.strip()
)

# 'Where.has.client...' column: correct the misspelt 'Homless' label
master.loc[master['Where.has.client.been.living.in.past.30.days'] == 'Homless - no fixed address (includes shelters)',
           'Where.has.client.been.living.in.past.30.days'] = 'Homeless - no fixed address (includes shelters)'

Looking at how many self help programs client has attended in the past 30 days:

In [47]:
# Distinct self-help attendance labels in the two differently worded columns
# (stored here under the names self_adm and self_dis)
self_adm = master['Number.of.times.at.self.help.programs.in.past.30.days.(eg..AA,.NA)'].unique()
self_dis = master['Number.of.self.help.programs.attended.in.past.30.days'].unique()
print(self_adm)
print(self_dis)

['16-30 times past month (0ver 3 times per week)' 'No attendance'
 '1-3 times past month (less than once per week)'
 '4-7 times past month (once per week)'
 '8-15 times past month (2 or 3 times per week)'
 'Some attendance, but frequency unknown' nan]
['Some attendance, but frequency unknown '
 '1-3 times past month (less than once per week)'
 '8-15 times past month (2 or 3 times per week)' nan
 '4-7 times past month (once per week)' 'No attendance'
 '16-30 times past month (0ver 3 times per week)']


In [48]:
# Remove stray leading/trailing whitespace from the labels
# (e.g. 'Some attendance, but frequency unknown '). .str.strip() leaves missing values
# as NaN, so no str() conversion and no 'nan' -> NaN repair step is needed.
master['Number.of.self.help.programs.attended.in.past.30.days'] = (
    master['Number.of.self.help.programs.attended.in.past.30.days'].str.strip()
)

In [49]:
# Re-list the labels of both columns to confirm the whitespace clean-up
self_adm = master['Number.of.times.at.self.help.programs.in.past.30.days.(eg..AA,.NA)'].unique()
self_dis = master['Number.of.self.help.programs.attended.in.past.30.days'].unique()
print(self_adm)
print(self_dis)

['16-30 times past month (0ver 3 times per week)' 'No attendance'
 '1-3 times past month (less than once per week)'
 '4-7 times past month (once per week)'
 '8-15 times past month (2 or 3 times per week)'
 'Some attendance, but frequency unknown' nan]
['Some attendance, but frequency unknown'
 '1-3 times past month (less than once per week)'
 '8-15 times past month (2 or 3 times per week)' nan
 '4-7 times past month (once per week)' 'No attendance'
 '16-30 times past month (0ver 3 times per week)']


Let's look at the age of first use columns:

In [50]:
# Distinct values of the three age-of-first-use columns (primary, secondary, tertiary)
print(master['Age.of.first.use.(Primary)'].unique())
print(master['Age.of.first.use.(Secondary)'].unique())
print(master[ 'Age.of.first.use.(Tertiary)'].unique())

[  1.30000000e+01   9.00000000e+00   1.60000000e+01   1.40000000e+01
   1.20000000e+01   4.00000000e+01   1.80000000e+01   1.70000000e+01
              nan   2.10000000e+01   3.90000000e+01   1.50000000e+01
   1.10000000e+01   3.20000000e+01   2.20000000e+01   1.90000000e+01
   3.70000000e+01   3.10000000e+01   2.50000000e+01   2.60000000e+01
   4.90000000e+01   2.80000000e+01   1.00000000e+01   4.80000000e+01
   2.90000000e+01   3.00000000e+01   2.70000000e+01   2.40000000e+01
   3.60000000e+01   2.30000000e+01   2.00000000e+01   4.40000000e+01
   4.70000000e+01   8.00000000e+00   4.20000000e+01   3.40000000e+01
   3.30000000e+01   7.00000000e+00   3.80000000e+01   4.30000000e+01
   5.30000000e+01   6.20000000e+01   5.00000000e+00   3.50000000e+01
   4.60000000e+01   1.00000000e+00   4.00000000e+00   6.00000000e+00
   5.00000000e+01   4.10000000e+01   1.45000000e+01   5.10000000e+01
   2.01400000e+03   3.00000000e+00   5.60000000e+01]
['19' '9' '0' '14' '23' '12' nan '13' '20' '18' '1

Looking at this we notice the first column is stored in a float format because of the NaN values. Let's change this. Also, we can see that the first two columns have an instance where age of first use = 2014! We should change this to NA. There is also a value 'n/a' in the second column.

In [51]:
age_cols = ['Age.of.first.use.(Primary)',
            'Age.of.first.use.(Secondary)',
            'Age.of.first.use.(Tertiary)']

# '16+' is a free-text entry in the Tertiary column; treat it as 16
master.loc[master['Age.of.first.use.(Tertiary)'] == '16+', 'Age.of.first.use.(Tertiary)'] = 16

#converting the columns to numeric (they end up as float64 because of the NaN values)
for age_col in age_cols:
    # lower-case 'n/a' is not covered by the earlier blanket replacement and would
    # make pd.to_numeric raise, so turn it into NaN first
    master[age_col] = pd.to_numeric(master[age_col].replace('n/a', np.nan))
    # a year (2014) was entered instead of an age; this is checked after the numeric
    # conversion so that a textual '2014' is caught too, and in every age column
    master.loc[master[age_col] == 2014, age_col] = np.nan

print(master['Age.of.first.use.(Primary)'].dtypes)
print(master['Age.of.first.use.(Secondary)'].dtypes)
print(master['Age.of.first.use.(Tertiary)'].dtypes)

float64
float64
float64


In [52]:
#looking at unique values of some columns at admission to spot implausible entries;
#each list is printed under its own heading
print('Lifetime arrests: \n',master['Number.of.arrests.in.lifetime'].unique())
print('Days used: \n',master['Number.of.days.used.in.past.30.(Secondary)'].unique())
print('Detox sessions used: \n',master['Number.of.lifetime.detoxification.admissions'].unique())
print('Years of schooling: \n',master['Years.of.schooling'].unique())

Lifetime arrests: 
 [   2.   nan    5.    3.   12.   25.    0.   15.   10.    6.    1.    8.
    7.    4.   50.   35.   30.   80.   13.    9.   65.   20.   47.   40.
  100.   60.   24.   75.  150.   16.   21.   90.   58.   70.  200.   11.
   18.   14.   27.   57.   95.  500.   37.   19.   23.   45.   43.   68.]
Days used: 
 [   0.   21.    2.   25.   14.   30.   nan    4.    8.   15.    1.   12.
    3.    7.    6.   27.    5.   20.   29.   10.   13.  156.   24.    9.
  258.   26.   23.   28.   16.   22.]
Detox sessions used: 
 [   2.    0.    1.    6.    3.   15.    7.    9.  999.   nan    5.   10.
   30.  120.   20.   50.    4.  180.   11.   25.   75.   70.   12.  150.
  300.]
Years of schooling: 
 ['13' '11' '12' '10' '14' '7' '16' '9' nan '15' '4' '125' '18' '8' '19'
 '8th']


There are some garbage-like values:
* **Lifetime arrests:** 100, 150, 200, 500
* **Days used:** 156 and 258 (more than the 30 days in the window).
* **Detox sessions:** 120, 150, 180, 300, 999
* **Years of schooling:** 125, 8th

Not too sure about how to handle these values.

In [53]:
# '8th' is a free-text entry; the assumption used here is that a student in
# grade 8 has completed 9 years of schooling on average
schooling = master['Years.of.schooling'].replace('8th', 9)
master['Years.of.schooling'] = pd.to_numeric(schooling)

print(master['Years.of.schooling'].dtypes)

float64


In [54]:
#Creating a new column flagging completion: 1 when the discharge reason is
#'Completed program' or 'Transferred to other program', 0 otherwise (including missing reasons).
#Vectorised with isin() instead of updating the frame row by row inside iterrows().
completed_reasons = ['Completed program', 'Transferred to other program']
master = master.assign(
    Completed = master['Reason.for.Discharge'].isin(completed_reasons).astype(int)
)

In [55]:
#Writing the edited file to disk (relative path, so it lands in the current working
#directory; with to_csv defaults the row index is written as the first column)
master.to_csv('Master_Edited.csv')

## Let us now look at how CHSR Dimensions change from admissions to discharge

In [56]:
#CHSR Dimension 1: number of clients per problem level, admission vs discharge

adm = master['CHSR.Dimension.1.admission'].value_counts()
dis = master['CHSR.Dimension.1'].value_counts()

# One table indexed by problem level with a column per time point
vals = pd.concat([adm.rename('Admission'), dis.rename('Discharge')], axis=1)

Discharge = go.Scatter(name='DISCHARGE', mode='lines+markers', x=vals.index, y=vals['Discharge'])
Admission = go.Scatter(name='ADMISSION', mode='lines+markers', x=vals.index, y=vals['Admission'])
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 1 - Withdrawal problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension1')

In [57]:
# Express the counts as a percentage of all clients in the frame
total_clients = len(master)
vals = vals.div(total_clients).mul(100)
vals

Unnamed: 0,Admission,Discharge
Extreme problem,,0.882724
Minor problem,25.851198,21.059269
Moderate problem,6.052963,12.862547
No problem,67.843632,49.306431
Serious problem,0.252207,4.539723
Unable to assess,,11.349306


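**CHSR Dimension 1: Acute intoxication / Withdrawal potential**  
We notice that the problems in Withdrawal tend to **increase** over the duration of the program.
At admission almost no clients had a serious problem (0.25%) and none had an extreme problem, but at discharge 4.5% have a serious and 0.9% an extreme problem. Further, the share of clients with No problem has dropped (68% → 49%), and the share of clients having a moderate problem has doubled (6% → 13%)! Note that about 11% of clients are 'Unable to assess' at discharge, a category that does not occur at admission, so part of the drop in 'No problem' may reflect missing assessments rather than worsening.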

In [58]:
#CHSR Dimension 2: number of clients per problem level, admission vs discharge

adm = master['CHSR.Dimension.2.admission'].value_counts()
dis = master['CHSR.Dimension.2'].value_counts()

# One table indexed by problem level with a column per time point
vals = pd.concat([adm.rename('Admission'), dis.rename('Discharge')], axis=1)

Discharge = go.Scatter(name='DISCHARGE', mode='lines+markers', x=vals.index, y=vals['Discharge'])
Admission = go.Scatter(name='ADMISSION', mode='lines+markers', x=vals.index, y=vals['Admission'])
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 2 - Biomedical problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension2')

In [59]:
# Express the counts as a percentage of all clients in the frame
total_clients = len(master)
vals = vals.div(total_clients).mul(100)
vals

Unnamed: 0,Admission,Discharge
Extreme problem,,0.504414
Minor problem,60.277427,44.262295
Moderate problem,5.422446,10.592686
No problem,34.047919,30.390921
Serious problem,0.252207,2.648172
Unable to assess,,11.601513


**CHSR Dimension 2: Biomedical conditions and complications**  
We notice that the problems in this CHSR Dimension tend to **increase** over the duration of the program.
We can see that there were very few clients with a serious problem and none with an extreme problem at admission, but there are a few clients with serious and extreme problems at discharge. Further, the share of clients having a moderate problem has doubled at discharge.

In [60]:
# CHSR Dimension 3: clients per problem level, admission vs. discharge

# Tally how many clients fall into each problem level at both time points
adm = master['CHSR.Dimension.3.admission'].value_counts()
dis = master['CHSR.Dimension.3'].value_counts()

# Align the two tallies on problem level, one column per time point
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=vals.index,
    y=vals['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=vals.index,
    y=vals['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 3 - Mental Health problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension3')

In [61]:
# Express the admission/discharge counts as a percentage of all clients.
# The table is rebuilt from the raw `adm`/`dis` counts instead of rescaling
# `vals` in place, so re-running this cell always gives the same result.
total_clients = master.shape[0]
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge']) / total_clients * 100
vals

Unnamed: 0,Admission,Discharge
Extreme problem,,2.143758
Minor problem,16.897856,21.311475
Moderate problem,75.91425,44.892812
No problem,6.68348,6.179067
Serious problem,0.504414,14.375788
Unable to assess,,11.0971


**CHSR Dimension 3: Mental Health Barriers/Problems**
We notice that the problems in Mental Health tend to **increase** over the duration of the program.
The number of clients having a serious problem has dramatically increased from 0.5% to 14%!

In [62]:
# CHSR Dimension 4: clients per problem level, admission vs. discharge

# Tally how many clients fall into each problem level at both time points
adm = master['CHSR.Dimension.4.admission'].value_counts()
dis = master['CHSR.Dimension.4'].value_counts()

# Align the two tallies on problem level, one column per time point
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=vals.index,
    y=vals['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=vals.index,
    y=vals['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 4 - Treatment Acceptance problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension4')

In [63]:
# Express the admission/discharge counts as a percentage of all clients.
# The table is rebuilt from the raw `adm`/`dis` counts instead of rescaling
# `vals` in place, so re-running this cell always gives the same result.
total_clients = master.shape[0]
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge']) / total_clients * 100
vals

Unnamed: 0,Admission,Discharge
Extreme problem,0.126103,5.422446
Minor problem,42.244641,18.15889
Moderate problem,45.018916,26.860025
No problem,9.457755,15.510719
Serious problem,3.152585,26.229508
Unable to assess,,7.818411


**CHSR Dimension 4: Treatment Acceptance**
We notice that the problems in Treatment Acceptance tend to **increase** over the duration of the program.
We can see that the number of clients with a moderate or minor problem has reduced, but there has been a drastic increase in the number of people having a serious problem, from 3.15% to **26.2%**!

In [64]:
# CHSR Dimension 5: clients per problem level, admission vs. discharge

# Tally how many clients fall into each problem level at both time points
adm = master['CHSR.Dimension.5.admission'].value_counts()
dis = master['CHSR.Dimension.5'].value_counts()

# Align the two tallies on problem level, one column per time point
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=vals.index,
    y=vals['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=vals.index,
    y=vals['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 5 - Relapse problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension5')

In [65]:
# Express the admission/discharge counts as a percentage of all clients.
# The table is rebuilt from the raw `adm`/`dis` counts instead of rescaling
# `vals` in place, so re-running this cell always gives the same result.
total_clients = master.shape[0]
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge']) / total_clients * 100
vals

Unnamed: 0,Admission,Discharge
Extreme problem,3.026482,19.293821
Minor problem,2.269861,11.601513
Moderate problem,49.684741,20.050441
No problem,2.269861,5.800757
Serious problem,42.749054,34.678436
Unable to assess,,8.575032


**CHSR Dimension 5: Relapse Problems**
We notice that the problems in Relapse tend to slightly decrease over the duration of the program.
We can see that the number of clients with a moderate or serious problem has reduced. There has been an increase in the number of people having an extreme problem from 3% to 19.29%, but there is also an increase in minor problem from 2% to 11%.

In [66]:
# CHSR Dimension 6: clients per problem level, admission vs. discharge

# Tally how many clients fall into each problem level at both time points
adm = master['CHSR.Dimension.6.admission'].value_counts()
dis = master['CHSR.Dimension.6'].value_counts()

# Align the two tallies on problem level, one column per time point
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=vals.index,
    y=vals['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=vals.index,
    y=vals['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

layout = {
    'title': 'Dimension 6 - Recovery Environment problems: Admission v/s Discharge',
    'xaxis': {'title': 'Problem Level'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Dimension6')

In [67]:
# Express the admission/discharge counts as a percentage of all clients.
# The table is rebuilt from the raw `adm`/`dis` counts instead of rescaling
# `vals` in place, so re-running this cell always gives the same result.
total_clients = master.shape[0]
vals = pd.concat([adm, dis], axis=1, keys=['Admission', 'Discharge']) / total_clients * 100
vals

Unnamed: 0,Admission,Discharge
Extreme problem,36.443884,25.220681
Minor problem,0.630517,7.440101
Moderate problem,8.827238,11.979823
No problem,2.017654,4.918033
Serious problem,52.080706,27.364439
Unable to assess,,23.076923


**CHSR Dimension 6: Recovery Environment Problems**
We notice that the problems in Recovery environment tend to clearly decrease over the duration of the program.
It is clear that the number of clients having extreme and serious problems reduces, and there is an increase in No problem, minor problem and moderate problem.

## Let us now look at how Labor Status of clients has changed from admission to discharge

In [68]:
# Labor force status: clients per status, admission vs. discharge

# Tally clients per status in the admission and discharge columns
LaborForce_Admission = master['Current.labor.status'].value_counts()
LaborForce_Discharge = master['Current.labor.force.status'].value_counts()

# Align the two tallies on status, one column per time point
LaborForce = pd.concat(
    [LaborForce_Admission, LaborForce_Discharge],
    axis=1,
    keys=['Admission', 'Discharge'],
)

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=LaborForce.index,
    y=LaborForce['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=LaborForce.index,
    y=LaborForce['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

# Fixed figure size with an enlarged bottom margin
layout = {
    'title': 'Current Labor Force Status: Admission v/s Discharge',
    'xaxis': {'title': 'Status'},
    'yaxis': {'title': 'Number of Clients'},
    'width': 900,
    'height': 500,
    'margin': go.Margin(b=200),
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Labor Force')


We can see that the number of people Not Looking for work has reduced by a great amount, and there is a slight increase in the number of people looking for work and those that are employed part-time!   
**NOTE:** However, it does seem like the data might not be giving the right picture, as the number of disabled people reduces over the program - which is absurd.

## Let us now look at how the Primary substance abuse changes

In [69]:
# Primary substance abuse problem: clients per substance, admission vs. discharge

# Tally clients per substance in the admission and discharge columns
Subs_adm = master['Primary.substance.abuse.problem'].value_counts()
Subs_dis = master['Substance.abuse.problem.at.Discharge'].value_counts()

# Align the two tallies on substance, one column per time point
Subs = pd.concat([Subs_adm, Subs_dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=Subs.index,
    y=Subs['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=Subs.index,
    y=Subs['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

# Enlarged bottom margin
layout = {
    'title': 'Primary substance abuse: Admission v/s Discharge',
    'xaxis': {'title': 'Substance Used'},
    'yaxis': {'title': 'Number of Clients'},
    'margin': go.Margin(b=150),
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Substance abuse')


Looking at the above comparison, we can see that there is not a significant difference between the primary substance abuse problem at admission and at discharge. However, there is a slight reduction in the case of Alcohol, Heroin, Methamphetamine and Marijuana/Hashish.

## Comparing enrollment in school or job training from admission to discharge:

In [70]:
# Distinct enrolment values in the admission and discharge columns,
# printed admission first
enroll_adm = master['Currently.enrolled.in.school.or.job.training.program'].unique()
enroll_dis = master['Enrolled.in.school.or.job.training'].unique()
print(enroll_adm, enroll_dis, sep='\n')

['Not enrolled' nan 'Enrolled, part time' 'Enrolled, full time']
['Not enrolled' nan 'Enrolled, full time ' 'Enrolled, part time ']


Looking at this, we can see that the discharge column has an extra space at the end of the values, so we need to strip the values of this space. Further, we need to replace the NaN values with 'Unknown'.

In [71]:
# Clean the discharge enrolment column: stringify every entry and strip
# surrounding whitespace, then relabel the resulting 'nan' strings (what
# str() produces for missing entries) as 'Unknown'.
enroll_dis_col = 'Enrolled.in.school.or.job.training'

master[enroll_dis_col] = master[enroll_dis_col].map(lambda val: str(val).strip())
master.loc[master[enroll_dis_col] == 'nan', enroll_dis_col] = 'Unknown'

In [72]:
# Re-check the distinct enrolment values after cleaning,
# printed admission first
enroll_adm = master['Currently.enrolled.in.school.or.job.training.program'].unique()
enroll_dis = master['Enrolled.in.school.or.job.training'].unique()
print(enroll_adm, enroll_dis, sep='\n')

['Not enrolled' nan 'Enrolled, part time' 'Enrolled, full time']
['Not enrolled' 'Unknown' 'Enrolled, full time' 'Enrolled, part time']


In [73]:
# Enrolment in school or job training: clients per status, admission vs. discharge

# Tally clients per status in the admission and discharge columns
enroll_adm = master['Currently.enrolled.in.school.or.job.training.program'].value_counts()
enroll_dis = master['Enrolled.in.school.or.job.training'].value_counts()

# Align the two tallies on status, one column per time point
Enroll = pd.concat([enroll_adm, enroll_dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=Enroll.index,
    y=Enroll['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=Enroll.index,
    y=Enroll['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

layout = {
    'title': 'Enrolled in school or job training: Admission v/s Discharge',
    'xaxis': {'title': 'Status'},
    'yaxis': {'title': 'Number of Clients'},
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Enrolled')

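Again, here we can see that there is not much of a difference between the number of clients enrolled at discharge and at admission. The number of clients Not Enrolled has reduced, while the number of clients with an Unknown status has increased. Note that only the discharge column had its missing values relabelled as 'Unknown'; missing admission values remain NaN and are not counted, so the Unknown category has no admission count to compare against.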

## Comparing where the client has been living in the past 30 days at Admission and Discharge:

In [74]:
# Distinct living-situation values in the admission and discharge columns,
# printed admission first
Living_dis = master['Where.client.has.been.living.in.past.30.days'].unique()
Living_adm = master['Where.has.client.been.living.in.past.30.days'].unique()
print(Living_adm, Living_dis, sep='\n')

[ 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homeless - no fixed address (includes shelters)'
 'Children living with their family'
 'Independent living - including on own, self supported and non-supervised group homes'
 nan]
[ 'Independent living - including on own, self supported and non-supervised group homes'
 nan
 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homeless - no fixed address (includes shelters)'
 'Children living with their family']


In [75]:
# Clean the living-situation columns for analysis
living_dis_col = 'Where.client.has.been.living.in.past.30.days'
living_adm_col = 'Where.has.client.been.living.in.past.30.days'

# Discharge: stringify and strip every entry, then relabel the resulting
# 'nan' strings (what str() produces for missing entries) as 'Unknown'
master[living_dis_col] = master[living_dis_col].map(lambda val: str(val).strip())
master.loc[master[living_dis_col] == 'nan', living_dis_col] = 'Unknown'

# Admission: correct the misspelled 'Homless' label
is_misspelled = master[living_adm_col] == 'Homless - no fixed address (includes shelters)'
master.loc[is_misspelled, living_adm_col] = 'Homeless - no fixed address (includes shelters)'

In [76]:
# Re-check the distinct living-situation values after cleaning,
# printed admission first
Living_dis = master['Where.client.has.been.living.in.past.30.days'].unique()
Living_adm = master['Where.has.client.been.living.in.past.30.days'].unique()
print(Living_adm, Living_dis, sep='\n')

[ 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homeless - no fixed address (includes shelters)'
 'Children living with their family'
 'Independent living - including on own, self supported and non-supervised group homes'
 nan]
[ 'Independent living - including on own, self supported and non-supervised group homes'
 'Unknown'
 'Dependent living - dependent children and/or adults living in a supervised setting'
 'Homeless - no fixed address (includes shelters)'
 'Children living with their family']


In [77]:
# Living situation in the past 30 days: clients per category, admission vs. discharge

# Tally clients per category in the admission and discharge columns
Living_dis = master['Where.client.has.been.living.in.past.30.days'].value_counts()
Living_adm = master['Where.has.client.been.living.in.past.30.days'].value_counts()

# Align the two tallies on category, one column per time point
Living = pd.concat([Living_adm, Living_dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=Living.index,
    y=Living['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=Living.index,
    y=Living['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

# Fixed height with an enlarged bottom margin
layout = {
    'title': 'Living conditions in the past 30 days: Admission v/s Discharge',
    'xaxis': {'title': 'Status'},
    'yaxis': {'title': 'Number of Clients'},
    'margin': go.Margin(b=250),
    'height': 500,
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Living')

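Looking at this, we can see that the number of homeless people has reduced from admission to discharge! However, there has also been an increase in the number of unknown values! Note that 'Unknown' is only assigned in the discharge column; missing admission values remain NaN and are not counted.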

## Comparing Attendance at self help programs(external to Avivo) attended by clients at Admission v/s Discharge:

In [78]:
# Distinct self-help attendance values in the admission and discharge columns,
# printed admission first
self_dis = master['Number.of.self.help.programs.attended.in.past.30.days'].unique()
self_adm = master['Number.of.times.at.self.help.programs.in.past.30.days.(eg..AA,.NA)'].unique()
print(self_adm, self_dis, sep='\n')

['16-30 times past month (0ver 3 times per week)' 'No attendance'
 '1-3 times past month (less than once per week)'
 '4-7 times past month (once per week)'
 '8-15 times past month (2 or 3 times per week)'
 'Some attendance, but frequency unknown' nan]
['Some attendance, but frequency unknown'
 '1-3 times past month (less than once per week)'
 '8-15 times past month (2 or 3 times per week)' nan
 '4-7 times past month (once per week)' 'No attendance'
 '16-30 times past month (0ver 3 times per week)']


In [79]:
# Clean the discharge self-help column: stringify every entry and strip
# surrounding whitespace, then relabel the resulting 'nan' strings (what
# str() produces for missing entries) as 'Unknown'.
self_dis_col = 'Number.of.self.help.programs.attended.in.past.30.days'

master[self_dis_col] = master[self_dis_col].map(lambda val: str(val).strip())
master.loc[master[self_dis_col] == 'nan', self_dis_col] = 'Unknown'

In [80]:
# Re-check the distinct self-help attendance values after cleaning,
# printed admission first
self_dis = master['Number.of.self.help.programs.attended.in.past.30.days'].unique()
self_adm = master['Number.of.times.at.self.help.programs.in.past.30.days.(eg..AA,.NA)'].unique()
print(self_adm, self_dis, sep='\n')

['16-30 times past month (0ver 3 times per week)' 'No attendance'
 '1-3 times past month (less than once per week)'
 '4-7 times past month (once per week)'
 '8-15 times past month (2 or 3 times per week)'
 'Some attendance, but frequency unknown' nan]
['Some attendance, but frequency unknown'
 '1-3 times past month (less than once per week)'
 '8-15 times past month (2 or 3 times per week)' 'Unknown'
 '4-7 times past month (once per week)' 'No attendance'
 '16-30 times past month (0ver 3 times per week)']


In [81]:
# Self-help attendance in the past 30 days: clients per frequency band,
# admission vs. discharge

# Tally clients per frequency band in the admission and discharge columns
self_dis = master['Number.of.self.help.programs.attended.in.past.30.days'].value_counts()
self_adm = master['Number.of.times.at.self.help.programs.in.past.30.days.(eg..AA,.NA)'].value_counts()

# Align the two tallies on frequency band, one column per time point
Self_help = pd.concat([self_adm, self_dis], axis=1, keys=['Admission', 'Discharge'])

# One lines+markers trace per time point
Discharge = go.Scatter(
    x=Self_help.index,
    y=Self_help['Discharge'],
    mode='lines+markers',
    name='DISCHARGE',
)
Admission = go.Scatter(
    x=Self_help.index,
    y=Self_help['Admission'],
    mode='lines+markers',
    name='ADMISSION',
)
data = [Discharge, Admission]

# Enlarged bottom margin
layout = {
    'title': 'Number of self-help sessions attended in past 30 days: Admission v/s Discharge',
    'xaxis': {'title': 'Number of sessions'},
    'yaxis': {'title': 'Number of Clients'},
    'margin': go.Margin(b=200),
}

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Self help')

Looking at this plot, it is evident that there is some increase in attendance of self-help programs from Admission to Discharge. The number of clients not attending has drastically reduced, and attendance has slightly improved!