## Hamilton Depression Scale

In [None]:
#This is a 17-item clinician-rated scale (scored by an interviewer, not self-report); each item is an integer from 0-4 (some items only allow for a max of 2)
#Of all of the available datasets, I think this makes the most sense to start with.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
%matplotlib inline

In [None]:
# Relative path to the HRSD data file (loaded as tab-separated text);
# it is resolved against the current working directory.
hrsd01_filename = 'Data/hrsd01.txt'

In [None]:
# Raw load: tab-separated text, with the first line used as the header.
hrsd01 = pd.read_csv(hrsd01_filename, sep='\t')

## Basic visualization stuff

In [None]:
# First three rows of the raw load.
hrsd01.head(3)

In [None]:
# Last three rows of the raw load.
hrsd01.tail(3)

In [None]:
# Number of axes of the raw frame.
hrsd01.ndim

In [None]:
# (n_rows, n_columns) of the raw frame.
hrsd01.shape

In [None]:
# Total number of cells (rows x columns) in the raw frame.
hrsd01.size

## Making a workable matrix

In [None]:
# Re-read the file, dropping physical line 1 (0-based), i.e. the line directly
# under the header row; the header itself (line 0) is kept.
# Author's note: leaving that line in led to a "concatenation" error.
# NOTE(review): presumably a non-data description row -- confirm against the raw file.
hrsd02 = pd.read_csv(hrsd01_filename, sep='\t', skiprows=[1])

In [None]:
# First three rows of the re-read frame (the line under the header is skipped).
hrsd02.head(3)

In [None]:
# Column labels of the working frame. hrsd02 is the frame the rest of this
# section uses; both loads take their labels from the same header line, so the
# labels are the same as hrsd01's.
hrsd02.columns

In [None]:
# Count of distinct subject IDs present in the data.
np.unique(hrsd02.src_subject_id).size

In [None]:
# Largest subject ID in use.
# Some subject ID numbers are never used, i.e. the IDs are non-contiguous.
np.unique(hrsd02.src_subject_id).max()

In [None]:
# (n_rows, n_columns) of the working frame.
hrsd02.shape

In [None]:
# Last three rows of the working frame.
hrsd02.tail(3)

In [None]:
# Let's check which container type we are actually working with.
# Reminder: np.array(hrsd02, dtype=None, copy=True, order='K', subok=False, ndmin=0)
# is the constructor call (shown with its keyword arguments) that would convert
# this object to a NumPy array; those keywords are arguments, not attributes.
type(hrsd02)

In [None]:
# Keep only rows that have no missing value in any column, then preview.
# Author's observation: every row has some missing values.
hrsd_dropped = hrsd02.dropna(axis=0, how='any')
hrsd_dropped.head(3)

In [None]:
# interview_age for every row belonging to subject 17.
hrsd02.loc[hrsd02.src_subject_id == 17, 'interview_age']

In [None]:
# interview_age for every row belonging to subject 200.
hrsd02.loc[hrsd02.src_subject_id == 200, 'interview_age']

In [None]:
# Largest number of rows recorded for any single subject ID.
# Author's hunch: some patients were followed for longer in the naturalistic
# follow-up phase.
np.bincount(hrsd02.src_subject_id).max()

In [None]:
# Smallest per-ID row count. np.bincount has one slot for every integer from 0
# up to the largest ID, so an ID that never appears contributes a 0 here.
np.bincount(hrsd02.src_subject_id).min()

In [None]:
np.bincount((np.bincount(hrsd02.src_subject_id)))
#Bincount of the bincount: entry k is the number of ID slots (integers 0..max ID)
#that have exactly k rows, so entry 0 counts the IDs that never appear.

In [None]:
# Total number of ID slots, one per integer from 0 through the largest ID.
# Author's note: aha! this explains the non-contiguous numbers -- 133 patients
# had zero observations (4174-133=4041).
np.bincount(np.bincount(hrsd02.src_subject_id)).sum()

In [None]:
# Double-check: the number of bincount slots (0 through the largest ID).
np.bincount(hrsd02.src_subject_id).size

In [None]:
# Distinct values found in the 'level' column (np.unique returns them sorted).
np.unique(hrsd02.level)

In [None]:
# Mean number of HAM-D rows per patient who actually appears in the data.
# value_counts() only has entries for IDs that occur. np.bincount would also
# include a zero-count slot for every unused integer ID (and for 0), which
# pulls the mean down.
hrsd02.src_subject_id.value_counts().mean()

In [None]:
# All rows for subject ID 10 -- the author's example of a dropped patient.
hrsd02.loc[hrsd02.src_subject_id == 10]

## A couple quick visualizations

In [None]:
# Distribution of each subject's mean 'hsuic' value across all of their rows.
# Title and axis labels are set so the figure can be read on its own; the
# trailing ';' hides the text repr of the .set() return value.
(
    hrsd02.groupby('src_subject_id')['hsuic']
    .mean()
    .hist()
    .set(
        title='Per-subject mean of hsuic',
        xlabel='Mean hsuic',
        ylabel='Number of subjects',
    )
);

In [None]:
# Histogram of the total score with one unit-wide bin per integer from 0 to 50.
# Bin edges sit on half-integers (-0.5, 0.5, ..., 50.5) so each integer score
# gets its own bar; fractional-width bins such as linspace(0, 50, 100) do not
# line up with integer scores and produce artificial gaps and spikes.
# NOTE(review): assumes hdtot_r is integer-valued -- confirm.
(
    hrsd02.hdtot_r
    .hist(bins=np.arange(0, 52) - 0.5)
    .set(
        title='Distribution of hdtot_r',
        xlabel='hdtot_r',
        ylabel='Number of rows',
    )
);

## Refining the matrix

In [None]:
#I think what we want to do is slice the subarrays in a stepwise fashion (eg pg 44 of Python Data Science Handbook)

In [None]:
# Keep only the block of columns from 'hsoin' through 'hsex' (label slices
# include both end points), then see how many rows have all of them present.
hrsd_new = hrsd02.loc[:, 'hsoin':'hsex']
hrsd_drop = hrsd_new.dropna(axis=0, how='any')
hrsd_drop.shape

In [None]:
# 'hsoin'..'hsex' columns for the rows whose 'level' is 'Enrollment'.
hbase = hrsd02.loc[hrsd02.level.eq('Enrollment'), 'hsoin':'hsex']
hbase.head(10)

In [None]:
# (n_rows, n_columns) of the 'Enrollment' subset.
hbase.shape

In [None]:
# Row index of the working frame.
hrsd02.index

In [None]:
# 'hsoin'..'hsex' columns for every row of subject 7.
hdata = hrsd02.loc[hrsd02.src_subject_id.eq(7), 'hsoin':'hsex']
hdata.head(10)

In [None]:
# 'hsoin'..'hsex' columns for the rows where days_baseline equals 0.
hbase2 = hrsd02.loc[hrsd02.days_baseline.eq(0), 'hsoin':'hsex']
hbase2.head(10)

In [None]:
hbase2.shape
#why is this different from enrollment?
#NOTE(review): hbase2 filters on days_baseline == 0 while hbase filters on
#level == 'Enrollment'; these are different columns, so the two row sets need
#not coincide -- worth cross-tabulating the two conditions to see where they differ.

In [None]:
# 'hsoin'..'hsex' columns for the rows whose 'level' is 'Level 1'.
hone = hrsd02.loc[hrsd02.level.eq('Level 1'), 'hsoin':'hsex']
hone.head(10)

In [None]:
# (n_rows, n_columns) of the 'Level 1' subset.
hone.shape

In [None]:
# 'hsoin'..'hsex' columns for the rows whose 'level' is 'Level 2'.
htwo = hrsd02.loc[hrsd02.level.eq('Level 2'), 'hsoin':'hsex']
htwo.head(10)

In [None]:
# (n_rows, n_columns) of the 'Level 2' subset.
htwo.shape

## Let's make a quick scatterplot

In [None]:
# Adapted from http://stamfordresearch.com/k-means-clustering-in-python/
# Wrap each variable of interest in a single-column DataFrame that carries a
# readable column name.
x1 = hrsd02.hpanx.to_frame(name='Psychic Anxiety')
y1 = hrsd02.hdtot_r.to_frame(name='Ham-D Total Score')

x2 = hrsd02.hmdsd.to_frame(name='Depressed Mood')
y2 = hrsd02.hdtot_r.to_frame(name='Ham-D Total Score')

In [None]:
# Two side-by-side scatter plots of a single item against the total score.
# Explicit Axes objects are used instead of the pyplot state machine, and both
# axes are labelled from the input frames' column names so each panel can be
# read on its own. The trailing ';' hides the text repr of the last call.
fig, (ax_anxiety, ax_mood) = plt.subplots(1, 2, figsize=(14, 7))

ax_anxiety.scatter(x1, y1)
ax_anxiety.set(
    title='Psychic Anxiety vs Total Score',
    xlabel=x1.columns[0],
    ylabel=y1.columns[0],
)

ax_mood.scatter(x2, y2)
ax_mood.set(
    title='Depressed Mood vs Total Score',
    xlabel=x2.columns[0],
    ylabel=y2.columns[0],
);

## K-means

Now I'm getting way ahead of myself! This can stay here for later.

In [None]:
# Again, from http://stamfordresearch.com/k-means-clustering-in-python/
# K Means Cluster
# NOTE(review): pandas does not provide KMeans, so this line raises
# AttributeError as written. The referenced tutorial presumably uses
# sklearn.cluster.KMeans -- confirm and import it before running this cell.
model = pd.KMeans(n_clusters=4)
# NOTE(review): `x` is not defined in the cells shown here (only x1/x2 and
# y1/y2 are); decide which feature frame is intended before fitting.
model.fit(x)