## Hamilton Depression Scale

In [None]:
#This is a 17-item self-report scale; each item is an integer from 0-4 (some items only allow for a max of 2)
#Of all of the available datasets, I think this makes the most sense to start with.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Let pandas render up to 999 rows and 999 columns before it truncates a displayed frame.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
%matplotlib inline

In [2]:
# Location of the HRSD data file, relative to the notebook's working directory.
# It is read with pd.read_table in the cells below.
hrsd01_filename = 'Data/hrsd01.txt'

In [3]:
# Load the file exactly as it is on disk. read_csv with a tab separator is
# what read_table does by default.
hrsd01 = pd.read_csv(hrsd01_filename, sep='\t')

## Basic visualization stuff

In [4]:
# Show the first three rows of the raw read.
hrsd01.head(3)

In [5]:
# Show the last three rows of the raw read.
hrsd01.tail(3)

In [6]:
# Number of axes of the frame.
hrsd01.ndim

In [7]:
# (number of rows, number of columns) of the raw read.
hrsd01.shape

In [8]:
# Total number of cells in the raw read (rows x columns).
hrsd01.size

## Making a workable matrix

In [9]:
# Re-read the same file, this time dropping line index 1 (the line directly
# after the header row; skiprows is 0-indexed) so that I don't get that
# concatenation error.
hrsd02 = pd.read_csv(hrsd01_filename, sep='\t', skiprows=[1])

In [10]:
# First three rows of the frame that was re-read with skiprows=[1].
hrsd02.head(3)

In [11]:
# Column labels. Note this inspects the raw read (hrsd01), not hrsd02.
hrsd01.columns

In [12]:
# How many distinct subject ids occur in the table.
np.unique(hrsd02['src_subject_id']).size

In [71]:
# Largest subject id that occurs in the table.
np.unique(hrsd02['src_subject_id']).max()

In [13]:
# (number of rows, number of columns) of hrsd02.
hrsd02.shape

In [15]:
# Last three rows of hrsd02.
hrsd02.tail(3)

In [16]:
# Keep only the rows that have no missing value in any column
# (axis=0 / how='any' are dropna's defaults, spelled out here).
hrsd_dropped = hrsd02.dropna(axis=0, how='any')
hrsd_dropped.head(3)

In [17]:
# interview_age values on every row that belongs to subject id 17.
hrsd02.loc[hrsd02['src_subject_id'] == 17, 'interview_age']

In [33]:
# interview_age values on every row that belongs to subject id 200.
hrsd02.loc[hrsd02['src_subject_id'] == 200, 'interview_age']

In [66]:
# Largest number of rows recorded for any single patient id.
np.bincount(hrsd02['src_subject_id']).max()

In [67]:
# Smallest number of rows recorded for any patient id.
# np.bincount has a slot for every integer from 0 up to the largest id, so an
# id that never occurs contributes a count of 0 here.
np.bincount(hrsd02['src_subject_id']).min()

In [65]:
# Histogram of rows-per-id: entry k is the number of bincount slots (ids)
# that have exactly k rows.
np.bincount(np.bincount(hrsd02['src_subject_id']))

In [82]:
# Count how many rows fall under each `level` value.
# np.bincount only accepts non-negative integers, while `level` appears to hold
# string labels (a later cell filters on level == 'Enrollment'), so the counting
# is done with value_counts instead. dropna=False keeps missing levels visible.
hrsd02['level'].value_counts(dropna=False)

In [68]:
# Sanity check: the histogram of rows-per-id must add up to the number of
# bincount slots (compare with the length computed in the next cell).
np.bincount(np.bincount(hrsd02['src_subject_id'])).sum()

In [70]:
# Number of bincount slots, i.e. the largest subject id plus one (slot 0 included).
np.bincount(hrsd02['src_subject_id']).size

In [56]:
# Report whether at least one subject id appears on exactly one row.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2; the bare `print "..."` statement is a SyntaxError on Python 3.
if 1 in np.bincount(hrsd02.src_subject_id):
    print("min 1")

In [72]:
# Sorted list of the distinct values found in the `level` column.
np.unique(hrsd02['level'])

In [58]:
# Average number of rows per bincount slot. Ids that never occur count as 0
# and pull this average down.
np.bincount(hrsd02['src_subject_id']).mean()

In [59]:
# Every row (all columns) recorded for subject id 1.
hrsd02.loc[hrsd02['src_subject_id'] == 1]

In [18]:
# Histogram of each subject's mean value in the `hsuic` column.
(
    hrsd02
    .groupby('src_subject_id')['hsuic']
    .mean()
    .hist()
)

In [19]:
# Histogram of the `hdtot_r` column. The 100 evenly spaced edges from 0 to 50
# give 99 bins.
hrsd02['hdtot_r'].hist(bins=np.linspace(0.0, 50.0, num=100))

In [20]:
#let's check to make sure we know what the data structure is
#(pd.read_table returns a DataFrame by default, so that is what this should show)
type(hrsd02)

In [None]:
#I could also inspect the data type of each column with the following code; this also reveals subarrays
#hrsd02.dtypes

In [None]:
#Reminder: np.array accepts the following arguments: np.array(hrsd02, dtype=None, copy=True, order='K', subok=False, ndmin=0)

In [21]:
# Re-check (number of rows, number of columns) of hrsd02 before subsetting it.
hrsd02.shape

## Refining the matrix

In [22]:
#At this point, I think what I need to do is create a tuple that contains only the datapoints I care about.
#The most obvious way to do this would be to write a comma-separated tuple of indices pointing to individual values
    #But that seems terrible. Is there any other way to do it?

In [23]:
#I think what we want to do is slice the subarrays in a stepwise fashion (eg pg 44 of Python Data Science Handbook)

In [27]:
#array slicing here

In [28]:
# Keep every row but only the columns from 'hsoin' through 'hsex'
# (label slices in .loc include both endpoints).
hrsd_new = hrsd02.loc[:, 'hsoin':'hsex']
# Then discard any row with a missing value in those columns and see what is left.
hrsd_drop = hrsd_new.dropna(axis=0, how='any')
hrsd_drop.shape

In [60]:
# 'hsoin'..'hsex' columns for the rows whose `level` equals 'Enrollment'.
is_enrollment = hrsd02['level'] == 'Enrollment'
hbase = hrsd02.loc[is_enrollment, 'hsoin':'hsex']
hbase.head(10)

In [62]:
# (number of rows, number of columns) of the 'Enrollment' subset.
hbase.shape

In [32]:
# Row index of hrsd02.
hrsd02.index

In [None]:
# 'hsoin'..'hsex' columns for the rows that belong to subject id 7.
is_subject_7 = hrsd02['src_subject_id'] == 7
hdata = hrsd02.loc[is_subject_7, 'hsoin':'hsex']
hdata.head(10)

In [63]:
# 'hsoin'..'hsex' columns for the rows whose `days_baseline` is 0.
is_day_zero = hrsd02['days_baseline'] == 0
hbase2 = hrsd02.loc[is_day_zero, 'hsoin':'hsex']
hbase2.head(10)

In [64]:
# (number of rows, number of columns) of the days_baseline == 0 subset.
hbase2.shape

## Let's make a quick scatterplot

In [None]:
# Adapted from http://stamfordresearch.com/k-means-clustering-in-python/
# Wrap each column of interest in its own single-column DataFrame, labelled
# with a readable name instead of the raw column code.
x1 = pd.DataFrame({'Psychic Anxiety': hrsd02['hpanx']})
y1 = pd.DataFrame({'Ham-D Total Score': hrsd02['hdtot_r']})

# The second pair uses the same total-score column on the y side.
x2 = pd.DataFrame({'Depressed Mood': hrsd02['hmdsd']})
y2 = pd.DataFrame({'Ham-D Total Score': hrsd02['hdtot_r']})

In [None]:
# Two side-by-side scatter plots of one item column against the total-score
# column (x1/y1 and x2/y2 come from the previous cell).
# The explicit fig/axes interface is used, and each panel now carries axis
# labels taken from the frames' column names so the figure stands on its own.
fig, (ax_anxiety, ax_mood) = plt.subplots(1, 2, figsize=(14, 7))

ax_anxiety.scatter(x1, y1)
ax_anxiety.set(
    title='Psychic Anxiety vs Total Score',
    xlabel=x1.columns[0],
    ylabel=y1.columns[0],
)

ax_mood.scatter(x2, y2)
ax_mood.set(
    title='Depressed Mood vs Total Score',
    xlabel=x2.columns[0],
    ylabel=y2.columns[0],
);

## K-means

Now I'm getting way ahead of myself! This can stay here for later.

In [None]:
# Again, from http://stamfordresearch.com/k-means-clustering-in-python/
# K Means Cluster
# NOTE(review): pandas has no `KMeans` attribute, so `pd.KMeans` raises
# AttributeError as written. The estimator with this `n_clusters=` / `.fit()`
# interface is presumably scikit-learn's `sklearn.cluster.KMeans` -- confirm
# against the linked tutorial and import it before running this cell.
# NOTE(review): `x` is not defined in any cell shown above (only x1, y1, x2, y2
# are) -- decide which feature frame should be clustered.
model = pd.KMeans(n_clusters=4)
model.fit(x)