## Hamilton Depression Scale

In [None]:
#This is a 17-item clinician-rated scale (the HRSD is scored by an interviewer, not self-reported); each item is an integer from 0-4 (some items only allow for a max of 2)
#Of all of the available datasets, I think this makes the most sense to start with.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Raise pandas' display limits so long and wide frames are not truncated when shown.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
%matplotlib inline

In [None]:
# Location of the HRSD data file, relative to the notebook's working directory.
hrsd01_filename = "Data/hrsd01.txt"

In [None]:
# Load the file unmodified: tab-separated, first line used as the header.
hrsd01 = pd.read_csv(hrsd01_filename, sep='\t')

## Basic visualization stuff

In [None]:
# Preview the first three rows of the raw table.
hrsd01.head(n=3)

In [None]:
# Preview the last three rows of the raw table.
hrsd01.tail(n=3)

In [None]:
# Number of axes (dimensions) of the raw table.
hrsd01.ndim

In [None]:
# (number of rows, number of columns) of the raw table.
hrsd01.shape

In [None]:
# Total number of cells in the raw table (rows x columns).
hrsd01.size

## Making a workable matrix

In [None]:
# Re-read the same file, this time skipping file line 1 (zero-based), i.e. the line
# immediately after the header, so that I don't get that concatenation error.
hrsd02 = pd.read_csv(hrsd01_filename, sep='\t', skiprows=[1])

In [None]:
# Preview the first three rows of the re-read table.
hrsd02.head(n=3)

In [None]:
# Column labels of the raw table (hrsd01).
hrsd01.columns

In [None]:
# How many distinct values appear in src_subject_id.
np.unique(hrsd02['src_subject_id']).size

In [None]:
# (number of rows, number of columns) of the re-read table.
hrsd02.shape

In [None]:
# Look at the last ten rows of the re-read table.
hrsd02.tail(n=10)

In [None]:
# Keep only the rows with no missing value in any column, then preview the result.
hrsd_dropped = hrsd02.dropna(axis=0, how='any')
hrsd_dropped.head(n=3)

In [None]:
# interview_age for every row whose src_subject_id equals 17.
hrsd02.loc[hrsd02['src_subject_id'] == 17, 'interview_age']

In [None]:
# Histogram of the per-subject mean of the `hsuic` column.
(
    hrsd02
    .groupby('src_subject_id')['hsuic']
    .mean()
    .hist()
)

In [None]:
# Histogram of the `hdtot_r` column, using 100 evenly spaced bin edges from 0 to 50.
hrsd02['hdtot_r'].hist(bins=np.linspace(0.0, 50.0, num=100))

In [None]:
# Let's check that we know what kind of object hrsd02 is before going further.
type(hrsd02)

In [None]:
#I could also inspect the data type of each column with the following code; this also reveals subarrays
#hrsd02.dtypes

In [None]:
#Reminder: np.array accepts the following parameters: np.array(hrsd02, dtype=None, copy=True, order='K', subok=False, ndmin=0)

In [None]:
# (number of rows, number of columns) of the re-read table.
hrsd02.shape

## Refining the matrix

In [None]:
#At this point, I think what I need to do is create a tuple that contains only the datapoints I care about.
#The most obvious way to do this would be to write a comma-separated tuple of indices pointing to individual values
    #But that seems terrible. Is there any other way to do it?

In [None]:
#I think what we want to do is slice the subarrays in a stepwise fashion (e.g., p. 44 of the Python Data Science Handbook)

In [None]:
# Column subset of hrsd02.
# NOTE(review): the original selection was left empty (`hrsd02[]`, a syntax error). The
# 'hsoin':'hsex' label range is borrowed from the `.loc` cell that follows -- confirm that
# this is the intended subset.
# The result gets its own variable: assigning to `hrsd02.sub` would shadow DataFrame.sub()
# on this object instead of creating a new table.
hrsd02_sub = hrsd02.loc[:, 'hsoin':'hsex']
# Show a preview rather than printing the whole frame.
hrsd02_sub.head()

In [None]:
# Take every row, but only the columns from 'hsoin' through 'hsex'
# (label slices with .loc include both endpoints).
hrsd_new = hrsd02.loc[:, slice('hsoin', 'hsex')]
# Discard any row that has a missing value in one of those columns.
hrsd_drop = hrsd_new.dropna(axis=0, how='any')
hrsd_drop.shape

In [None]:
# Rows for subject 17, restricted to the columns at positions 13 through 28.
# The rows are picked with a boolean mask via .loc, then the columns by position via .iloc.
# `Index.any()` only reduces the index to a single True/False, so it cannot select rows,
# and .iloc does not accept a boolean Series as a row mask.
hrsd02.loc[hrsd02['src_subject_id'] == 17].iloc[:, 13:29]

In [None]:
# Row index of the re-read table.
hrsd02.index

## Let's make a quick scatterplot

In [None]:
# Adapted from http://stamfordresearch.com/k-means-clustering-in-python/
# Each plotting input is a one-column DataFrame whose column carries a readable label.
x1 = hrsd02.hpanx.to_frame(name='Psychic Anxiety')
y1 = hrsd02.hdtot_r.to_frame(name='Ham-D Total Score')

x2 = hrsd02.hmdsd.to_frame(name='Depressed Mood')
y2 = hrsd02.hdtot_r.to_frame(name='Ham-D Total Score')

In [None]:
# Two side-by-side scatterplots of a single item score against the Ham-D total score.
# Uses the explicit Axes interface so each panel is configured on its own object, and
# labels both axes from the column names already attached to the input frames.
fig, (ax_anxiety, ax_mood) = plt.subplots(1, 2, figsize=(14, 7))

ax_anxiety.scatter(x1, y1)
ax_anxiety.set(
    title='Psychic Anxiety vs Total Score',
    xlabel=x1.columns[0],
    ylabel=y1.columns[0],
)

ax_mood.scatter(x2, y2)
ax_mood.set(
    title='Depressed Mood vs Total Score',
    xlabel=x2.columns[0],
    ylabel=y2.columns[0],
);  # trailing semicolon hides the return value in the cell output

## K-means

Now I'm getting way ahead of myself! This can stay here for later.

In [None]:
# Again, from http://stamfordresearch.com/k-means-clustering-in-python/
# K Means Cluster
# NOTE(review): pandas has no `KMeans` attribute, so `pd.KMeans` raises AttributeError.
# The referenced tutorial presumably uses scikit-learn's `sklearn.cluster.KMeans` --
# confirm, and import it before running this cell.
model = pd.KMeans(n_clusters=4)
# NOTE(review): `x` is not defined in any cell visible here (only x1/x2/y1/y2 are) --
# confirm which frame is meant to be clustered.
model.fit(x)