In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import sklearn.cluster.hierarchical as hclust
from sklearn import preprocessing
import seaborn as sns

In [None]:
# Read the college data; the path is relative to the notebook's working directory.
college_csv_path = '../input/College.csv'
df = pd.read_csv(college_csv_path)

In [None]:
# Print the (rows, columns) shape, then preview the first few records.
row_count, col_count = df.shape
print((row_count, col_count))
df.head()

<font size=5>**Features**</font>

Note that there's a categorical variable in our data - 'Private'. Categorical variables are tricky for clustering. Distance-based methods like k-means need numeric inputs, so you'd have to map the categories to numbers first. This can be intuitive for ordinal data, but for non-ordinal categorical variables, the numerical values you assign can shape the clusters in ways that don't reflect anything meaningful in the underlying data. 'Private' is a binary variable, yes or no, but mapping it to 0 or 1 would have an outsized impact on the clustering, since every point would sit all the way at the min or the max of this variable while the other variables are continuous. For now, we will disregard this variable.

In [None]:
# Clustering inputs: everything except the categorical 'Private' flag and the
# college-name column.
excluded_columns = ['Private', 'Unnamed: 0']
features = df.drop(columns=excluded_columns)

In [None]:
# Summary statistics of the raw (unscaled) features, one column per feature.
feature_summary = features.describe()
feature_summary

**Normalization**

Note that the different features have very different ranges. If we don't normalize them, the columns with wider ranges will contribute disproportionately to the distances that separate the clusters.

In [None]:
# Learn each column's min and max, then rescale every feature with the fitted scaler.
scaler = preprocessing.MinMaxScaler().fit(features)
features_normal = scaler.transform(features)

In [None]:
# The scaler returns a bare array, so re-attach the feature names; otherwise the
# summary columns are labelled 0, 1, 2, ... and cannot be matched to their features.
pd.DataFrame(features_normal, columns=features.columns).describe()

Now all of our variables are scaled to be distributed between 0 and 1.

<font size=5>**K-Means Clustering**</font>

In [None]:
# Fixing random_state makes the centroid initialization, and therefore the cluster
# numbering that the discussion below refers to, reproducible across runs.
# n_init is spelled out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(features_normal)

In [None]:
# kmeans.labels_ is the label output of the KMeans fit above. Wrap it in a
# DataFrame so it can be concatenated column-wise onto the original data,
# then give the new column (auto-named 0) a readable name.
labels = pd.DataFrame(kmeans.labels_)
labeledColleges = pd.concat([df, labels], axis=1).rename(columns={0: 'labels'})

<font size=5>**Visualization**</font>
    
*(Nota bene: I'm plotting the original data in these visualizations, not their normalized scaled versions. We clustered based on the normalized data but I wanted to see how that translates to the colleges' actual stats)*

The original dataset had 18 features and we clustered on 17 of them. We have 5 clusters of points in 17-dimensional space, which is hard to visualize. If we only had two attributes, we could look at how the clusters separate like this:

In [None]:
# Scatter two features against each other, colored by cluster label;
# fit_reg=False turns off the regression fit.
sns.lmplot(
    data=labeledColleges,
    x='Top10perc',
    y='S.F.Ratio',
    hue='labels',
    fit_reg=False,
)

Here we plotted the Top 10 Percent column ("Pct. new students from top 10% of H.S. class") versus the Student/Faculty ratio column and color-coded each data point by the cluster to which it was assigned. You can get the sense that cluster 0 colleges seem to have higher fractions of their class coming from the top 10% of their high schools and lower student/faculty ratios. (Cluster numbers depend on the k-means initialization, so the specific numbers mentioned here may differ when you re-run the notebook.) The rest of the clusters aren't getting great separation with these two variables; we have 15 other variables contributing to the separation that we have to consider to get the full picture. We can't plot all 17 variables together on a plot like the one above. We could plot every variable against every other variable:

In [None]:
# Pairwise plot grid of the variables against each other, colored by cluster label.
sns.pairplot(data=labeledColleges, hue='labels')

This is nice for scanning by eye and seeing what variables give you nice separation and getting a sense for what happened in the clusters, but there's a lot going on and it's hard to get a quick answer to questions like "what features tend to define cluster 0? How about cluster 4?" Let's try visualizing each variable separately using strip plots and swarm plots.

In [None]:
# The strip/swarm plots need something on their x axis, so give every row the same
# placeholder value. The value itself is arbitrary.
labeledColleges = labeledColleges.assign(Constant="Data")

In [None]:
# Strip plot of a single feature, one point per row, colored by cluster label;
# jitter spreads the points horizontally.
sns.stripplot(
    data=labeledColleges,
    x='Constant',
    y='Top10perc',
    hue='labels',
    jitter=True,
)

This is a strip plot. Seaborn plots one data point for each row and we've color coded the points by the cluster to which they were assigned. Adding jitter fans out the points horizontally. In a strip plot, the points can overlap. In a swarm plot (below), the points cannot overlap.

In [None]:
# Swarm plot of the same feature, again colored by cluster label.
sns.swarmplot(
    data=labeledColleges,
    x='Constant',
    y='Top10perc',
    hue='labels',
)

In both of these plots you can see that cluster 1 and 3 colleges have a smaller fraction of their incoming classes drawn from the top 10 percent of their high schools. Clusters 0 and 2 have a larger fraction. 4 seems to be distributed in the middle. Great! Let's look at all the features. 

In [None]:
# Frame used for the per-feature plots. It starts from labeledColleges rather than
# `features` because it needs the cluster labels and the constant x-axis column;
# the college name and the 'Private' variable are not plotted.
plotData = labeledColleges.drop(columns=['Private', 'Unnamed: 0'])

In [None]:
# One strip plot per feature, laid out on a grid and colored by cluster label.
grid_cols = 5
# 'labels' and 'Constant' are plotting helpers, not features, so select by name
# instead of relying on them being the last two columns.
feature_cols = [c for c in plotData.columns if c not in ('labels', 'Constant')]
# Ceiling division: just enough rows to hold every feature (at least one row).
grid_rows = max(1, -(-len(feature_cols) // grid_cols))

f, axes = plt.subplots(grid_rows, grid_cols, figsize=(20, 25), sharex=False, squeeze=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)  # spread the panels apart so they don't crowd each other

# axes.flat walks the grid row by row, so the i-th feature lands at (i // grid_cols, i % grid_cols).
for ax, col in zip(axes.flat, feature_cols):
    # y is passed as a bare array; the panel title identifies the feature.
    sns.stripplot(x=plotData['Constant'], y=plotData[col].values, hue=plotData['labels'], jitter=True, ax=ax)
    ax.set_title(col)

# Hide the leftover grid cells instead of showing empty frames.
for unused_ax in axes.flat[len(feature_cols):]:
    unused_ax.set_visible(False)

In [None]:
# One swarm plot per feature, laid out on a grid and colored by cluster label.
grid_cols = 5
# 'labels' and 'Constant' are plotting helpers, not features, so select by name
# instead of relying on them being the last two columns.
feature_cols = [c for c in plotData.columns if c not in ('labels', 'Constant')]
# Ceiling division: just enough rows to hold every feature (at least one row).
grid_rows = max(1, -(-len(feature_cols) // grid_cols))

f, axes = plt.subplots(grid_rows, grid_cols, figsize=(20, 25), sharex=False, squeeze=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)  # spread the panels apart so they don't crowd each other

# axes.flat walks the grid row by row, so the i-th feature lands at (i // grid_cols, i % grid_cols).
for ax, col in zip(axes.flat, feature_cols):
    # y is passed as a bare array; the panel title identifies the feature.
    sns.swarmplot(x=plotData['Constant'], y=plotData[col].values, hue=plotData['labels'], ax=ax)
    ax.set_title(col)

# Hide the leftover grid cells instead of showing empty frames.
for unused_ax in axes.flat[len(feature_cols):]:
    unused_ax.set_visible(False)