In [1]:
import pandas as pd
import numpy as np

import altair as alt
import matplotlib.pyplot as plt

In this lab, we use the Communities and Crime dataset (https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime), which includes a number of attributes describing communities.

In the original data folder, there are two data files: `communities.data` with all the vectors, and `communities.names` containing the description of the dataset, as well as column names and some metadata.


In [2]:
# Re-enable the active Altair data transformer with its `max_rows` option raised
# to 250000, so that DataFrames up to that size can be passed to `alt.Chart`.
alt.data_transformers.enable(max_rows=250000)

DataTransformerRegistry.enable('default')

In [3]:
# Column labels for `communities.data`, one per column and in column order.
# Each label is an attribute name followed by its declared type; the rest of the
# notebook uses these strings verbatim as DataFrame column names.
columns = [
    "state numeric",
    "county numeric",
    "community numeric",
    "communityname string",
    "fold numeric",
    "population numeric",
    "householdsize numeric",
    "racepctblack numeric",
    "racePctWhite numeric",
    "racePctAsian numeric",
    "racePctHisp numeric",
    "agePct12t21 numeric",
    "agePct12t29 numeric",
    "agePct16t24 numeric",
    "agePct65up numeric",
    "numbUrban numeric",
    "pctUrban numeric",
    "medIncome numeric",
    "pctWWage numeric",
    "pctWFarmSelf numeric",
    "pctWInvInc numeric",
    "pctWSocSec numeric",
    "pctWPubAsst numeric",
    "pctWRetire numeric",
    "medFamInc numeric",
    "perCapInc numeric",
    "whitePerCap numeric",
    "blackPerCap numeric",
    "indianPerCap numeric",
    "AsianPerCap numeric",
    "OtherPerCap numeric",
    "HispPerCap numeric",
    "NumUnderPov numeric",
    "PctPopUnderPov numeric",
    "PctLess9thGrade numeric",
    "PctNotHSGrad numeric",
    "PctBSorMore numeric",
    "PctUnemployed numeric",
    "PctEmploy numeric",
    "PctEmplManu numeric",
    "PctEmplProfServ numeric",
    "PctOccupManu numeric",
    "PctOccupMgmtProf numeric",
    "MalePctDivorce numeric",
    "MalePctNevMarr numeric",
    "FemalePctDiv numeric",
    "TotalPctDiv numeric",
    "PersPerFam numeric",
    "PctFam2Par numeric",
    "PctKids2Par numeric",
    "PctYoungKids2Par numeric",
    "PctTeen2Par numeric",
    "PctWorkMomYoungKids numeric",
    "PctWorkMom numeric",
    "NumIlleg numeric",
    "PctIlleg numeric",
    "NumImmig numeric",
    "PctImmigRecent numeric",
    "PctImmigRec5 numeric",
    "PctImmigRec8 numeric",
    "PctImmigRec10 numeric",
    "PctRecentImmig numeric",
    "PctRecImmig5 numeric",
    "PctRecImmig8 numeric",
    "PctRecImmig10 numeric",
    "PctSpeakEnglOnly numeric",
    "PctNotSpeakEnglWell numeric",
    "PctLargHouseFam numeric",
    "PctLargHouseOccup numeric",
    "PersPerOccupHous numeric",
    "PersPerOwnOccHous numeric",
    "PersPerRentOccHous numeric",
    "PctPersOwnOccup numeric",
    "PctPersDenseHous numeric",
    "PctHousLess3BR numeric",
    "MedNumBR numeric",
    "HousVacant numeric",
    "PctHousOccup numeric",
    "PctHousOwnOcc numeric",
    "PctVacantBoarded numeric",
    "PctVacMore6Mos numeric",
    "MedYrHousBuilt numeric",
    "PctHousNoPhone numeric",
    "PctWOFullPlumb numeric",
    "OwnOccLowQuart numeric",
    "OwnOccMedVal numeric",
    "OwnOccHiQuart numeric",
    "RentLowQ numeric",
    "RentMedian numeric",
    "RentHighQ numeric",
    "MedRent numeric",
    "MedRentPctHousInc numeric",
    "MedOwnCostPctInc numeric",
    "MedOwnCostPctIncNoMtg numeric",
    "NumInShelters numeric",
    "NumStreet numeric",
    "PctForeignBorn numeric",
    "PctBornSameState numeric",
    "PctSameHouse85 numeric",
    "PctSameCity85 numeric",
    "PctSameState85 numeric",
    "LemasSwornFT numeric",
    "LemasSwFTPerPop numeric",
    "LemasSwFTFieldOps numeric",
    "LemasSwFTFieldPerPop numeric",
    "LemasTotalReq numeric",
    "LemasTotReqPerPop numeric",
    "PolicReqPerOffic numeric",
    "PolicPerPop numeric",
    "RacialMatchCommPol numeric",
    "PctPolicWhite numeric",
    "PctPolicBlack numeric",
    "PctPolicHisp numeric",
    "PctPolicAsian numeric",
    "PctPolicMinor numeric",
    "OfficAssgnDrugUnits numeric",
    "NumKindsDrugsSeiz numeric",
    "PolicAveOTWorked numeric",
    "LandArea numeric",
    "PopDens numeric",
    "PctUsePubTrans numeric",
    "PolicCars numeric",
    "PolicOperBudg numeric",
    "LemasPctPolicOnPatr numeric",
    "LemasGangUnitDeploy numeric",
    "LemasPctOfficDrugUn numeric",
    "PolicBudgPerPop numeric",
    "ViolentCrimesPerPop numeric",
]

# The file is read with `header=None` (no header row), so the labels are passed
# via `names`. Loading straight into a DataFrame lets pandas infer a dtype per
# column; the previous round trip through an object ndarray (`.values` followed
# by `pd.DataFrame`) turned every column into `object`.
df = pd.read_csv(
    filepath_or_buffer='./communities.data',
    header=None,
    index_col=None,
    names=columns,
)

# Raw table as an ndarray, kept so that code referring to `data` keeps working.
data = df.values

In [4]:
# Everything after the first five entries of `columns` is treated as a
# numeric feature.
num_features = columns[5:]

# Columns selected for the dimensionality-reduction and clustering cells.
col_to_analysis = [
    'NumIlleg numeric',
    'LemasSwornFT numeric',
    'PolicOperBudg numeric',
    'OfficAssgnDrugUnits numeric',
    'NumStreet numeric',
    'LemasTotalReq numeric',
    'NumUnderPov numeric',
    'NumInShelters numeric',
    'PolicCars numeric',
    'NumImmig numeric',
    'PctPolicHisp numeric',
    'HousVacant numeric',
    'PolicBudgPerPop numeric',
    'PctPolicBlack numeric',
    'LemasSwFTPerPop numeric',
]

In [5]:
# Convert every feature column to a numeric dtype. Entries that cannot be
# parsed as numbers are coerced to NaN (`errors="coerce"`) and then replaced
# by 0. This is a deliberately simple treatment of non-numeric values.
processed = (
    df[num_features]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# Lab starts here

## Dimensionality Reduction

We demonstrate the usage of PCA, t-SNE, and MDS in this lab.

Their documentation can be found here:
- PCA: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

- t-SNE: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

- MDS: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html

In [6]:
# ## Dimensionality Reduction
#
# We demonstrate the usage of PCA, t-SNE, and MDS in this lab.
#
# Their documentation can be found here:
# - PCA: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
#
# - t-SNE: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
#
# - MDS: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS

### Principal Component Analysis (PCA)

In [7]:
# Initialize PCA, keeping two components.
pca = PCA(n_components=2)

# Feature matrix to be analyzed: the selected columns as a NumPy array.
X = processed[col_to_analysis].values

# Fit PCA on X and project every row onto the first two principal components.
reduced_data = pca.fit_transform(X)


In [8]:
# Scatter plot of the 2-D PCA projection (first component on x, second on y).
alt.Chart(
    pd.DataFrame(data=reduced_data, columns=['x', 'y'])
).mark_point().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
)

In [10]:
# PCA coordinates together with the violent-crime value of each row
# (the Series is aligned on the row index).
pca_df = pd.DataFrame(data=reduced_data, columns=['x', 'y']).assign(
    crime_rate=processed['ViolentCrimesPerPop numeric']
)

# Same scatter plot, colored by the binned crime rate.
(
    alt.Chart(pca_df)
    .mark_circle()
    .encode(
        x='x:Q',
        y='y:Q',
        color=alt.Color(
            'binned_crime:O',
            scale=alt.Scale(scheme='yellowgreenblue'),
        ),
    )
    .transform_bin(field='crime_rate', as_='binned_crime')
)

### T-SNE

In [11]:
# t-SNE is stochastic: without a fixed `random_state` every run produces a
# different embedding. Seeding it keeps the chart reproducible on re-run.
result_tsne = TSNE(n_components=2, random_state=0).fit_transform(X)

# 2-D embedding as a DataFrame for plotting.
tsne_df = pd.DataFrame(data=result_tsne, columns=['x', 'y'])

alt.Chart(tsne_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [12]:
# Attach the violent-crime value of each row to the t-SNE coordinates.
tsne_df = tsne_df.assign(crime_rate=processed['ViolentCrimesPerPop numeric'])

# t-SNE scatter plot colored by the binned crime rate.
crime_bins = alt.Color('binned_crime:O', scale=alt.Scale(scheme='yellowgreenblue'))
alt.Chart(tsne_df).mark_circle().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    crime_bins,
).transform_bin(
    field='crime_rate',
    as_='binned_crime',
)

### MDS

In [13]:
# MDS starts from a random configuration, so the result changes between runs
# unless `random_state` is fixed. Seed it for a reproducible embedding.
embedding = MDS(n_components=2, random_state=0)
result_mds = embedding.fit_transform(X)

# 2-D embedding as a DataFrame for plotting.
mds_df = pd.DataFrame(data=result_mds, columns=['x', 'y'])

alt.Chart(mds_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [14]:
# Attach the violent-crime value of each row to the MDS coordinates.
mds_df = mds_df.assign(crime_rate=processed['ViolentCrimesPerPop numeric'])

# MDS scatter plot colored by the binned crime rate.
(
    alt.Chart(mds_df)
    .mark_circle()
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        color=alt.Color(
            'binned_crime:O',
            scale=alt.Scale(scheme='yellowgreenblue'),
        ),
    )
    .transform_bin(field='crime_rate', as_='binned_crime')
)

## Practice
Try to play with different parameters of t-SNE and see how the results change 

## Scaling

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix: learn the per-column statistics from X and
# apply them to X in a single step.
scaler = StandardScaler()
scaled = scaler.fit_transform(X)

In [16]:
# PCA again, this time on the standardized features.
result_pca2 = PCA(n_components=2).fit_transform(scaled)
pca_df2 = pd.DataFrame(result_pca2, columns=['x', 'y'])

# Scatter plot of the projection of the scaled data.
alt.Chart(pca_df2).mark_point().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
)

In [17]:
# Attach the violent-crime value of each row to the scaled-PCA coordinates.
pca_df2 = pca_df2.assign(crime_rate=processed['ViolentCrimesPerPop numeric'])

# Scaled-PCA scatter plot colored by the binned crime rate.
crime_scale = alt.Scale(scheme='yellowgreenblue')
alt.Chart(pca_df2).mark_circle().encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Color('binned_crime:O', scale=crime_scale),
).transform_bin(
    field='crime_rate',
    as_='binned_crime',
)

In [18]:
# t-SNE on the standardized features. `random_state` is fixed because t-SNE is
# stochastic and would otherwise give a different embedding on every run.
result_tsne2 = TSNE(n_components=2, random_state=0).fit_transform(scaled)
tsne_df2 = pd.DataFrame(data=result_tsne2, columns=['x', 'y'])

alt.Chart(tsne_df2).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

### Distance Function

We use t-SNE as an example here: change the value of `metric` to choose a different distance function.

Please note that, for MDS, the parameter `metric` means something different. You will need to change `dissimilarity` if you want to use a precomputed distance matrix.

In [42]:
# t-SNE using cosine distance between rows of X. `random_state` is fixed so
# that differences between the metric variants are not confounded by run-to-run
# randomness.
result_tsne3 = TSNE(n_components=2, metric='cosine', random_state=0).fit_transform(X)
tsne_df3 = pd.DataFrame(data=result_tsne3, columns=['x', 'y'])

alt.Chart(tsne_df3).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [40]:
# t-SNE using Manhattan distance between rows of X. `random_state` is fixed so
# that differences between the metric variants are not confounded by run-to-run
# randomness.
result_tsne4 = TSNE(n_components=2, metric='manhattan', random_state=0).fit_transform(X)

tsne_df4 = pd.DataFrame(data=result_tsne4, columns=['x', 'y'])

alt.Chart(tsne_df4).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [39]:
# t-SNE using Euclidean distance between rows of X, seeded for reproducibility.
# NOTE: this rebinds `result_tsne` and `tsne_df`, which the first t-SNE cell
# also defines; a `crime_rate` column added to the earlier `tsne_df` is not
# carried over to this new frame.
result_tsne = TSNE(n_components=2, metric='euclidean', random_state=0).fit_transform(X)
tsne_df = pd.DataFrame(data=result_tsne, columns=['x', 'y'])

alt.Chart(tsne_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [19]:
# Attach the violent-crime value of each row to the scaled t-SNE coordinates.
tsne_df2 = tsne_df2.assign(crime_rate=processed['ViolentCrimesPerPop numeric'])

# Scaled t-SNE scatter plot colored by the binned crime rate.
(
    alt.Chart(tsne_df2)
    .mark_circle()
    .encode(
        alt.X('x:Q'),
        alt.Y('y:Q'),
        alt.Color('binned_crime:O', scale=alt.Scale(scheme='yellowgreenblue')),
    )
    .transform_bin(field='crime_rate', as_='binned_crime')
)

## Clustering

### K-Means

In [20]:
from sklearn.cluster import KMeans

In [21]:
# K-Means with three clusters. Centroid initialization is random, so
# `random_state` is fixed to make the cluster assignment reproducible.
kmeans = KMeans(n_clusters=3, random_state=0)

# Fit on X and get the cluster index of every row in one step.
labels = kmeans.fit_predict(X)

## Practice

Show the instances with their clustering labels in a scatterplot. Also, please check how the original values are distributed across the different clusters.

Hint:
- you can show the mean/median vectors of different clusters in a parallel coordinates plot.

## Hierarchical Clustering

We demonstrate the result for Agglomerative Clustering today.

Agglomerative Clustering recursively merges the pair of clusters that minimally increases a given linkage distance.

In [26]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering of the rows of X into three clusters.
clustering = AgglomerativeClustering(n_clusters=3)
clustering.fit(X)

# Cluster labels stored on the fitted estimator.
hcluster_labels = clustering.labels_