In [1]:
import pandas as pd
import numpy as np

import altair as alt
import matplotlib.pyplot as plt

In this lab, we use the data of communities and crime (https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime) which includes some attributes of communities.

In the original data folder, there are two data files: `communities.data` with all the vectors, and `communities.names` containing the description of the dataset, as well as column names and some metadata.


In [4]:
data = pd.read_csv(filepath_or_buffer='./communities.data', header=None, index_col=None).values
data.shape

(1994, 128)

In [3]:
columns = [
    "state numeric",
"county numeric",
"community numeric",
"communityname string",
"fold numeric",
"population numeric",
"householdsize numeric",
"racepctblack numeric",
"racePctWhite numeric",
"racePctAsian numeric",
"racePctHisp numeric",
"agePct12t21 numeric",
"agePct12t29 numeric",
"agePct16t24 numeric",
"agePct65up numeric",
"numbUrban numeric",
"pctUrban numeric",
"medIncome numeric",
"pctWWage numeric",
"pctWFarmSelf numeric",
"pctWInvInc numeric",
"pctWSocSec numeric",
"pctWPubAsst numeric",
"pctWRetire numeric",
"medFamInc numeric",
"perCapInc numeric",
"whitePerCap numeric",
"blackPerCap numeric",
"indianPerCap numeric",
"AsianPerCap numeric",
"OtherPerCap numeric",
"HispPerCap numeric",
"NumUnderPov numeric",
"PctPopUnderPov numeric",
"PctLess9thGrade numeric",
"PctNotHSGrad numeric",
"PctBSorMore numeric",
"PctUnemployed numeric",
"PctEmploy numeric",
"PctEmplManu numeric",
"PctEmplProfServ numeric",
"PctOccupManu numeric",
"PctOccupMgmtProf numeric",
"MalePctDivorce numeric",
"MalePctNevMarr numeric",
"FemalePctDiv numeric",
"TotalPctDiv numeric",
"PersPerFam numeric",
"PctFam2Par numeric",
"PctKids2Par numeric",
"PctYoungKids2Par numeric",
"PctTeen2Par numeric",
"PctWorkMomYoungKids numeric",
"PctWorkMom numeric",
"NumIlleg numeric",
"PctIlleg numeric",
"NumImmig numeric",
"PctImmigRecent numeric",
"PctImmigRec5 numeric",
"PctImmigRec8 numeric",
"PctImmigRec10 numeric",
"PctRecentImmig numeric",
"PctRecImmig5 numeric",
"PctRecImmig8 numeric",
"PctRecImmig10 numeric",
"PctSpeakEnglOnly numeric",
"PctNotSpeakEnglWell numeric",
"PctLargHouseFam numeric",
"PctLargHouseOccup numeric",
"PersPerOccupHous numeric",
"PersPerOwnOccHous numeric",
"PersPerRentOccHous numeric",
"PctPersOwnOccup numeric",
"PctPersDenseHous numeric",
"PctHousLess3BR numeric",
"MedNumBR numeric",
"HousVacant numeric",
"PctHousOccup numeric",
"PctHousOwnOcc numeric",
"PctVacantBoarded numeric",
"PctVacMore6Mos numeric",
"MedYrHousBuilt numeric",
"PctHousNoPhone numeric",
"PctWOFullPlumb numeric",
"OwnOccLowQuart numeric",
"OwnOccMedVal numeric",
"OwnOccHiQuart numeric",
"RentLowQ numeric",
"RentMedian numeric",
"RentHighQ numeric",
"MedRent numeric",
"MedRentPctHousInc numeric",
"MedOwnCostPctInc numeric",
"MedOwnCostPctIncNoMtg numeric",
"NumInShelters numeric",
"NumStreet numeric",
"PctForeignBorn numeric",
"PctBornSameState numeric",
"PctSameHouse85 numeric",
"PctSameCity85 numeric",
"PctSameState85 numeric",
"LemasSwornFT numeric",
"LemasSwFTPerPop numeric",
"LemasSwFTFieldOps numeric",
"LemasSwFTFieldPerPop numeric",
"LemasTotalReq numeric",
"LemasTotReqPerPop numeric",
"PolicReqPerOffic numeric",
"PolicPerPop numeric",
"RacialMatchCommPol numeric",
"PctPolicWhite numeric",
"PctPolicBlack numeric",
"PctPolicHisp numeric",
"PctPolicAsian numeric",
"PctPolicMinor numeric",
"OfficAssgnDrugUnits numeric",
"NumKindsDrugsSeiz numeric",
"PolicAveOTWorked numeric",
"LandArea numeric",
"PopDens numeric",
"PctUsePubTrans numeric",
"PolicCars numeric",
"PolicOperBudg numeric",
"LemasPctPolicOnPatr numeric",
"LemasGangUnitDeploy numeric",
"LemasPctOfficDrugUn numeric",
"PolicBudgPerPop numeric",
"ViolentCrimesPerPop numeric",
]

In [5]:
df = pd.DataFrame(data=data, columns=columns)
df.head()

Unnamed: 0,state numeric,county numeric,community numeric,communityname string,fold numeric,population numeric,householdsize numeric,racepctblack numeric,racePctWhite numeric,racePctAsian numeric,...,LandArea numeric,PopDens numeric,PctUsePubTrans numeric,PolicCars numeric,PolicOperBudg numeric,LemasPctPolicOnPatr numeric,LemasGangUnitDeploy numeric,LemasPctOfficDrugUn numeric,PolicBudgPerPop numeric,ViolentCrimesPerPop numeric
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [21]:
num_features = columns[5:]
num_features

['population numeric',
 'householdsize numeric',
 'racepctblack numeric',
 'racePctWhite numeric',
 'racePctAsian numeric',
 'racePctHisp numeric',
 'agePct12t21 numeric',
 'agePct12t29 numeric',
 'agePct16t24 numeric',
 'agePct65up numeric',
 'numbUrban numeric',
 'pctUrban numeric',
 'medIncome numeric',
 'pctWWage numeric',
 'pctWFarmSelf numeric',
 'pctWInvInc numeric',
 'pctWSocSec numeric',
 'pctWPubAsst numeric',
 'pctWRetire numeric',
 'medFamInc numeric',
 'perCapInc numeric',
 'whitePerCap numeric',
 'blackPerCap numeric',
 'indianPerCap numeric',
 'AsianPerCap numeric',
 'OtherPerCap numeric',
 'HispPerCap numeric',
 'NumUnderPov numeric',
 'PctPopUnderPov numeric',
 'PctLess9thGrade numeric',
 'PctNotHSGrad numeric',
 'PctBSorMore numeric',
 'PctUnemployed numeric',
 'PctEmploy numeric',
 'PctEmplManu numeric',
 'PctEmplProfServ numeric',
 'PctOccupManu numeric',
 'PctOccupMgmtProf numeric',
 'MalePctDivorce numeric',
 'MalePctNevMarr numeric',
 'FemalePctDiv numeric',
 'To

In [22]:
''' Here we simply convert non-numeric values into 0 '''
# this step change strings into NaN
processed = df[num_features].apply(pd.to_numeric, errors="coerce")

# then we change NaN to 0
processed = processed.replace(np.nan,0)
processed

Unnamed: 0,population numeric,householdsize numeric,racepctblack numeric,racePctWhite numeric,racePctAsian numeric,racePctHisp numeric,agePct12t21 numeric,agePct12t29 numeric,agePct16t24 numeric,agePct65up numeric,...,LandArea numeric,PopDens numeric,PctUsePubTrans numeric,PolicCars numeric,PolicOperBudg numeric,LemasPctPolicOnPatr numeric,LemasGangUnitDeploy numeric,LemasPctOfficDrugUn numeric,PolicBudgPerPop numeric,ViolentCrimesPerPop numeric
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.02,0.12,0.45,0.00,0.00,0.00,0.0,0.00,0.00,0.67
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.01,0.21,0.02,0.00,0.00,0.00,0.0,0.00,0.00,0.43
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.02,0.39,0.28,0.00,0.00,0.00,0.0,0.00,0.00,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.04,0.09,0.02,0.00,0.00,0.00,0.0,0.00,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,0.01,0.28,0.05,0.00,0.00,0.00,0.0,0.00,0.00,0.09
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,0.02,0.37,0.20,0.00,0.00,0.00,0.0,0.00,0.00,0.45
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19


### Questions:
1) what are the correlations between different attributes?

2) how do the attributes contribute to the outcome (ViolentCrimesPerPop)?

3) Are there any groups in the data? Any outliers?

## SPLOM / Scatterplot Matrix

In [6]:
col_to_check = ['agePct12t21 numeric', 'PctLess9thGrade numeric', 'PersPerFam numeric',
                'PctFam2Par numeric', 'PctWorkMom numeric']

In [60]:
alt.Chart(processed).mark_point(
    opacity=.2
).encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
).properties(
    width=100,
    height=100
).repeat(
    row=col_to_check,
    column=col_to_check
)

## Correlationship

In [30]:
corr = processed[num_features[:-1]].corr()

corr

Unnamed: 0,population numeric,householdsize numeric,racepctblack numeric,racePctWhite numeric,racePctAsian numeric,racePctHisp numeric,agePct12t21 numeric,agePct12t29 numeric,agePct16t24 numeric,agePct65up numeric,...,PolicAveOTWorked numeric,LandArea numeric,PopDens numeric,PctUsePubTrans numeric,PolicCars numeric,PolicOperBudg numeric,LemasPctPolicOnPatr numeric,LemasGangUnitDeploy numeric,LemasPctOfficDrugUn numeric,PolicBudgPerPop numeric
population numeric,1.000000,-0.046148,0.231178,-0.300845,0.181603,0.156218,0.006368,0.130344,0.075596,-0.102006,...,0.532969,0.713652,0.231897,0.270356,0.837546,0.797330,0.540904,0.471167,0.466352,0.410710
householdsize numeric,-0.046148,1.000000,-0.067109,-0.235907,0.201996,0.468659,0.520461,0.367338,0.295225,-0.612666,...,-0.058980,-0.015078,-0.004072,-0.051506,-0.086127,-0.053341,-0.100452,-0.073426,-0.094368,-0.113298
racepctblack numeric,0.231178,-0.067109,1.000000,-0.794389,-0.106738,-0.066581,0.122338,0.153475,0.134068,0.052934,...,0.178369,0.149758,0.095053,0.147023,0.255182,0.196044,0.231094,0.199190,0.260793,0.214650
racePctWhite numeric,-0.300845,-0.235907,-0.794389,1.000000,-0.270266,-0.444166,-0.194015,-0.266852,-0.183804,0.136483,...,-0.244530,-0.131389,-0.337458,-0.215636,-0.256943,-0.233994,-0.260874,-0.230091,-0.276234,-0.219685
racePctAsian numeric,0.181603,0.201996,-0.106738,-0.270266,1.000000,0.266743,-0.025020,0.100727,0.052761,-0.272020,...,0.174322,-0.001084,0.389944,0.296921,0.051910,0.112861,0.120840,0.128631,0.101888,0.077938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PolicOperBudg numeric,0.797330,-0.053341,0.196044,-0.233994,0.112861,0.117189,-0.020449,0.064460,0.033996,-0.032976,...,0.414180,0.529265,0.233281,0.272627,0.844864,1.000000,0.400359,0.331546,0.346562,0.487214
LemasPctPolicOnPatr numeric,0.540904,-0.100452,0.231094,-0.260874,0.120840,0.141579,-0.009625,0.116574,0.078563,-0.017710,...,0.741849,0.341831,0.263131,0.259561,0.512529,0.400359,1.000000,0.671129,0.857575,0.647339
LemasGangUnitDeploy numeric,0.471167,-0.073426,0.199190,-0.230091,0.128631,0.117990,0.000670,0.087242,0.054521,-0.044785,...,0.598143,0.348076,0.128368,0.128839,0.449205,0.331546,0.671129,1.000000,0.621957,0.509343
LemasPctOfficDrugUn numeric,0.466352,-0.094368,0.260793,-0.276234,0.101888,0.125353,0.001301,0.117290,0.083180,-0.010168,...,0.680129,0.299632,0.228084,0.206644,0.469950,0.346562,0.857575,0.621957,1.000000,0.672801


In [32]:
alt.data_transformers.enable(max_rows=20000)

DataTransformerRegistry.enable('default')

In [34]:
wide_form = corr.reset_index().rename(columns={'index':'attr1'})
to_plot = wide_form.melt('attr1', var_name='attr2', value_name='corr')

In [1]:
''' Visualization'''


'''What can you learn from it?'''



'What can you learn from it?'

## Relevance to outcome (ViolentCrimesPerPop)
- Regression Coefficent Estimate

In this example, we use the function `theilslopes()` from `scipy.stats` library. It gives you information of the median slope (coefficient) and intercept, as well as confidence interval of the slope.

Check the related information here: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.theilslopes.html. It also includes the comparison of the lines generated from `theilslopes()` and `linregress()` which uses the least-squares regression.


In [70]:
# the returned tuple contains: medslope, medintercept, lo_slope, up_slope
res

(1.0000000000000002, 0.13, 0.96, 1.1428571428571426)

In [73]:
coefficient = []

for col in num_features[:-1]:
    res = stats.theilslopes(target, processed[col], 0.95)
    coefficient.append([col, res[0], res[2], res[3]])
    
to_plot = pd.DataFrame(data=coefficient, columns=['attribute', 'med_slope', 'low_slope', 'up_slope'])
    

In [84]:
'''fill in the blanks'''

dot = alt.Chart(to_plot).mark_circle().encode(
    y= ???,
    x= ??? 
)

error = alt.Chart(to_plot).mark_rule().encode(
    y= ???,
    y2= ???,
    x= ???
)

dot + error

## Dimension Reduction

In [85]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### Principal Component Analysis (PCA)

In [97]:
''' initialize PCA '''
pca = PCA(n_components=2)

''' prepare the features for be analyszed '''
X = processed[num_features[:-1]].values

''' find the first two principal components '''
reduced_data = pca.fit_transform(X)


In [98]:
alt.Chart(pd.DataFrame(data=reduced_data, columns=['x', 'y'])).mark_point().encode(
    x='x:Q',
    y='y:Q'
)

### K-Means

In [99]:
from sklearn.cluster import KMeans

In [136]:
kmeans = KMeans(n_clusters=10)
kmeans.fit(X)
labels = kmeans.predict(X)

In [137]:
reduced_df = pd.DataFrame(data=reduced_data, columns=['x', 'y'])
reduced_df['label'] = labels
alt.Chart(reduced_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
    color='label:N'
)

## Other Dimensionality Reduction methods

### T-SNE

In [141]:
result_tsne = TSNE(n_components=2).fit_transform(X)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [142]:
tsne_df['label'] = labels
alt.Chart(tsne_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
    color='label:N'
)

### MDS

In [140]:
from sklearn.manifold import MDS

embedding = MDS(n_components=2)
result_mds = embedding.fit_transform(X)

mds_df = pd.DataFrame(data=result_mds, columns=['x','y'])
mds_df['label'] = labels

alt.Chart(mds_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [139]:
alt.Chart(mds_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
    color='label:N'
)

### color by ViolentCrimesPerPop

In [108]:
to_plot = pd.DataFrame(reduced_df[['x', 'y']])
to_plot['ViolentCrimesPerPop'] = processed['ViolentCrimesPerPop numeric']

In [132]:
alt.Chart(to_plot).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color=alt.Color('binned_crime:O', scale=alt.Scale(scheme='yellowgreenblue'))
).transform_bin(
    as_='binned_crime', 
    field='ViolentCrimesPerPop',
 )

What can we learn from the chart here?

The crime rate are distributed from lower to higher in the chart from left bottom parts to the right side of the chart.