# Exploratory Data Analysis

When placed in a Metapack data package, this notebook loads the package and runs a variety of common EDA operations on the resource named by `resource_name`.


In [1]:
import matplotlib.pyplot as plt 
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
from IPython.display import display 

%matplotlib inline
sns.set_context('notebook')


In [2]:
# Open the Metapack package this notebook works on.
# NOTE(review): presumably open_source_package() locates the package that
# contains this notebook, as the introduction describes — confirm.
pkg = mp.jupyter.open_source_package()

# For testing and development
#pkg = mp.open_package('http://s3.amazonaws.com/library.metatab.org/cde.ca.gov-accountability_dashboard-2.zip')

# Bare last expression: the package object is rendered as the cell output.
pkg

In [3]:
# Name of the package resource that the later cells look up and load into `df`.
resource_name='r2r_2015_2018'

In [4]:
# Look up the resource by name; as the last expression of the cell, the
# resource object is rendered as the cell output.
pkg.resource(resource_name)

Header,Type,Description
rin,integer,
primary_key,integer,
date_occurred,datetime,
time_occurred,string,
location,string,
area_command,string,
nature_of_contact,string,
reason_desc,string,
r2r_level,integer,
master_subject_id,string,


In [5]:
# Shell out to the `mp` command line tool and capture its `config` report
# (installed package versions and available subcommands) in the notebook.
!mp config

Package        Version
-------------  ---------------------
metapack       0.9.22+g531202d.dirty
metatab        0.8.2
metatabdecl    1.9
rowgenerators  0.9.10
tableintuit    0.1.6

Subcommand    Package
------------  -----------------------------------
notebook      metapack-jupyter 0.0.7
github        metapack-github 0.0.2
config        metapack 0.9.22+g531202d.dirty
doc           metapack 0.9.22+g531202d.dirty
index         metapack 0.9.22+g531202d.dirty
info          metapack 0.9.22+g531202d.dirty
open          metapack 0.9.22+g531202d.dirty
run           metapack 0.9.22+g531202d.dirty
search        metapack 0.9.22+g531202d.dirty
wp            metapack-wp 0.0.11+g12b14a3.dirty
build         metapack-build 1.1.3+g2ccae7d.dirty
colmap        metapack-build 1.1.3+g2ccae7d.dirty
edit          metapack-build 1.1.3+g2ccae7d.dirty
make          metapack-build 1.1.3+g2ccae7d.dirty
mk            metapack-build 1.1.3+g2ccae7d.dirty
new           metapack-build 1.1.3+g2ccae7d.dirty
s3         

In [6]:
# Load the IPython autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
# Debug aid: print the type of the object returned for this resource.
print(type(pkg.resource(resource_name)))
# Load the resource into `df`, the DataFrame every later cell works on.
# NOTE(review): parse_dates=True presumably converts date columns to
# datetimes — confirm against the Metapack documentation.
# NOTE(review): the saved output of this cell shows a CellExecutionError and
# the following cells have no execution count; re-run from a fresh kernel.
df = pkg.resource(resource_name).dataframe(parse_dates=True)

<class 'metapack.terms.Resource'>


CellExecutionError: Errors executing noteboook. See notebook at /Users/eric/proj/data-projects/metatab-packages/policedata-collection/austintexas.gov-response_to_resistance/notebooks/Combine-errors.ipynb for details.


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Sort every column of `df` into exactly one bucket:
#   empty    - no non-null values at all
#   const    - exactly one distinct non-null value
#   datetime - datetime64 columns that are neither empty nor constant
#   number   - numeric columns that are neither empty nor constant
#   other    - everything that is left
# nunique() ignores nulls, so a column holding one value plus nulls is "const".
unique_counts = df.nunique()  # one pass over the frame instead of two per column

empty_col_names = list(unique_counts.index[unique_counts == 0])
const_col_names = list(unique_counts.index[unique_counts == 1])
ignore_cols = empty_col_names+const_col_names
ignored = set(ignore_cols)

# Empty/constant columns are excluded here as well as from the numeric list,
# so no column is reported (or counted) in two buckets.
dt_col_names = [cn for cn in df.select_dtypes(include=[np.datetime64]).columns if cn not in ignored]

number_col_names = [cn for cn in df.select_dtypes(include=[np.number]).columns if cn not in ignored]

# Set membership avoids rebuilding a concatenated list for every column tested.
classified = ignored.union(dt_col_names, number_col_names)
other_col_names = [cn for cn in df.columns if cn not in classified]

In [None]:
# One-column table ("count") with one row per column category, shown as the
# cell output.
pd.Series(
    {
        'empty': len(empty_col_names),
        'const': len(const_col_names),
        'datetime': len(dt_col_names),
        'number': len(number_col_names),
        'other': len(other_col_names),
    },
    name='count',
).to_frame()

# Constant Columns

In [None]:
# List the distinct rows of the constant columns, transposed so that each
# column becomes one row of the displayed table. Skipped when there are none.
if len(const_col_names) > 0:
    display(df.loc[:, const_col_names].drop_duplicates().transpose())

# Empty Columns

In [None]:
# List the distinct rows of the empty columns, transposed so that each column
# becomes one row of the displayed table. Skipped when there are none.
if len(empty_col_names) > 0:
    display(df.loc[:, empty_col_names].drop_duplicates().transpose())

# Date and Time Columns

In [None]:
# Summarize the datetime columns; skipped when there are none.
if dt_col_names:
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly: wrapping it in display() adds a stray "None" output.
    df[dt_col_names].info()
    display(df[dt_col_names].describe().T)

# Number Columns

In [None]:
# Summarize the numeric columns; skipped when there are none.
if number_col_names:
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly: wrapping it in display() adds a stray "None" output.
    df[number_col_names].info()
    display(df[number_col_names].describe().T)

## Distributions

In [None]:
def plot_histograms(df):
    """Draw a near-square grid of histograms with a KDE overlay, one panel per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Nulls are dropped from each column
        before plotting.

    Returns
    -------
    None
        The figure is left open for the notebook backend to render. Nothing
        is drawn when ``df`` has no columns.
    """
    col_names = list(df.columns)
    if not col_names:
        # No columns would mean a 0x0 grid and a zero-sized figure.
        return

    # Subplot grids need integer dimensions; np.ceil returns floats.
    n_cols = int(np.ceil(np.sqrt(len(col_names))))
    n_rows = int(np.ceil(len(col_names) / n_cols))

    # Create the whole grid up front so each panel has its own axes and no
    # extra full-figure axes is left underneath the panels.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(3 * n_cols, 3 * n_rows),
                             squeeze=False)
    panels = axes.ravel()

    for ax, col_name in zip(panels, col_names):
        values = df[col_name].dropna()
        try:
            if hasattr(sns, 'histplot'):
                # distplot is deprecated in newer seaborn releases;
                # stat='density' keeps the density scale distplot used.
                sns.histplot(values, kde=True, stat='density', ax=ax)
            else:
                # Older seaborn without histplot.
                sns.distplot(values, kde=True, ax=ax)
        except Exception as exc:
            # Best effort: one bad column must not abort the grid, but the
            # failure is shown in its panel instead of vanishing silently.
            ax.text(0.5, 0.5,
                    'plot failed:\n{}'.format(type(exc).__name__),
                    ha='center', va='center', transform=ax.transAxes)
        # Tick labels are hidden; only the shape of each distribution matters.
        ax.tick_params(labelbottom=False, labelleft=False)

    # Hide grid cells left over when the column count does not fill the grid.
    for ax in panels[len(col_names):]:
        ax.set_visible(False)

    fig.tight_layout()
        


In [None]:
# Skip the plot when no numeric columns were found, matching the guard used
# by the numeric summary cell; an empty selection cannot be laid out on a grid.
if number_col_names:
    plot_histograms(df[number_col_names])

## Box Plots

In [None]:
def plot_boxes(df):
    """Draw a near-square grid of vertical box plots, one panel per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Nulls are dropped from each column
        before plotting.

    Returns
    -------
    None
        The figure is left open for the notebook backend to render. Nothing
        is drawn when ``df`` has no columns.
    """
    col_names = list(df.columns)
    if not col_names:
        # No columns would mean a 0x0 grid and a zero-sized figure.
        return

    # Subplot grids need integer dimensions; np.ceil returns floats.
    n_cols = int(np.ceil(np.sqrt(len(col_names))))
    n_rows = int(np.ceil(len(col_names) / n_cols))

    # Create the whole grid up front so each panel has its own axes and no
    # extra full-figure axes is left underneath the panels.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(2 * n_cols, 5 * n_rows),
                             squeeze=False)
    panels = axes.ravel()

    for ax, col_name in zip(panels, col_names):
        values = df[col_name].dropna()
        try:
            # Passing the data as `y` gives a vertical box regardless of how
            # the installed seaborn interprets a positional argument.
            sns.boxplot(y=values, ax=ax)
        except Exception as exc:
            # Best effort: one bad column must not abort the grid, but the
            # failure is shown in its panel instead of vanishing silently.
            ax.text(0.5, 0.5,
                    'plot failed:\n{}'.format(type(exc).__name__),
                    ha='center', va='center', transform=ax.transAxes)

    # Hide grid cells left over when the column count does not fill the grid.
    for ax in panels[len(col_names):]:
        ax.set_visible(False)

    fig.tight_layout()
  

In [None]:
  
# Skip the plot when no numeric columns were found, matching the guard used
# by the numeric summary cell; an empty selection cannot be laid out on a grid.
if number_col_names:
    plot_boxes(df[number_col_names])

In [None]:
## Correlations

In [None]:

# Pairwise correlation matrix of the numeric columns.
cm = df[number_col_names].corr()

# Boolean mask that hides the diagonal and upper triangle, which duplicate
# the lower triangle of the symmetric matrix. The builtin `bool` is used
# because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(cm, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# With no numeric columns the figure would have zero size, so draw nothing.
if number_col_names:
    plt.figure(figsize=(.5*len(number_col_names),.5*len(number_col_names)))
    sns.heatmap(cm, mask=mask, cmap = 'viridis')

# Other Columns

In [None]:
# Summarize the remaining (non-datetime, non-numeric) columns; skipped when
# there are none.
if other_col_names:
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly: wrapping it in display() adds a stray "None" output.
    df[other_col_names].info()
    display(df[other_col_names].describe().T)

# Nulls


In [None]:
# Columns shown in the null map: every classified column except the empty
# and constant ones.
cols = dt_col_names + number_col_names + other_col_names

# The figure height scales with the number of columns, so with no columns it
# would be zero; draw nothing in that case.
if cols:
    fig, ax = plt.subplots(figsize=(15,.5*len(cols)))
    # One heatmap row per column and one cell per record; a cell is marked
    # where the value is null.
    sns.heatmap(df[cols].isnull().T,cbar=False,xticklabels=False,cmap = 'viridis', ax=ax )