In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import shutil
from scipy.stats import chisquare

In [5]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

### Unzipping the Delaware file, because it is one of the smallest and easiest to work with

In [6]:
# Decompress the gzipped Delaware audience file into a plain CSV in the same
# folder. Both handles are managed by one `with`, so they are closed even if
# the copy fails part-way through.
with gzip.open('Data/Audience_DE.gz', 'rb') as compressed, \
        open('Data/Audience_DE.csv', 'wb') as extracted:
    shutil.copyfileobj(compressed, extracted)

In [7]:
# Load the decompressed Delaware audience CSV and display it.
# NOTE(review): the 'Unnamed: 0.1' and 'Unnamed: 0' columns in the output
# mirror the row index, so they look like index columns written out by earlier
# saves of this file — confirm before relying on them.
data_de = pd.read_csv('Data/Audience_DE.csv')
data_de

Unnamed: 0.1,Unnamed: 0,ZIP,GENDER,AGE,STATE_ABBREVIATION,DMA_NAME_ACXIOM,DEMO_HH_INCOME,ETHNICITY,HOMEOWNERSHIP_STATUS,EDUCATION
0,0,19808,F,63.0,DE,Philadelphia,"Less than $30,000",White,Homeowner,not_reported
1,1,19720,M,,DE,Philadelphia,"Less than $30,000",White,Renter,not_reported
2,2,19934,M,65.0,DE,Philadelphia,"$30,000 to $49,999",White,Homeowner,College
3,3,19804,F,67.0,DE,Philadelphia,"$75,000 to $99,999",White,Homeowner,not_reported
4,4,19968,M,73.0,DE,Salisbury,"$50,000 to $74,999",White,Homeowner,High School
...,...,...,...,...,...,...,...,...,...,...
824481,824481,19720,,,DE,Philadelphia,"$100,000 to $149,999",Asian,Homeowner,not_reported
824482,824482,19702,F,53.0,DE,Philadelphia,"$100,000 to $149,999",White,Homeowner,High School
824483,824483,19803,F,81.0,DE,Philadelphia,"$50,000 to $74,999",White,Homeowner,not_reported
824484,824484,19808,M,51.0,DE,Philadelphia,"Less than $30,000",African American,Renter,not_reported


In [8]:
# Rewrite the income-bracket labels that contain two dollar signs into
# dollar-free labels. 'Less than $30,000' and '$250,000 +' are left untouched.
# NOTE(review): presumably this keeps '$...$' pairs from being rendered as math
# in chart text — confirm.
data_de = data_de.replace(
    to_replace={
        '$30,000 to $49,999': '30,000 to 49,999',
        '$50,000 to $74,999': '50,000 to 74,999',
        '$75,000 to $99,999': '75,000 to 99,999',
        '$100,000 to $149,999': '100,000 to 149,999',
        '$150,000 to $199,999': '150,000 to 199,999',
        '$200,000 to $249,999': '200,000 to 249,999',
    }
)


In [9]:
# Subset of rows whose household income is in the top bracket, '$250,000 +'.
only_high_inc = data_de.loc[data_de['DEMO_HH_INCOME'].eq('$250,000 +')]
only_high_inc

Unnamed: 0.1,Unnamed: 0,ZIP,GENDER,AGE,STATE_ABBREVIATION,DMA_NAME_ACXIOM,DEMO_HH_INCOME,ETHNICITY,HOMEOWNERSHIP_STATUS,EDUCATION
108,108,19707,F,65.0,DE,Philadelphia,"$250,000 +",White,Homeowner,College
161,161,19803,M,69.0,DE,Philadelphia,"$250,000 +",White,Homeowner,College
188,188,19711,F,53.0,DE,Philadelphia,"$250,000 +",White,Homeowner,High School
280,280,19711,F,57.0,DE,Philadelphia,"$250,000 +",White,Homeowner,College
304,304,19702,F,63.0,DE,Philadelphia,"$250,000 +",White,Homeowner,High School
...,...,...,...,...,...,...,...,...,...,...
824260,824260,19709,M,27.0,DE,Philadelphia,"$250,000 +",White,Homeowner,High School
824309,824309,19702,M,57.0,DE,Philadelphia,"$250,000 +",White,Homeowner,Graduate School
824323,824323,19806,F,35.0,DE,Philadelphia,"$250,000 +",White,Homeowner,High School
824416,824416,19807,,,DE,Philadelphia,"$250,000 +",White,Renter,not_reported


In [43]:
def print_charts(orig, new, col_name, order):
    """Show side-by-side bar charts of one column's category shares in two frames.

    The left panel plots the percentage of each category of ``col_name`` in
    ``orig``; the right panel plots the percentage of those same categories in
    ``new``. The figure title reports a chi-square goodness-of-fit test of the
    category counts observed in ``new`` against the distribution of ``orig``.

    Parameters
    ----------
    orig : pandas.DataFrame
        Reference data. Its non-null values of ``col_name`` define the
        categories that are plotted and tested.
    new : pandas.DataFrame
        Data compared against ``orig``. Values of ``col_name`` that never
        occur in ``orig`` are neither plotted nor tested.
    col_name : str
        Name of the categorical column to compare.
    order : sequence
        Category order applied to the x axis of both panels.

    Returns
    -------
    None
        The figure is displayed with ``show()``.
    """
    # Category shares in the reference data. value_counts() drops missing
    # values, so these percentages are relative to the non-null rows.
    orig_share = orig[col_name].value_counts(normalize=True)
    labels = orig_share.index
    sizes_orig = orig_share.to_numpy() * 100

    # Counts of the same categories, in the same order, in the new data.
    # Categories that do not occur in `new` get an explicit 0.
    new_counts = (new[col_name].value_counts()
                  .reindex(labels, fill_value=0)
                  .to_numpy())

    # Use the non-null row count as denominator so both panels treat missing
    # values the same way; guard against an empty `new`.
    n_new = new[col_name].count()
    if n_new:
        sizes_new = new_counts / n_new * 100
    else:
        sizes_new = np.zeros(len(labels))

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=(col_name + ": Original", col_name + ": New"))

    # Bar chart for the original dataset (left) and the new dataset (right).
    fig.add_trace(go.Bar(x=list(labels), y=sizes_orig, name='Original'), row=1, col=1)
    fig.add_trace(go.Bar(x=list(labels), y=sizes_new, name="New"), row=1, col=2)

    fig.update_xaxes(categoryorder='array', categoryarray=order)

    # Goodness of fit: observed = counts in `new`, expected = the reference
    # shares scaled to the same total. Every reference share is > 0, so the
    # expected frequencies never contain a zero, and the two sums agree as
    # scipy.stats.chisquare requires.
    n_tested = new_counts.sum()
    if n_tested > 0:
        diff, pval = chisquare(new_counts, f_exp=orig_share.to_numpy() * n_tested)
    else:
        # Nothing to test against (no rows of `new` fall in a known category).
        diff, pval = float('nan'), float('nan')

    fig.update_layout(height=350, width=700,
                      title_text="Change in " + col_name +
                      ". Chi squared difference: " + str(round(diff, 3)) +
                      ". P_value: " + str(round(pval, 3)))

    # Leave 5 percentage points of headroom above the tallest bar;
    # `initial=0` keeps max() defined when there are no categories at all.
    y_top = max(sizes_orig.max(initial=0), sizes_new.max(initial=0)) + 5
    fig.update_yaxes(range=[0, y_top], tickvals=np.arange(0, 101, 10))

    fig.show()

In [44]:
def show_charts(orig, new):
    """Draw the original-vs-new comparison chart for each demographic column.

    Calls ``print_charts`` once per column, in this order: DEMO_HH_INCOME,
    GENDER, HOMEOWNERSHIP_STATUS, EDUCATION, ETHNICITY, DMA_NAME_ACXIOM.

    Parameters
    ----------
    orig : pandas.DataFrame
        Reference data; must contain all six columns listed above.
    new : pandas.DataFrame
        Data compared against ``orig``; must contain the same columns.

    Returns
    -------
    None
    """
    def observed_categories(col_name):
        # Sorted distinct non-null values of the column in `orig`.
        # np.unique() is avoided here: it raises TypeError when an object
        # column mixes strings with NaN (missing values).
        return sorted(orig[col_name].dropna().unique())

    # (column, x-axis category order) pairs; the fixed orders follow the
    # natural ranking of the labels, the last two are alphabetical.
    chart_specs = [
        ("DEMO_HH_INCOME", ['Less than $30,000', '30,000 to 49,999',
                            '50,000 to 74,999', '75,000 to 99,999',
                            '100,000 to 149,999', '150,000 to 199,999',
                            '200,000 to 249,999', '$250,000 +']),
        ("GENDER", ['F', 'M']),
        ("HOMEOWNERSHIP_STATUS", ['Homeowner', 'Renter']),
        ("EDUCATION", ['High School', 'College', 'Graduate School',
                       'Vocational_or_Technical', 'not_reported']),
        ("ETHNICITY", observed_categories("ETHNICITY")),
        ("DMA_NAME_ACXIOM", observed_categories("DMA_NAME_ACXIOM")),
    ]

    # No trailing plt.show(): this function creates no matplotlib figure, so
    # the call had nothing to display.
    for col_name, order in chart_specs:
        print_charts(orig, new, col_name, order)

In [45]:
# Compare the full Delaware audience with its '$250,000 +' income subset,
# one chart per demographic column.
show_charts(data_de, only_high_inc)