# Chi Squared Testing

In this notebook, we divide our data into the categories we identified during our univariate analysis.  We then perform chi-squared tests of association, with each category except Demographics serving as the target, against the variables in every category (including the other variables in the target's own category).

In [None]:
!pip install joblib

import pandas as pd
import numpy as np
import joblib
from scipy.stats import chi2_contingency
from time import time
from plotnine import *
import joblib

import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the pre-processed data that was pickled with joblib.
# NOTE(review): joblib.load unpickles the file, so it should only be pointed at trusted files.
data = joblib.load(filename='GroupedAndUngroupedData.pkl')

In [None]:
# We're going to want to focus on the translated responses, whose column names all end with "_Groups".
# Match on the suffix (not on the substring anywhere in the name) so the filter
# implements exactly that rule.
group_cols = [col for col in data.columns if col.endswith('_Groups')]

In [None]:
# Category definitions: maps each category name to the list of "_Groups"
# column names that belong to it. Insertion order is significant only in that
# it determines the order in which categories are iterated later on.
categoryDict = {
    'Demographics': [
        'age_Groups',
        'gender_Groups',
        'internetAccessCafeKiosk_Groups',
        'internetAccessLaptopTablet_Groups',
        'internetAccessLibrary_Groups',
        'internetAccessMobilePhone_Groups',
        'internetAccessNone_Groups',
        'internetAccessOtherHome_Groups',
        'internetAccessOwnHome_Groups',
        'internetAccessSchool_Groups',
        'internetAccessSetting_Groups',
        'internetAccessSomewhereElse_Groups',
        'internetAccessWork_Groups',
        'langSpokenHome_Groups',
        'USAAreaType_Groups',
        'USAAsianPacificIslander_Groups',
        'USABlack_Groups',
        'USAEducation_Groups',
        'USAHispanic_Groups',
        'USALatino_Groups',
        'USAMiddleEastern_Groups',
        'USAOtherEthnicity_Groups',
        'USAWhite_Groups',
    ],
    'InternetUseVideoGames': [
        'groupElectionCampaignInternetRole_Groups',
        'groupLocalCommunityInternetRole_Groups',
        'groupPoliticalIssueInternetRole_Groups',
        'groupPoliticanInternetRole_Groups',
        'groupSocialIssueInternetRole_Groups',
        'internetUseBoughtThings_Groups',
        'internetUseGovernmentBenefits_Groups',
        'internetUseGovernmentForms_Groups',
        'internetUseHealthDifficult_Groups',
        'internetUseHealthStandard_Groups',
        'internetUsePayFine_Groups',
        'internetUseRemixFoundWork_Groups',
        'internetUseSearchEngine_Groups',
        'internetUseShareOtherWork_Groups',
        'internetUseShareOwnWork_Groups',
        'internetUseSocialNetwork_Groups',
        'internetUseTravel_Groups',
        'internetUseTVShows_Groups',
        'internetUseVideos_Groups',
        'videoGamePlayingGamer_Groups',
        'videoGamePlayingHelpOthers_Groups',
        'videoGamePlayingLearnSocietyProblems_Groups',
        'videoGamePlayingMoralEthicalIssues_Groups',
        'videoGameTypeMostFrequent_Groups',
        'videoGameTypeMultiplayerCoop_Groups',
        'videoGameTypeMultiplayerPVP_Groups',
        'videoGameTypeSingleplayer_Groups',
        'videoGameUseFrequency_Groups',  # Target
    ],
    'PoliticalEngagement': [
        'USAPoliticalParty_Groups',  # Target
        'discussPoliticsFamily_Groups',
        'discussPoliticsFriends_Groups',
        'discussPoliticsOthers_Groups',
        'groupElectionCampaign_Groups',
        'groupFocusInternational_Groups',
        'groupFocusLocal_Groups',
        'groupFocusNational_Groups',
        'groupLocalCommunity_Groups',
        'groupPoliticalIssue_Groups',
        'groupPolitican_Groups',
        'groupSocialIssue_Groups',
        'politicalInterest_Groups',
        'politicsEngagementOccassional_Groups',
        'politicsFunImportance_Groups',
        'politicsInfluenceLocal_Groups',
        'politicsInfluenceSelf_Groups',
        'politicsInvovledOnlyImportant_Groups',
        'pyAttendedRally_Groups',  # Target -- online vs offline -- vs news source
        'pyCommentedNews_Groups',
        'pyContacedPolitican_Groups',
        'pyDiscussedPoliticsFriendsFamily_Groups',
        'pyEncourageVote_Groups',
        'pySignedPetition_Groups',
        'pyVoted_Groups',  # Target
        'selfInternetDiscussPolitics_Groups',
        'selfInternetExpressConcern_Groups',
        'selfInternetFindPoliticalInfo_Groups',
        'selfMoreInvolvedPolitics_Groups',
        'selfPoliticsTooComplicated_Groups',
        'selfUnderstandPolitics_Groups',
    ],
    'SocialViews': [
        'abortionIllegal_Groups',
        'gayMarriage_Groups',
        'govtBusinessRegulation_Groups',
        'govtHelpVulnerable_Groups',
        'govtPoorDependence_Groups',
        'immigrationThreatenTradValues_Groups',
        'newsAccurate_Groups',
        'newsTrustworthy_Groups',
        'newsUnbiased_Groups',
        'peopleFair_Groups',
        'peopleTakeAdvantage_Groups',
        'peopleTrusted_Groups',
        'politiciansCountryBestInterests_Groups',
        'politiciansDoRight_Groups',
        'politiciansTellTruth_Groups',
        'protestEffective_Groups',
        'protestIllegal_Groups',
        'protestNormal_Groups',
        'protestNotRepresentative_Groups',
        'pyBoughtForCause_Groups',
        'pyBoycottedForCause_Groups',
        'pyDisplayedCauseSymbol_Groups',
        'pyDonatedMoneyForCause_Groups',
        'pyRaisedMoneyForCharity_Groups',
        'pySignedUpForCauseInfo_Groups',
    ],
    'TraditionalSocialMedia': [
        'facebookDailyRoutine_Groups',
        'facebookEncouragePoliticalAction_Groups',
        'facebookEncourageVote_Groups',
        'facebookFollowPoliticalLinks_Groups',
        'facebookLearnEvents_Groups',
        'facebookLearnEventsBeforeNews_Groups',
        'facebookLearnEventsDeeper_Groups',
        'facebookLearnOpposingViewpoints_Groups',
        'facebookLearnOthersInterests_Groups',
        'facebookLikePoliticalMaterial_Groups',
        'facebookOutOfTouchIfNotUsed_Groups',
        'facebookPostPoliticalLinks_Groups',
        'facebookPostPoliticalOpinions_Groups',
        'facebookRepostPoliticalMaterial_Groups',
        'facebookUnderstandSociety_Groups',
        'facebookUpsetIfShutdown_Groups',
        'facebookUseAmount_Groups',
        'internationalNewsInterest_Groups',
        'localNewsInterest_Groups',
        'nationalNewsInterest_Groups',
        'politicalNewsInterest_Groups',
        'socialMediaUseFacebook_Groups',
        'socialMediaUseFoursquare_Groups',
        'socialMediaUseGooglePlus_Groups',
        'socialMediaUseInstagram_Groups',
        'socialMediaUseLinkedIn_Groups',
        'socialMediaUseMyspace_Groups',
        'socialMediaUsePinterest_Groups',
        'socialMediaUseReddit_Groups',
        'socialMediaUseTwitter_Groups',
        'socialMediaUseYouTube_Groups',
        'twitterUseAmount_Groups',
        'twitterUseDiscussNewsPolitics_Groups',
        'twitterUseReadNewsPolitics_Groups',
        'twitterUseShareNewsPolitics_Groups',
        'USAPoliticalNewsCableTV_Groups',
        'USAPoliticalNewsSourceComedyTV_Groups',
        'USAPoliticalNewsSourceDailyNewspapers_Groups',
        'USAPoliticalNewsSourceLateNightTV_Groups',
        'USAPoliticalNewsSourceLocalTV_Groups',
        'USAPoliticalNewsSourceNetworkTV_Groups',
        'USAPoliticalNewsSourceRadio_Groups',
        'USAPoliticalNewsSourceSocialMedia_Groups',
        'USAPoliticalNewsSourceWeeklyMagazines_Groups',
    ],
}

### Additional Data Carpentry: Progressivism Scoring / Grouping

We also will add one additional set of variables to the prepared dataframe.  There are a series of questions in the survey that ask for agreement / disagreement with various statements on social issues.  We translate those into a "score" that indicates how progressive / conservative the Respondent was.  The higher the number, the more progressive.
The raw responses are ordered 1 - 5, ranging from strongly disagree to strongly agree (3 = neutral).  
Respondents could also respond 99, which translates to "I don't know".  We'll treat that the same as neutral.
Depending on the question, we'll translate that into -2, -1, 0, 1, 2 for our progressivism score

In [None]:
# Raw responses run 1 (strongly disagree) .. 5 (strongly agree), with 3 = neutral.
# 99 means "I don't know" and is scored as neutral (0), the same as a 3.
# Any value that is not a key in the maps (e.g. a missing response) maps to NaN,
# which the row-wise sum below skips.
consToProMap = {1: -2, 2: -1, 3: 0, 4: 1, 5: 2, 99: 0}  # agreeing is the progressive answer
proToConsMap = {1: 2,  2: 1,  3: 0, 4:-1, 5:-2, 99: 0}  # agreeing is the conservative answer

# Survey item column names
govtHelpVulnerable = 'govtHelpVulnerable'
govtBusinessRegulation = 'govtBusinessRegulation'
govtPoorDependence = 'govtPoorDependence'
immigrationThreatenTradValues = 'immigrationThreatenTradValues'
gayMarriage = 'gayMarriage'
abortionIllegal = 'abortionIllegal'

# Which direction each statement is scored in.
scoreMaps = {
    # "The gov't should help more vulnerable people, even if it means going deeper into debt"
    # Disagree = conservative, agree = progressive
    govtHelpVulnerable: consToProMap,
    # "Government regulation of business usually does more harm than good"
    # Disagree = progressive, agree = conservative
    govtBusinessRegulation: proToConsMap,
    # "Poor people have become too dependent on government assistance programs"
    # Disagree = progressive, agree = conservative
    govtPoorDependence: proToConsMap,
    # "The growing number of newcomers from other countries threatens traditional American values"
    # Disagree = progressive, agree = conservative
    immigrationThreatenTradValues: proToConsMap,
    # "Gays and lesbians should be allowed to marry legally"
    # Disagree = conservative, agree = progressive
    gayMarriage: consToProMap,
    # "Abortion should be illegal in all or most cases"
    # Disagree = progressive, agree = conservative
    abortionIllegal: proToConsMap,
}

# Add one "<question>_Score" column per statement.
for question, scoreMap in scoreMaps.items():
    data[f'{question}_Score'] = data[question].map(scoreMap)

cols = [f'{question}_Score' for question in scoreMaps]

# Overall score: sum of the six item scores (NaN item scores are skipped by sum).
# The higher the number, the more progressive.
data['progressivism_Score'] = data[cols].sum(axis=1)

We can classify the scores into 5 groups:

Very Conservative, Slightly Conservative, Neither, Slightly Progressive, Very Progressive

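There were six questions, each scored from -2 to 2, making the max score 12 and the min score -12.  
So we'll divide each side in two down the middle: scores of 6 and above are Very Progressive, 1 to 5 are Slightly Progressive, 0 is Neither, -1 to -5 are Slightly Conservative, and -6 and below are Very Conservative.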


In [None]:
binlabels = ['Very Conservative', 'Slightly Conservative', 'Neither', 'Slightly Progressive', 'Very Progressive']

# progressivism_Score is a sum of whole-number item scores, so it only takes
# integer values in [-12, 12]. pd.cut uses right-closed intervals by default,
# so these edges give:
#   (-13, -6] -> Very Conservative      (-12 .. -6)
#   (-6,  -1] -> Slightly Conservative  (-5 .. -1)
#   (-1,   0] -> Neither                (0)
#   (0,    5] -> Slightly Progressive   (1 .. 5)
#   (5,   13] -> Very Progressive       (6 .. 12)
# Score 0 gets its own bin, so no after-the-fact patching is needed.
bins = [-13, -6, -1, 0, 5, 13]

# Passing labels= to pd.cut avoids assigning to Categorical.categories,
# which is no longer supported in pandas 2.x.
progressivism_Groups = pd.cut(data['progressivism_Score'], bins=bins, labels=binlabels)

# Store the group as plain strings rather than as a categorical.
data['progressivism_Groups'] = progressivism_Groups.astype(str)

In [None]:
# Register the derived progressivism_Groups column as its own single-column category
categoryDict.update({'Progressivism': ['progressivism_Groups']})

### Chi Squared testing

Here we perform the testing.  Every category except Demographics will serve as the "Y" (or target).

In [None]:
# Target ("Y") categories: everything except Demographics.
yCats = [cat for cat in categoryDict if cat != 'Demographics']
# Explanatory ("X") categories: all of them. Materialised as a list so that, like
# yCats, it is a snapshot taken here rather than a live view of categoryDict.
xCats = list(categoryDict)

Additional utility function to break out and store granular details about the chi squared tests.
Namely, we'll store the differences between the observed and expected values for each variable that gets tested.

In [None]:
def GetCrosstabVals(observed, expected):
    """Flatten an observed/expected table pair into one row per cell.

    Parameters
    ----------
    observed : pandas.DataFrame
        Table of observed values; its index and columns drive the iteration.
    expected : pandas.DataFrame
        Table of expected values, looked up with the same row/column labels
        as `observed`.

    Returns
    -------
    pandas.DataFrame
        One row per (row label, column label) pair with the columns
        XValue, YValue, observedVal, expectedVal and delta (the absolute
        difference between observed and expected), sorted by delta from
        largest to smallest.
    """
    cellRecords = []
    for rowLabel in observed.index:
        for colLabel in observed.columns:
            seen = observed.loc[rowLabel, colLabel]
            anticipated = expected.loc[rowLabel, colLabel]
            # A fresh dict per cell, in the column order of the output frame
            cellRecords.append({
                'XValue': rowLabel,
                'YValue': colLabel,
                'observedVal': seen,
                'expectedVal': anticipated,
                'delta': abs (seen - anticipated),
            })
    comparison = pd.DataFrame (cellRecords)
    return comparison.sort_values('delta', ascending=False)

Here we perform the testing. Since this takes a few minutes to complete, some progress messages are displayed along the way.

In [None]:
def _resolveColumn(col):
    """Return the column of `data` to use for a category entry.

    There are some columns that don't have a "_Groups" equivalent (the binary
    columns). For those, we just use the response as-is, i.e. the same name
    with '_Groups' removed.
    """
    return col if col in data.columns else col.replace('_Groups', '')


resultList = []
resultDict = {}

startTime = time()

for yCat in yCats:
    print (f'****** Starting yCat = {yCat} ******')
    # Resolve names once, up front, so the loop variables are never rewritten
    # mid-iteration and the self-comparison check below sees the real column names.
    yCols = [_resolveColumn(col) for col in categoryDict[yCat]]
    for xCat in xCats:
        print (f'...... Starting xCat = {xCat}')
        tabCount = 0
        xCols = [_resolveColumn(col) for col in categoryDict[xCat]]
        for X in xCols:
            for Y in yCols:
                if X == Y:
                    # Don't need to compare columns to themselves
                    continue
                ctab = pd.crosstab (data[X], data[Y])
                chi2, p, dof, expected = chi2_contingency(ctab)
                # Give the expected frequencies the same labels as the observed table
                expectedTab = pd.DataFrame(expected, columns = ctab.columns, index = ctab.index)
                # One fresh record per test
                resultDict = {
                    'XCategory': xCat,
                    'YCategory': yCat,
                    'X': X,
                    'Y': Y,
                    'chi2': chi2,
                    'p': p,
                    'dof': dof,
                    'expected': expectedTab,
                    'observed': ctab,
                    'comparison': GetCrosstabVals(ctab, expectedTab),
                }
                resultList.append(resultDict)
                tabCount += 1
        print (f' ..... Finished {xCat} -- {tabCount} tables')
    print (f'****** Finished {yCat} ******')

print (f'Done after {time() - startTime} secs')

# One row per (X, Y) test
results = pd.DataFrame(resultList)

Store the results to be used for further analysis.  Also store the updated Data with Grouped and Ungrouped Responses.

In [None]:
# Chi-squared results, one row per test
joblib.dump(value=results, filename='Chi2Results.pkl')
# The data frame, written back to the same file it was loaded from
joblib.dump(value=data, filename='GroupedAndUngroupedData.pkl')