In [4]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats.contingency import crosstab

In [5]:
df = pd.read_csv("survey_results_public.csv")

In [6]:
print(round(df.corr(numeric_only=True),1))  # find correlation matrix, and cut off unnecessary decimals
cat_vars = df.drop(['Respondent', 
                    'Age', 
                    'CompTotal', 
                    'ConvertedComp',
                    'WorkWeekHrs'], 
                    axis=1)


quant_vars = df.filter(['Respondent',
                        'Age',
                        'CompTotal',
                        'ConvertedComp',
                        'WorkWeekHrs'],
                        axis=1)

# for some reason, this data is all 'object', not strings, so this converts it
cat_vars = pd.DataFrame(cat_vars, dtype=str)  
quant_vars = pd.DataFrame(quant_vars, dtype=float)

               Respondent  Age  CompTotal  ConvertedComp  WorkWeekHrs
Respondent            1.0 -0.0        0.0           -0.0         -0.0
Age                  -0.0  1.0        0.0            0.1          0.1
CompTotal             0.0  0.0        NaN            0.0         -0.0
ConvertedComp        -0.0  0.1        0.0            1.0          0.0
WorkWeekHrs          -0.0  0.1       -0.0            0.0          1.0


In [22]:
ctab_result = crosstab(cat_vars['EdLevel'], 
                       cat_vars['Gender'],
                       levels=(
                           cat_vars['EdLevel'].unique(),
                           cat_vars['Gender'].unique()))

# these two are the label sets
print(ctab_result)
print(ctab_result[0][0])  # rows
print(ctab_result[0][1])  # columns

# this is the actual contingency table
print(ctab_result[1])
contingency_table = ctab_result[1]

# this is the chi-squared test
test_results = stats.chi2_contingency(contingency_table)
print(test_results[1]) # second value is the p-value

([array(['Master’s degree (M.A., M.S., M.Eng., MBA, etc.)',
       'Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'nan',
       'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
       'Professional degree (JD, MD, etc.)',
       'Some college/university study without earning a degree',
       'Associate degree (A.A., A.S., etc.)',
       'Other doctoral degree (Ph.D., Ed.D., etc.)',
       'Primary/elementary school',
       'I never completed any formal education'], dtype=object), array(['Man', 'nan', 'Woman',
       'Man;Non-binary, genderqueer, or gender non-conforming',
       'Non-binary, genderqueer, or gender non-conforming',
       'Woman;Non-binary, genderqueer, or gender non-conforming',
       'Woman;Man;Non-binary, genderqueer, or gender non-conforming',
       'Woman;Man'], dtype=object)], array([[10200,  1847,   948,    24,    69,    14,     2,     8],
       [20542,  3806,  1927,    37,   139,    46,    10,    35],
       [ 1005,  58

In [7]:
# this codeblock takes about 11 MINUTES to run on less than 10% of the data on my machine...
# ...run at your own risk, or only once, then export the results!
chi_sq_tests = np.zeros((len(cat_vars.columns),len(cat_vars.columns)))

for i, rows in enumerate(cat_vars.columns):
    for j, cols in enumerate(cat_vars.columns):
        
        ctab_result = crosstab(cat_vars[rows], cat_vars[cols],
                               levels=(
                                   cat_vars[rows].unique(),
                                   cat_vars[cols].unique()))

        test_results = stats.chi2_contingency(ctab_result[1])
        chi_sq_tests[i][j] = test_results[1]  # load p-values into the grid
        print(i,j) # progress

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 32
1 33
1 34
1 35
1 36
1 37
1 38
1 39
1 40
1 41
1 42
1 43
1 44
1 45
1 46
1 47
1 48
1 49
1 50
1 51
1 52
1 53
1 54
1 55
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17


In [1]:
# redefine as a DataFrame, to have labels
chi_sq_tests = pd.DataFrame(round(chi_sq_tests,), 
                            columns=cat_vars.columns, 
                            index=cat_vars.columns)
pd.to_csv(chi_sq_tests)

NameError: name 'pd' is not defined