In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats.contingency import crosstab

In [2]:
# Load the raw survey export; the later cells build their frames from `df`.
df = pd.read_csv(filepath_or_buffer="survey_results_public.csv")

In [31]:
# Split the survey into categorical and quantitative frames, then print the
# correlation matrix of the quantitative columns.

# Columns treated as quantitative; every other column is treated as categorical.
quant_cols = ['Respondent',
              'Age',
              'Age1stCode',
              'CompTotal',
              'ConvertedComp',
              'WorkWeekHrs',
              'YearsCode',
              'YearsCodePro']

cat_vars = df.drop(quant_cols, axis=1)

# Text answers that appear in otherwise numeric columns, mapped to a
# representative number so the columns can be cast to float below.
text_to_number = {
    # 55: it's unlikely they coded for more than 60 years
    'YearsCode': {'Less than 1 year': 0.5, 'More than 50 years': 55},
    'YearsCodePro': {'Less than 1 year': 0.5, 'More than 50 years': 55},
    # true values are unlikely to be far from these replacements
    'Age1stCode': {'Younger than 5 years': 3, 'Older than 85': 87},
}

# Assign the result back instead of calling replace(..., inplace=True) on
# df[col]: in-place mutation of a column selection is chained assignment,
# which pandas warns about and which does not update `df` under Copy-on-Write.
for col, mapping in text_to_number.items():
    df[col] = df[col].replace(mapping)

quant_vars = df.filter(quant_cols, axis=1)

# The columns are read in as 'object'; cast categorical ones to str and
# quantitative ones to float.
# NOTE(review): casting to str appears to turn missing answers into the literal
# text 'nan', which would then count as a level of its own in later cells --
# confirm this is intended.
cat_vars = pd.DataFrame(cat_vars, dtype=str)
quant_vars = pd.DataFrame(quant_vars, dtype=float)
print(round(quant_vars.corr(numeric_only=True), 2))  # find correlation matrix

               Respondent   Age  Age1stCode  CompTotal  ConvertedComp  \
Respondent           1.00 -0.03        0.03       0.00          -0.01   
Age                 -0.03  1.00        0.08       0.04           0.11   
Age1stCode           0.03  0.08        1.00       0.00          -0.04   
CompTotal            0.00  0.04        0.00        NaN           0.00   
ConvertedComp       -0.01  0.11       -0.04       0.00           1.00   
WorkWeekHrs         -0.00  0.05       -0.00      -0.00           0.03   
YearsCode           -0.05  0.79       -0.27       0.00           0.11   
YearsCodePro        -0.04  0.83       -0.16       0.00           0.11   

               WorkWeekHrs  YearsCode  YearsCodePro  
Respondent           -0.00      -0.05         -0.04  
Age                   0.05       0.79          0.83  
Age1stCode           -0.00      -0.27         -0.16  
CompTotal            -0.00       0.00          0.00  
ConvertedComp         0.03       0.11          0.11  
WorkWeekHrs       

In [23]:
# Number of distinct levels in each categorical column, smallest first, so the
# columns with too many levels are easy to spot and drop.
# (nunique and sort_values are used with their defaults: dropna=True, ascending.)
col_unique_counts = cat_vars.nunique().sort_values()
print(col_unique_counts)

Hobbyist                            3
Trans                               3
NEWOtherComms                       3
NEWDevOps                           4
CompFreq                            4
SurveyLength                        4
SurveyEase                          4
NEWOffTopic                         4
NEWOnboardGood                      4
SOAccount                           4
PurchaseWhat                        4
JobSeek                             4
NEWLearn                            5
OpSys                               5
NEWPurpleLink                       5
JobSat                              6
NEWDevOpsImpt                       6
NEWEdImpt                           6
NEWOvertime                         6
MainBranch                          6
SOComm                              7
SOPartFreq                          7
SOVisitFreq                         7
WelcomeChange                       7
Gender                              8
Employment                          8
EdLevel     

In [36]:
# Keep only the categorical variables with a small number of levels: everything
# up to and including 'Sexuality' in the sorted counts (even that one is a bit much).
small_cats = col_unique_counts.loc[slice(None, 'Sexuality')]
cat_vars = cat_vars.loc[:, small_cats.index]
cat_vars.columns

Index(['Hobbyist', 'Trans', 'NEWOtherComms', 'NEWDevOps', 'CompFreq',
       'SurveyLength', 'SurveyEase', 'NEWOffTopic', 'NEWOnboardGood',
       'SOAccount', 'PurchaseWhat', 'JobSeek', 'NEWLearn', 'OpSys',
       'NEWPurpleLink', 'JobSat', 'NEWDevOpsImpt', 'NEWEdImpt', 'NEWOvertime',
       'MainBranch', 'SOComm', 'SOPartFreq', 'SOVisitFreq', 'WelcomeChange',
       'Gender', 'Employment', 'EdLevel', 'OrgSize', 'UndergradMajor',
       'Sexuality'],
      dtype='object')

In [14]:
# Worked example for a single pair of variables: Gender vs. OrgSize.
gender_answers = cat_vars['Gender']
org_size_answers = cat_vars['OrgSize']

# ctab_result[0] holds the two label sets (rows, then columns);
# ctab_result[1] is the actual contingency table.
ctab_result = crosstab(
    gender_answers,
    org_size_answers,
    levels=(gender_answers.unique(), org_size_answers.unique()),
)
contingency_table = ctab_result[1]
print(contingency_table)

# Chi-squared test of independence on that table.
test_results = stats.chi2_contingency(contingency_table)
print(test_results)  # second value is the p-value

[[ 3535  3949 10271  7748  1730  4963  6762  2320  3290  1445]
 [  565   534  8627  1121   337   737   925   328   521   209]
 [  253   330  1004   627    97   389   548   201   261   134]
 [   16     9    34    13     1    19    15     3     9     2]
 [   28    32   116    61    14    30    39    22    28    15]
 [    3     5    30    15     6     7    12     4     7     3]
 [    0     1     9     3     1     1     6     1     4     0]
 [    9     3    36     9     4     3     4     1     7     0]]
(8006.103227001142, 0.0, 63, array([[3.14719469e+03, 3.47126509e+03, 1.43668831e+04, 6.85044850e+03,
        1.56324708e+03, 4.38922662e+03, 5.93248698e+03, 2.05577698e+03,
        2.94589986e+03, 1.29057111e+03],
       [9.51005042e+02, 1.04893117e+03, 4.34131968e+03, 2.07003751e+03,
        4.72374924e+02, 1.32631663e+03, 1.79265205e+03, 6.21205380e+02,
        8.90178682e+02, 3.89978933e+02],
       [2.62921705e+02, 2.89995067e+02, 1.20023251e+03, 5.72297482e+02,
        1.30596174e+02, 

In [37]:
# Pairwise chi-squared tests of independence between all categorical columns.
# This is the slow cell of the notebook. The p-value does not depend on which
# variable forms the rows of the table, so each unordered pair is tested once
# and the result mirrored across the diagonal (about half the original work).
columns = list(cat_vars.columns)
n_cols = len(columns)
chi_sq_tests = np.zeros((n_cols, n_cols))

# The level set of a column is the same for every pair it takes part in,
# so compute it once per column instead of inside the inner loop.
levels_by_col = {name: cat_vars[name].unique() for name in columns}

n_pairs = n_cols * (n_cols + 1) // 2  # upper triangle, diagonal included
pairs_done = 0

for i, rows in enumerate(columns):
    for j in range(i, n_cols):
        cols = columns[j]

        ctab_result = crosstab(cat_vars[rows], cat_vars[cols],
                               levels=(levels_by_col[rows],
                                       levels_by_col[cols]))

        test_results = stats.chi2_contingency(ctab_result[1])
        # load the p-value into both symmetric cells of the grid
        chi_sq_tests[i, j] = chi_sq_tests[j, i] = test_results[1]

        # count finished pairs so the progress display ends at 100%
        pairs_done += 1
        amount_completed = 100 * pairs_done / n_pairs
        print(" row:", i,
              "col:", j,
              "Progress: {:.2f}%".format(amount_completed),
              end="\r")  # progress

 row: 29 col: 29 Progress: 99.89%

In [39]:
# Wrap the p-value grid in a DataFrame so rows and columns carry the variable names.
var_names = cat_vars.columns
chi_sq_tests = pd.DataFrame(chi_sq_tests, index=var_names, columns=var_names)

# Save a copy rounded to two decimals for later use.
chi_sq_tests.round(2).to_csv('chi_sq_tests.csv')

In [46]:
# Flag the cells whose p-value is greater than 0.05.
indep_vars = chi_sq_tests.gt(0.05)
# Number of flagged cells in each column -- looks like it was none of them!
print(indep_vars.sum())

Hobbyist          0
Trans             0
NEWOtherComms     0
NEWDevOps         0
CompFreq          0
SurveyLength      0
SurveyEase        0
NEWOffTopic       0
NEWOnboardGood    0
SOAccount         0
PurchaseWhat      0
JobSeek           0
NEWLearn          0
OpSys             0
NEWPurpleLink     0
JobSat            0
NEWDevOpsImpt     0
NEWEdImpt         0
NEWOvertime       0
MainBranch        0
SOComm            0
SOPartFreq        0
SOVisitFreq       0
WelcomeChange     0
Gender            0
Employment        0
EdLevel           0
OrgSize           0
UndergradMajor    0
Sexuality         0
dtype: int64


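This means that, at the 0.05 level, the chi-squared test rejects independence for every pair of these categorical variables, i.e. no two variables are independent of each other. (With a sample this large the test will flag even very weak associations, so this says that the variables are related, not how strongly.) So we could use exactly one categorical variable, but not a second one, since we'd run into multicollinearity issues with two or more.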