https://umich.instructure.com/courses/825993/files/folder/2%20-%20Stats%20and%20Programming%20Review?preview=44628369

In [21]:
# Load libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import os


# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

# Set directories
data_dir = '/Users/nick/Desktop/School/Winter 2026/SI 313/si313_w26/data/'
os.chdir(data_dir)

In [24]:
# Load the Dawtry et al. (2015) Study 1a data
df = pd.read_csv('correlation-assignment_Dawtry Sutton and Sibley 2015 Study 1a.csv')

# Explore the dataframe
print(f"Shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")
print(f"\nFirst few rows:\n{df.head()}")

Shape: (305, 37)

Column names:
['PS', 'PD_15', 'PD_30', 'PD_45', 'PD_60', 'PD_75', 'PD_90', 'PD_105', 'PD_120', 'PD_135', 'PD_150', 'PD_150plus', 'fairness', 'satisfaction', 'SC_15', 'SC_30', 'SC_45', 'SC_60', 'SC_75', 'SC_90', 'SC_105', 'SC_120', 'SC_135', 'SC_150', 'SC_150plus', 'redist1', 'redist2', 'redist3', 'redist4', 'Household_Income', 'Political_Preference', 'age', 'gender', 'Population_Inequality_Gini_Index', 'Population_Mean_Income', 'Social_Circle_Inequality_Gini_Index', 'Social_Circle_Mean_Income']

First few rows:
    PS  PD_15  PD_30  PD_45  PD_60  PD_75  PD_90  PD_105  PD_120  PD_135  \
0  233     27     48     21      0      0      0       0       0       0   
1  157     39      0      0      0      0      0       0       0       0   
2  275      0      0     50      0      0     50       0       0       0   
3  111      9     14     17     17     17      8       7       5       2   
4   52     68     32      0      0      0      0       0       0       0   

   PD_15

In [31]:
# Look at data types and missing values
print(f'''\n\n\n
######################
##### DATA TYPES #####
######################''')
print(df.info())

# Get basic stats for all numeric variables
print(f'''\n\n\n
#######################
##### BASIC STATS #####
#######################''')
print(df.describe())

# Check specific variables we'll use
print(f'''\n\n\n
##############################
##### RELEVANT VARIABLES #####
##############################''')
relevant_vars = ['fairness', 'satisfaction', 'redist1', 'redist2', 'redist3', 'redist4', 'Household_Income', 'Social_Circle_Mean_Income', 'Population_Mean_Income', 'Social_Circle_Inequality_Gini_Index', 'Population_Inequality_Gini_Index', 'Political_Preference']

print(df[relevant_vars].describe())





######################
##### DATA TYPES #####
######################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 37 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   PS                                   305 non-null    int64  
 1   PD_15                                305 non-null    int64  
 2   PD_30                                305 non-null    int64  
 3   PD_45                                305 non-null    int64  
 4   PD_60                                305 non-null    int64  
 5   PD_75                                305 non-null    int64  
 6   PD_90                                305 non-null    int64  
 7   PD_105                               305 non-null    int64  
 8   PD_120                               305 non-null    int64  
 9   PD_135                               305 non-null    int64  
 10  PD_150                   

In [33]:
# Compute mean of fairness and satisfaction
df['fairness_and_satisfaction'] = df[['fairness', 'satisfaction']].mean(axis=1)

# Verify it worked
print(f"New variable created: {df['fairness_and_satisfaction'].describe()}")
print(f"\nMissing values: {df['fairness_and_satisfaction'].isna().sum()}")
# Check a few cases manually
print(df[['fairness', 'satisfaction', 'fairness_and_satisfaction']].head(10))

New variable created: count    305.00
mean       3.54
std        2.02
min        1.00
25%        2.00
50%        3.00
75%        5.00
max        9.00
Name: fairness_and_satisfaction, dtype: float64

Missing values: 0
   fairness  satisfaction  fairness_and_satisfaction
0         1             1                        1.0
1         5             2                        3.5
2         5             5                        5.0
3         7             7                        7.0
4         4             5                        4.5
5         1             4                        2.5
6         3             3                        3.0
7         5             4                        4.5
8         5             3                        4.0
9         4             5                        4.5


In [35]:
# Reverse code redist2 and redist4
# Formula: 7 - original value (for 1-6 scale)
df['redist2_recode'] = 7 - df['redist2']
df['redist4_recode'] = 7 - df['redist4']

# Verify it worked
print("Original redist2 values:")
print(df['redist2'].value_counts().sort_index())
print("\nRecoded redist2 values:")
print(df['redist2_recode'].value_counts().sort_index())

# Check a few cases
print("\nSample of recoding:")
print(df[['redist2', 'redist2_recode', 'redist4', 'redist2_recode']].head(10))

Original redist2 values:
redist2
1    78
2    97
3    77
4    31
5    15
6     7
Name: count, dtype: int64

Recoded redist2 values:
redist2_recode
1     7
2    15
3    31
4    77
5    97
6    78
Name: count, dtype: int64

Sample of recoding:
   redist2  redist2_recode  redist4  redist2_recode
0        3               4        1               4
1        2               5        4               5
2        4               3        5               3
3        3               4        4               4
4        5               2        5               2
5        5               2        6               2
6        2               5        5               5
7        3               4        4               4
8        4               3        5               3
9        4               3        5               3


In [36]:
# Compute mean of all redistribution items
df['support_for_redistribution'] = df[['redist1', 'redist2_recode', 'redist3', 'redist4_recode']].mean(axis=1)

# Verify it worked
print(f"New variable created: {df['support_for_redistribution'].describe()}")
print(f"\nMissing values: {df['support_for_redistribution'].isna().sum()}")

# Check internal consistency (Cronbach's alpha - optional)
redist_items = df[['redist1', 'redist2_recode', 'redist3', 'redist4_recode']]
print(redist_items.corr())

New variable created: count    305.00
mean       3.91
std        1.15
min        1.00
25%        3.25
50%        4.00
75%        4.75
max        6.00
Name: support_for_redistribution, dtype: float64

Missing values: 0
                redist1  redist2_recode  redist3  redist4_recode
redist1            1.00            0.36     0.75            0.51
redist2_recode     0.36            1.00     0.44            0.45
redist3            0.75            0.44     1.00            0.51
redist4_recode     0.51            0.45     0.51            1.00


Calculate descriptive statistics slide 27