In [9]:
# Load libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration: never truncate wide frames, and show floats
# rounded to two decimal places.
pd.options.display.max_columns = None
pd.options.display.precision = 2

# Helper Functions

To get p-values, you have to use a library specifically built for statistical testing, like _SciPy_.

`scipy` is the gold standard for scientific computing, but it is designed to calculate the correlation and p-value for one pair of variables at a time. It doesn't have a single "matrix" function like `df.corr()`.

To create the correlation matrix and get p-values using `scipy`, you need to iterate through your column combinations.

In [10]:
def get_scipy_corrs(df, variables):
    """Build Pearson correlation and p-value matrices with SciPy.

    Each pair of variables is correlated using pairwise deletion: only the
    rows where *both* variables are non-missing are used for that pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in ``variables``.
    variables : list of str
        Column names to correlate. The order determines the row/column
        order of the returned matrices.

    Returns
    -------
    r_df : pandas.DataFrame
        ``len(variables) x len(variables)`` matrix of Pearson r values.
    p_df : pandas.DataFrame
        Matrix of the corresponding two-sided p-values.

    Cells stay NaN when a pair has fewer than two complete rows or when
    SciPy rejects the input with a ``ValueError``. For a constant column,
    SciPy itself emits a warning and returns NaN.

    Raises
    ------
    KeyError
        If a name in ``variables`` is not a column of ``df``.
    ValueError, TypeError
        If a column's non-missing values cannot be converted to float.
    """
    n = len(variables)
    # Start from NaN so pairs that cannot be computed remain visibly missing
    r_matrix = np.full((n, n), np.nan)
    p_matrix = np.full((n, n), np.nan)

    # Pull each column out once. Selecting columns one at a time (rather than
    # df[[a, b]]) keeps every operand 1-D even when a variable is paired
    # with itself on the diagonal.
    columns = [df[name] for name in variables]
    observed = [col.notna().to_numpy() for col in columns]

    for i in range(n):
        # Pearson's r is symmetric: compute the upper triangle (diagonal
        # included) and mirror each result into the lower triangle.
        for j in range(i, n):
            # 1. Pairwise deletion: rows where both variables are present
            valid = observed[i] & observed[j]

            # 2. Convert the surviving values to 1-D float arrays
            x = columns[i][valid].astype(float).to_numpy()
            y = columns[j][valid].astype(float).to_numpy()

            # 3. A correlation needs at least two observations
            if len(x) < 2:
                continue

            try:
                # Tuple unpacking works for both the legacy tuple return and
                # the newer result object with .statistic / .pvalue.
                r_val, p_val = stats.pearsonr(x, y)
            except ValueError:
                # SciPy rejected this pair; leave it as NaN. Any other
                # exception is a real bug and is allowed to propagate.
                continue

            r_matrix[i, j] = r_matrix[j, i] = r_val
            p_matrix[i, j] = p_matrix[j, i] = p_val

    # Return as labelled DataFrames
    r_df = pd.DataFrame(r_matrix, index=variables, columns=variables)
    p_df = pd.DataFrame(p_matrix, index=variables, columns=variables)
    return r_df, p_df

Creating a table that follows APA conventions is a little tricky. We've written a function to help you.

In [11]:
def export_apa_correlation(r_df, p_df, decimals=2):
    """Format a correlation matrix as an APA-style lower-triangle table.

    Parameters
    ----------
    r_df : pandas.DataFrame
        Matrix of correlation coefficients (variables x variables).
    p_df : pandas.DataFrame
        Matrix of p-values laid out like ``r_df``; read by position.
    decimals : int, default 2
        Number of decimal places shown for each coefficient.

    Returns
    -------
    pandas.DataFrame
        A table of strings. The diagonal and upper triangle are blank; each
        lower-triangle cell holds the coefficient without a leading zero,
        followed by significance stars (``*`` p < .05, ``**`` p < .01,
        ``***`` p < .001). Data columns are renamed ``"1."``, ``"2."``, ...
        and two leading columns are added: a running number and the
        variable names. A missing coefficient is rendered as ``"nan"``.
    """
    n_rows, n_cols = r_df.shape

    # Start with every cell blank; only the lower triangle is filled in,
    # which leaves the diagonal and upper triangle hidden.
    apa_df = pd.DataFrame(
        np.full((n_rows, n_cols), "", dtype=object),
        index=r_df.index,
        columns=r_df.columns,
    )

    for i in range(n_rows):
        for j in range(min(i, n_cols)):
            r_val = r_df.iloc[i, j]
            p_val = p_df.iloc[i, j]

            # Significance stars; a NaN p-value fails every comparison and
            # therefore gets no stars.
            if p_val < .001:
                stars = "***"
            elif p_val < .01:
                stars = "**"
            elif p_val < .05:
                stars = "*"
            else:
                stars = ""

            # APA style drops the leading zero for statistics bounded by 1.
            # Strip it only at the start (after an optional minus sign) so
            # no other "0." inside the number is ever touched.
            formatted_r = f"{r_val:.{decimals}f}"
            if formatted_r.startswith("0."):
                formatted_r = formatted_r[1:]
            elif formatted_r.startswith("-0."):
                formatted_r = "-" + formatted_r[2:]

            apa_df.iloc[i, j] = f"{formatted_r}{stars}"

    # Rename columns to 1., 2., 3., ... to follow APA table headers
    apa_df.columns = [f"{k + 1}." for k in range(n_cols)]
    # Add the variable names as the first column
    apa_df.insert(0, "Variable", r_df.index)
    # Add a running-number column; "\u2116" is the numero sign, written as an
    # escape so the header survives any file-encoding round trip.
    apa_df.insert(0, "\u2116", range(1, len(apa_df) + 1))

    return apa_df

# Load the data

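The cell below reads the data directly from a shared Google Drive link, so it needs an internet connection. If that download fails, get the file from the link in the assignment on Canvas — the filename you're looking for is `Dawtry Sutton and Sibley 2015 Study 1a.csv` — and pass its local path to `pd.read_csv` instead.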


In [13]:
# Load the Dawtry et al. (2015) Study 1a data.
# The two identifiers below are interpolated into the Google Drive URL
# built further down (presumably they identify the shared CSV — verify
# against the course materials if the link stops working).
file_id = '0Bz-rhZ21ShvOMGxnYUJfYmR5d2M'
resource_key = '0-jo7UtjyXsahMUKXVOQvb9g'

# Construct a direct download link and read the CSV straight from that URL.
# NOTE: this requires network access each time the cell is run.
direct_link = f'https://drive.google.com/uc?export=download&id={file_id}&resourcekey={resource_key}'
df = pd.read_csv(direct_link)

# Explore the dataframe: dimensions, column names, and the first five rows
print(f"Shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")
print(f"\nFirst few rows:\n{df.head()}")

Shape: (305, 37)

Column names:
['PS', 'PD_15', 'PD_30', 'PD_45', 'PD_60', 'PD_75', 'PD_90', 'PD_105', 'PD_120', 'PD_135', 'PD_150', 'PD_150plus', 'fairness', 'satisfaction', 'SC_15', 'SC_30', 'SC_45', 'SC_60', 'SC_75', 'SC_90', 'SC_105', 'SC_120', 'SC_135', 'SC_150', 'SC_150plus', 'redist1', 'redist2', 'redist3', 'redist4', 'Household_Income', 'Political_Preference', 'age', 'gender', 'Population_Inequality_Gini_Index', 'Population_Mean_Income', 'Social_Circle_Inequality_Gini_Index', 'Social_Circle_Mean_Income']

First few rows:
    PS  PD_15  PD_30  PD_45  PD_60  PD_75  PD_90  PD_105  PD_120  PD_135  \
0  233     27     48     21      0      0      0       0       0       0   
1  157     39      0      0      0      0      0       0       0       0   
2  275      0      0     50      0      0     50       0       0       0   
3  111      9     14     17     17     17      8       7       5       2   
4   52     68     32      0      0      0      0       0       0       0   

   PD_15