### Descriptive statistics and covariance in the PISA2018 dataset
Evan Edwards

In [1]:
# Imports
import pandas as pd
import numpy as np
from statsmodels.stats.descriptivestats import describe

In [2]:
# Name the variables of interest, then read the dataset from its CSV file
variables = [
    "Female", "ESCS", "METASUM", "PERFEED",
    "HOMEPOS", "ADAPTIVITY", "TEACHINT", "ICTRES",
    "JOYREAD", "ATTLNACT", "COMPETE", "WORKMAST",
    "GFOFAIL", "SWBP", "MASTGOAL", "BELONG",
    "SCREADCOMP", "SCREADDIFF", "PISADIFF", "PV1READ",
]
PISA2018 = pd.read_csv("pisa2018.BayesBook.csv")

In [3]:
# Data processing: recode the 'Female' column ('Female' -> 1, 'Male' -> 0) and
# keep only the columns listed in `variables`, in that order
PISA2018 = (
    PISA2018
    .assign(Female=PISA2018['Female'].replace({'Female': 1, 'Male': 0}))
    .loc[:, variables]
)

In [4]:
# Descriptive statistics for a single variable
def descriptiveStats(data):
    """Summarize `data` with statsmodels' `describe`, limited to a fixed set of statistics.

    Parameters
    ----------
    data
        The values to summarize; this notebook passes one DataFrame column
        (a Series) per call.

    Returns
    -------
    Whatever `describe` returns for the requested statistics.
    """
    requested = ["mean", "std_err", "std", "coef_var", "range", "max",
                 "min", "skew", "kurtosis", "mode", "median"]
    return describe(data, stats=requested)

In [5]:
# Print the descriptive statistics of every variable, one blank line between variables
for var in variables:
    summary = descriptiveStats(PISA2018[var])
    for keys, value in summary.items():
        print(f"{keys}: {value}")
    print()

Female: mean         0.491112
std_err      0.007188
std          0.499973
coef_var     1.018042
range        1.000000
max          1.000000
min          0.000000
skew         0.035557
kurtosis     1.001264
mode         0.000000
mode_freq    0.508888
median       0.000000
Name: Female, dtype: float64

ESCS: mean          0.065083
std_err       0.014569
std           1.013390
coef_var     15.570845
range         7.349800
max           3.254500
min          -4.095300
skew         -0.400275
kurtosis      2.931326
mode         -0.670000
mode_freq     0.000827
median        0.154100
Name: ESCS, dtype: float64

METASUM: mean         -0.083390
std_err       0.014515
std           1.009611
coef_var    -12.107128
range         3.080000
max           1.360000
min          -1.720000
skew         -0.366113
kurtosis      1.928865
mode          0.590000
mode_freq     0.188094
median        0.210000
Name: METASUM, dtype: float64

PERFEED: mean         0.294906
std_err      0.015046
std          1.0465

In [6]:
# Covariance (default) or Pearson correlation coefficient (pearson=True) of two variables
def covariance(data1, data2, pearson = False):
    """Return the covariance of two equally long sequences, or their correlation.

    Parameters
    ----------
    data1, data2
        The two variables to compare; they are stacked as the two rows of a
        single input to NumPy.
    pearson
        When truthy, `np.corrcoef` is used and the Pearson correlation
        coefficient is returned; otherwise `np.cov` is used and the
        covariance is returned.

    Returns
    -------
    The off-diagonal entry (row 0, column 1) of the 2x2 matrix produced by the
    chosen NumPy function with its default settings.
    """
    estimator = np.corrcoef if pearson else np.cov
    matrix = estimator([data1, data2])
    # Entry [0][1] relates the first variable to the second.
    return matrix[0][1]

In [7]:
# Demonstrate the covariance and the Pearson correlation coefficient for one pair of variables
# (the printed labels keep their original spelling so they match the saved outputs)
var1 = "JOYREAD"
var2 = "PV1READ"

print(f"Covariance between {var1} and {var2}:")
print(covariance(PISA2018[var1], PISA2018[var2]), end="\n\n")

print(f"Pearson correlation coefficent between {var1} and {var2}:")
print(covariance(PISA2018[var1], PISA2018[var2], pearson=True), end="\n\n")

# Correlation coefficients for a few more hand-picked pairs
testVariables = [["Female", "COMPETE"],
                 ["WORKMAST","ESCS"],
                 ["COMPETE", "PISADIFF"]]

for first, second in testVariables:
    print(f"Pearson correlation coefficent between {first} and {second}:")
    print(covariance(PISA2018[first], PISA2018[second], pearson=True))

Covariance between JOYREAD and PV1READ:
35.254649131730474

Pearson correlation coefficent between JOYREAD and PV1READ:
0.3045749962237244

Pearson correlation coefficent between Female and COMPETE:
-0.02036926835911788
Pearson correlation coefficent between WORKMAST and ESCS:
0.11650104458555514
Pearson correlation coefficent between COMPETE and PISADIFF:
-0.11547952023660021


In [8]:
# Find the pair of distinct variables with the largest Pearson correlation coefficient
largest = - np.inf
var1 = ""
var2 = ""

# Pearson correlation is symmetric, so each unordered pair of columns is
# evaluated once (earlier column first) and its coefficient is computed a
# single time instead of once for the comparison and again for the assignment.
columns = list(PISA2018.columns)
for position, first in enumerate(columns):
    for second in columns[position + 1:]:
        coefficient = covariance(PISA2018[first], PISA2018[second], pearson = True)
        # A NaN coefficient never compares greater, so such pairs are skipped.
        if coefficient > largest:
            largest = coefficient
            var1 = first
            var2 = second

print(f'Largest correlation is between {var1} and {var2}: {largest}')

Largest correlation is between HOMEPOS and ICTRES: 0.7882022147249816
