### Descriptive statistics and covariance in the PISA2018 dataset
Evan Edwards

In [12]:
# Imports
import pandas as pd
import numpy as np
import scipy as sp

In [13]:
# Read the PISA 2018 extract from disk and list the columns this notebook analyses
PISA2018 = pd.read_csv("pisa2018.BayesBook.csv")
variables = [
    "Female", "ESCS", "METASUM", "PERFEED", "HOMEPOS",
    "ADAPTIVITY", "TEACHINT", "ICTRES", "JOYREAD", "ATTLNACT",
    "COMPETE", "WORKMAST", "GFOFAIL", "SWBP", "MASTGOAL",
    "BELONG", "SCREADCOMP", "SCREADDIFF", "PISADIFF", "PV1READ",
]

In [14]:
# Data processing: limit the dataset to the relevant variables, then convert the categorical
# 'Female' column to numeric (Female -> 1, Male -> 0).
# The selection is copied *before* the column is assigned so that the assignment never targets
# a slice of another frame; this keeps the cell safe to re-run (no SettingWithCopy warning on
# pandas versions that emit it). Values other than 'Female'/'Male' are left untouched by replace().
PISA2018 = PISA2018[variables].copy()
# infer_objects() makes sure the recoded column ends up with a numeric dtype even on pandas
# versions where replace() no longer downcasts object columns automatically.
PISA2018['Female'] = PISA2018['Female'].replace({'Female': 1, 'Male': 0}).infer_objects()

In [15]:
# Summarise a single variable: takes a series and returns a dictionary mapping each statistic's name to its value
def descriptiveStats(data):
    """Compute a fixed set of descriptive statistics for ``data``.

    Parameters
    ----------
    data : array-like
        The observations of one variable (the notebook passes a DataFrame column).

    Returns
    -------
    dict
        Keys, in insertion order: "mean", "median", "mode", "variance",
        "standard deviation", "minimum", "maximum", "range", "skewness",
        "kurtosis".

    Notes
    -----
    Every statistic is produced by the corresponding NumPy / SciPy function
    called with its default arguments (no ``ddof``, ``bias`` or ``fisher``
    overrides are passed).
    """
    return {
        "mean": np.mean(data),
        "median": np.median(data),
        # scipy's mode() returns a (mode, count) result; only the mode itself is kept
        "mode": sp.stats.mode(data)[0],
        "variance": np.var(data),
        "standard deviation": np.std(data),
        "minimum": np.min(data),
        "maximum": np.max(data),
        # peak-to-peak, i.e. maximum minus minimum
        "range": np.ptp(data),
        "skewness": sp.stats.skew(data),
        "kurtosis": sp.stats.kurtosis(data),
    }

In [16]:
# Report every statistic for each selected variable as "name: value" lines,
# with the variable's name as a heading and a blank line after each variable
for var in variables:
    print(var)
    summary = descriptiveStats(PISA2018[var])
    for label in summary:
        # print() stringifies both parts itself; sep supplies the ": " joiner
        print(label, summary[label], sep=": ")
    print()

Female
mean: 0.49111202976436547
median: 0.0
mode: 0
variance: 0.24992100398509048
standard deviation: 0.499920997743734
minimum: 0
maximum: 1
range: 1
skewness: 0.035557499187864244
kurtosis: -1.998735664251505

ESCS
mean: 0.06508251343530384
median: 0.15410000000000001
mode: -0.67
variance: 1.026746459152674
standard deviation: 1.013284984174084
minimum: -4.0953
maximum: 3.2545
range: 7.3498
skewness: -0.4002750110875765
kurtosis: -0.06867370336124434

METASUM
mean: -0.08338983050847457
median: 0.21
mode: 0.59
variance: 1.019104424716755
standard deviation: 1.009507020637675
minimum: -1.72
maximum: 1.36
range: 3.08
skewness: -0.3661128141116265
kurtosis: -1.0711348812425072

PERFEED
mean: 0.2949057048367094
median: 0.2682
mode: -0.3253
variance: 1.0949392655730772
standard deviation: 1.0463934563886939
minimum: -1.6391
maximum: 2.0165
range: 3.6556
skewness: -0.025115249324997795
kurtosis: -0.7225323596910731

HOMEPOS
mean: -0.03159551467548574
median: -0.045450000000000004
mode: 5.3

In [17]:
# Covariance of two variables, or their Pearson correlation coefficient when `pearson` is set to True
def covariance(data1, data2, pearson = False):
    """Return the covariance (default) or Pearson correlation of two variables.

    Parameters
    ----------
    data1, data2 : array-like
        Observations of the two variables; they are stacked as the two rows
        handed to NumPy.
    pearson : bool, optional
        When truthy, ``np.corrcoef`` is used instead of ``np.cov``.

    Returns
    -------
    The off-diagonal entry ``[0][1]`` of the resulting 2x2 matrix, i.e. the
    value relating ``data1`` to ``data2``.
    """
    estimator = np.corrcoef if pearson else np.cov
    matrix = estimator([data1, data2])
    return matrix[0][1]

In [20]:
# Demonstrate the covariance and the Pearson correlation coefficient for one pair of variables
# (the spelling of the printed labels is kept exactly as in the saved output)
var1 = "JOYREAD"
var2 = "PV1READ"

print(f"Covariance between {var1} and {var2}:")
print(covariance(PISA2018[var1], PISA2018[var2]), end="\n\n")

print(f"Pearson correlation coefficent between {var1} and {var2}:")
print(covariance(PISA2018[var1], PISA2018[var2], pearson = True), end="\n\n")

# A few more correlation coefficients for hand-picked pairs of variables
testVariables = [["Female", "COMPETE"],
                 ["WORKMAST","ESCS"],
                 ["COMPETE", "PISADIFF"]]

for first, second in testVariables:
    print(f"Pearson correlation coefficent between {first} and {second}:")
    print(covariance(PISA2018[first], PISA2018[second], pearson = True))

Covariance between JOYREAD and PV1READ:
35.25464913173048

Pearson correlation coefficent between JOYREAD and PV1READ:
0.30457499622372447

Pearson correlation coefficent between Female and COMPETE:
-0.020369268359117874
Pearson correlation coefficent between WORKMAST and ESCS:
0.11650104458555514
Pearson correlation coefficent between COMPETE and PISADIFF:
-0.11547952023660024


In [19]:
# Finding the largest Pearson correlation coefficient between two distinct variables
largest = - np.inf
var1 = ""
var2 = ""

# Iterating a DataFrame yields its column labels, so they can be used directly as keys.
# Correlation is symmetric, so each unordered pair is visited once (first column before
# second, in column order) and its coefficient is computed a single time; the strict "<"
# keeps the earliest pair when several pairs tie for the maximum.
columns = list(PISA2018)
for position, first in enumerate(columns):
    for second in columns[position + 1:]:
        coefficient = covariance(PISA2018[first], PISA2018[second], pearson = True)
        if largest < coefficient:
            largest = coefficient
            var1 = first
            var2 = second

print(f'Largest correlation is between {var1} and {var2}: {largest}')

Largest correlation is between HOMEPOS and ICTRES: 0.7882022147249816
