In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Obtain (or reuse) the Spark session that the rest of the notebook reads data through.
spark_session = (
    SparkSession.builder
    .appName('simple_statistics')
    .getOrCreate()
)

Loading the data

In [5]:
# Read the CSV using its first row as column names. No schema is supplied or
# inferred here, which is why the score columns are cast explicitly before
# any numeric aggregation further down.
df = spark_session.read.csv(
    'data/StudentsPerformance.csv',
    header=True,
)

In [6]:
# Show the column names picked up from the CSV header row.
df.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

Categorical variables

In [9]:
# Enumerate the distinct values present in the 'gender' column.
(
    df.select('gender')
      .distinct()
      .collect()
)

[Row(gender='female'), Row(gender='male')]

In [10]:
# Enumerate the distinct values present in the 'race/ethnicity' column.
(
    df.select('race/ethnicity')
      .distinct()
      .collect()
)

[Row(race/ethnicity='group B'),
 Row(race/ethnicity='group C'),
 Row(race/ethnicity='group D'),
 Row(race/ethnicity='group A'),
 Row(race/ethnicity='group E')]

In [11]:
# Enumerate the distinct values present in the 'parental level of education' column.
(
    df.select('parental level of education')
      .distinct()
      .collect()
)

[Row(parental level of education='some high school'),
 Row(parental level of education="associate's degree"),
 Row(parental level of education='high school'),
 Row(parental level of education="bachelor's degree"),
 Row(parental level of education="master's degree"),
 Row(parental level of education='some college')]

In [12]:
# Enumerate the distinct values present in the 'lunch' column.
(
    df.select('lunch')
      .distinct()
      .collect()
)

[Row(lunch='free/reduced'), Row(lunch='standard')]

In [13]:
# Enumerate the distinct values present in the 'test preparation course' column.
(
    df.select('test preparation course')
      .distinct()
      .collect()
)

[Row(test preparation course='completed'), Row(test preparation course='none')]

Numerical variables

In [20]:
# Summary statistics for the math score.
# The score column is cast to float before aggregating.
math = df.select(df['math score'].cast('float').alias('math'))

# Compute all four statistics in a single aggregation (one Spark job) instead
# of one select/collect round trip per statistic. Each aggregate carries an
# alias matching what it actually computes.
math_stats = math.agg(
    F.mean('math').alias('mean'),
    F.stddev('math').alias('std'),
    F.min('math').alias('min'),
    F.max('math').alias('max'),
).first()

# A Row unpacks positionally, in the order the aggregates were listed above.
mean, std, min_, max_ = math_stats
print('Math: mean = {}, std = {}, min = {}, max = {}'.format(mean, std, min_, max_))

Math: mean = 66.089, std = 15.163080096009454, min = 0.0, max = 100.0


In [21]:
# Summary statistics for the reading score.
# The score column is cast to float before aggregating.
reading = df.select(df['reading score'].cast('float').alias('reading'))

# Compute all four statistics in a single aggregation (one Spark job) instead
# of one select/collect round trip per statistic. Each aggregate carries an
# alias matching what it actually computes.
reading_stats = reading.agg(
    F.mean('reading').alias('mean'),
    F.stddev('reading').alias('std'),
    F.min('reading').alias('min'),
    F.max('reading').alias('max'),
).first()

# A Row unpacks positionally, in the order the aggregates were listed above.
mean, std, min_, max_ = reading_stats
print('Reading: mean = {}, std = {}, min = {}, max = {}'.format(mean, std, min_, max_))

Reading: mean = 69.169, std = 14.600191937252223, min = 17.0, max = 100.0


In [22]:
# Summary statistics for the writing score.
# The score column is cast to float before aggregating.
writing = df.select(df['writing score'].cast('float').alias('writing'))

# Compute all four statistics in a single aggregation (one Spark job) instead
# of one select/collect round trip per statistic. Each aggregate carries an
# alias matching what it actually computes.
writing_stats = writing.agg(
    F.mean('writing').alias('mean'),
    F.stddev('writing').alias('std'),
    F.min('writing').alias('min'),
    F.max('writing').alias('max'),
).first()

# A Row unpacks positionally, in the order the aggregates were listed above.
mean, std, min_, max_ = writing_stats
print('Writing: mean = {}, std = {}, min = {}, max = {}'.format(mean, std, min_, max_))

Writing: mean = 68.054, std = 15.19565701086966, min = 10.0, max = 100.0
