In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz
!tar -xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"

import findspark
findspark.init()

Collecting findspark
  Downloading findspark-1.4.2-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [114]:
# Raw input records, one per (student, subject) pair, in the textual form
# "name, subject, score". They are parsed into tuples further down.
marks = [
    "김하나, English, 100",
    "김하나, Math, 80",
    "임하나, English, 70",
    "임하나, Math, 100",
    "김갑돌, English, 82.3",
    "김갑돌, Math, 98.5",
]

In [115]:
import pyspark

# Default Spark configuration; no custom settings are applied here.
myConf = pyspark.SparkConf()

# Create (or reuse) a single-node local Spark session named "myApp".
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("myApp")
    .config(conf=myConf)
    .getOrCreate()
)

In [116]:
# Turn the in-memory list of raw record strings into an RDD.
_marksRdd = spark.sparkContext.parallelize(marks)

In [117]:
# Parse each "name, subject, score" record into a (name, subject, score) tuple.
# The fields are separated by ", ", so every piece is stripped after the
# split; otherwise the subject keeps its leading space (" English") and an
# equality filter or group-by on subject would silently miss.
marksRdd = (
    _marksRdd
    .map(lambda line: [field.strip() for field in line.split(',')])
    .map(lambda fields: (fields[0], fields[1], float(fields[2])))
)

# 문제 1-1 시작/답

In [124]:
# Build a DataFrame directly from the RDD of tuples. No schema is supplied,
# so the columns get the default names _1, _2, _3 (see the output below).
marksDf = spark.createDataFrame(marksRdd)
marksDf.show()
marksDf.printSchema()

+------+--------+-----+
|    _1|      _2|   _3|
+------+--------+-----+
|김하나| English|100.0|
|김하나|    Math| 80.0|
|임하나| English| 70.0|
|임하나|    Math|100.0|
|김갑돌| English| 82.3|
|김갑돌|    Math| 98.5|
+------+--------+-----+

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: double (nullable = true)



In [125]:
# Replace the auto-generated column names with descriptive ones.
marksDf = (
    marksDf
    .withColumnRenamed("_1", "name")
    .withColumnRenamed("_2", "subject")
    .withColumnRenamed("_3", "score")
)

# Convert the score column from the inferred double to float.
marksDf = marksDf.withColumn("score", marksDf["score"].cast("float"))

marksDf.show()
marksDf.printSchema()

+------+--------+-----+
|  name| subject|score|
+------+--------+-----+
|김하나| English|100.0|
|김하나|    Math| 80.0|
|임하나| English| 70.0|
|임하나|    Math|100.0|
|김갑돌| English| 82.3|
|김갑돌|    Math| 98.5|
+------+--------+-----+

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- score: float (nullable = true)



# 문제 1-2 시작

In [147]:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np

# Collect all scores to the driver to compute the overall mean and the
# sample standard deviation (ddof=1 divides by n - 1).
X = marksRdd.map(lambda x: float(x[2])).collect()
m = float(np.mean(X))
std = float(np.std(X, ddof=1))
print(std)


def _zscore(x):
    """Return the z-score (x - m) / std of a single score.

    Args:
        x: A score value, or None when the row has no score.

    Returns:
        The standardized score as a float, or None for a null input so the
        UDF yields null instead of raising TypeError on nullable columns.
    """
    if x is None:
        return None
    return (x - m) / std


zscoreUdf = udf(_zscore, FloatType())
marksDf = marksDf.withColumn('zscore', zscoreUdf(marksDf['score']))

12.786190467323198


# 문제 1-2 답

In [148]:
# Display the DataFrame, which now includes the zscore column.
marksDf.show()

+------+--------+-----+-----------+
|  name| subject|score|     zscore|
+------+--------+-----+-----------+
|김하나| English|100.0| 0.90201485|
|김하나|    Math| 80.0|-0.66217273|
|임하나| English| 70.0| -1.4442666|
|임하나|    Math|100.0| 0.90201485|
|김갑돌| English| 82.3|-0.48229092|
|김갑돌|    Math| 98.5| 0.78470075|
+------+--------+-----+-----------+



# 문제 1-3 시작

In [162]:
from scipy.stats import norm


def calCdf(x):
    """Return the normal CDF of a score under N(m, std).

    Uses the module-level mean ``m`` and standard deviation ``std`` as the
    location and scale of the normal distribution.

    Args:
        x: A score value, or None when the row has no score.

    Returns:
        P(X <= x) as a plain Python float (the numpy scalar is converted so
        Spark can serialize it as FloatType), or None for a null input.
    """
    if x is None:
        return None
    return float(norm.cdf(x, loc=m, scale=std))


# Register the function directly; wrapping it in a lambda adds nothing.
cdfUdf = udf(calCdf, FloatType())

# 문제 1-3 답

In [163]:
# Add a "cdf" column by applying the CDF UDF to every score.
marksDf = marksDf.withColumn(
    "cdf",
    cdfUdf(marksDf["score"]),
)

In [164]:
# Display the final DataFrame with both the zscore and cdf columns.
marksDf.show()

+------+--------+-----+-----------+-----------+
|  name| subject|score|     zscore|        cdf|
+------+--------+-----+-----------+-----------+
|김하나| English|100.0| 0.90201485|  0.8164755|
|김하나|    Math| 80.0|-0.66217273| 0.25393027|
|임하나| English| 70.0| -1.4442666|0.074332014|
|임하나|    Math|100.0| 0.90201485|  0.8164755|
|김갑돌| English| 82.3|-0.48229092| 0.31479964|
|김갑돌|    Math| 98.5| 0.78470075|  0.7836855|
+------+--------+-----+-----------+-----------+

