# Spark Pipeline : Predicting a student's GPA from his performances
DEMBELE Mathilda, MARSOT Elouan


## 1. Starting the Spark session

In [1]:
# Initialisation des librairies pyspark

# Initialisation de Spark
import pyspark

from pyspark.sql import SparkSession

# import ML pyspark modules
# some examples
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

from pyspark import SparkConf, SparkContext, SQLContext

import findspark
findspark.init()

spark = SparkSession \
    .builder \
    .appName("GPAPredictor") \
    .getOrCreate()
 


## 2. Data reading

In [2]:
"""
DATASET DESCRIPTION
- TRAIN : 1531 samples 
- TEST : 384 samples 
- each line in the dataset stands for some student
- each column is a feature of performance for the student

14 features :
- StudentID : int, a four-figures unique number 
- Age : int 
- Gender : binary, 0 for a man, 1 for a woman 
- Ethnicity : categorial (Caucasian, Asian, African American, Other)
- ParentalEducation : categorial (High School, Bachelor, Some College, Higher)
- StudyTimeWeekly : float, nb of hours per week 
- Absences : int 
- Tutoring : binary, 1 if yes, 0 otherwise 
- ParentalSupport : categorial (Low, Moderate, High, Very High)
- Extracurricular : binary
- Sports : binary 
- Music : binary 
- Volunteering : binary 

- GPA : float (from 0 to 4)

"""

fileNameTrain = "datasets/train.csv"
fileNameTest = "datasets/test.csv"

# Reading the datasets
train_set = spark.read.csv(fileNameTrain, header=True, inferSchema=True)
test_set = spark.read.csv(fileNameTest, header=True, inferSchema=True)



In [3]:
# CHECKING THAT THEY HAVE THE SAME SCHEMA
train_set.printSchema()
test_set.printSchema()
train_set.take(1)
test_set.take(1)

root
 |-- StudentID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- ParentalEducation: string (nullable = true)
 |-- StudyTimeWeekly: double (nullable = true)
 |-- Absences: integer (nullable = true)
 |-- Tutoring: integer (nullable = true)
 |-- ParentalSupport: string (nullable = true)
 |-- Extracurricular: integer (nullable = true)
 |-- Sports: integer (nullable = true)
 |-- Music: integer (nullable = true)
 |-- Volunteering: integer (nullable = true)
 |-- GPA: double (nullable = true)

root
 |-- StudentID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- ParentalEducation: string (nullable = true)
 |-- StudyTimeWeekly: double (nullable = true)
 |-- Absences: integer (nullable = true)
 |-- Tutoring: integer (nullable = true)
 |-- ParentalSupport: string (nullable = true)
 |-- Extracurricul

[Row(StudentID=2340, Age=16, Gender=1, Ethnicity='Other', ParentalEducation='Higher', StudyTimeWeekly=5.04404804318662, Absences=25, Tutoring=1, ParentalSupport='Moderate', Extracurricular=1, Sports=0, Music=0, Volunteering=0, GPA=0.886889415770466)]

## 3. Data cleaning

In [4]:
# Missing values
from pyspark.sql.functions import col

train_set.select([(col(c).isNull().cast("int")).alias(c) for c in train_set.columns]).groupBy().sum().show()

+--------------+--------+-----------+--------------+----------------------+--------------------+-------------+-------------+--------------------+--------------------+-----------+----------+-----------------+--------+
|sum(StudentID)|sum(Age)|sum(Gender)|sum(Ethnicity)|sum(ParentalEducation)|sum(StudyTimeWeekly)|sum(Absences)|sum(Tutoring)|sum(ParentalSupport)|sum(Extracurricular)|sum(Sports)|sum(Music)|sum(Volunteering)|sum(GPA)|
+--------------+--------+-----------+--------------+----------------------+--------------------+-------------+-------------+--------------------+--------------------+-----------+----------+-----------------+--------+
|             0|       0|          0|             0|                   142|                   0|            0|            0|                 132|                   0|          0|         0|                0|       0|
+--------------+--------+-----------+--------------+----------------------+--------------------+-------------+-------------+--------

only ParentalEducation and ParentalSupport have some missing values : this should be handled in our future pipelines

In [5]:
categories = train_set.select("ParentalEducation").distinct().collect()
for row in categories:
    print(row["ParentalEducation"])

High School
Higher
Bachelor
Some College
None


In [6]:
categories = train_set.select("ParentalSupport").distinct().collect()
for row in categories:
    print(row["ParentalSupport"])

High
Very High
Low
Moderate
None


Because there is an order among those categories, let's map them then we will try to imput them.

In [7]:
ordinal_mapping_education = {"High School": 1, "Some College": 2, "Bachelor": 3, "Higher": 4}
ordinal_mapping_support = {"Low": 0, "Moderate": 1, "High": 2, "Very High": 4}

In [8]:
from pyspark.ml import Transformer
from pyspark.sql.functions import when, col

class OrdinalEncoder(Transformer):
    def __init__(self, mappings, inputCols, outputCols):
        super(OrdinalEncoder, self).__init__()
        self.mappings = mappings  
        self.inputCols = inputCols
        self.outputCols = outputCols

    def _transform(self, df):
        for inputCol, outputCol, mapping in zip(self.inputCols, self.outputCols, self.mappings):
            expr = None
            for category, value in mapping.items():
                if expr is None:
                    expr = when(col(inputCol) == category, value)
                else:
                    expr = expr.when(col(inputCol) == category, value)
            df = df.withColumn(outputCol, expr.otherwise(None)) 
        return df

In [9]:
from pyspark.ml.feature import Imputer

encoder = OrdinalEncoder(
    mappings=[ordinal_mapping_education, ordinal_mapping_support],
    inputCols=["ParentalEducation", "ParentalSupport"],
    outputCols=["ParentalEducation", "ParentalSupport"]
)

imputer = Imputer(
    inputCols=["ParentalEducation", "ParentalSupport"],
    outputCols=["ParentalEducation", "ParentalSupport"]
).setStrategy("mode")

pipeline = Pipeline(stages=[encoder, imputer])

In [10]:
pipeline_model = pipeline.fit(train_set)
df_transformed = pipeline_model.transform(train_set)

df_transformed.take(5)

[Row(StudentID=3321, Age=17, Gender=1, Ethnicity='Caucasian', ParentalEducation=1, StudyTimeWeekly=9.90635293867818, Absences=25, Tutoring=0, ParentalSupport=2, Extracurricular=1, Sports=1, Music=0, Volunteering=0, GPA=1.08238995034159),
 Row(StudentID=1160, Age=18, Gender=0, Ethnicity='Caucasian', ParentalEducation=1, StudyTimeWeekly=4.68115550034998, Absences=20, Tutoring=0, ParentalSupport=2, Extracurricular=0, Sports=0, Music=0, Volunteering=1, GPA=1.18954876961879),
 Row(StudentID=2644, Age=16, Gender=0, Ethnicity='Other', ParentalEducation=2, StudyTimeWeekly=6.84531185579492, Absences=13, Tutoring=0, ParentalSupport=1, Extracurricular=0, Sports=0, Music=1, Volunteering=0, GPA=1.85267174103724),
 Row(StudentID=2321, Age=18, Gender=0, Ethnicity='Asian', ParentalEducation=1, StudyTimeWeekly=19.8857597152212, Absences=2, Tutoring=0, ParentalSupport=2, Extracurricular=1, Sports=0, Music=0, Volunteering=0, GPA=3.51723712873573),
 Row(StudentID=2419, Age=15, Gender=1, Ethnicity='African

Transform the other features

In [11]:
numerical_features = ["Age", "StudyTimeWeekly", "Absences"]
categorical_features = ["Ethnicity"]

# the other columns are already preprocessed

In [12]:
from pyspark.ml.feature import OneHotEncoder, StandardScaler, VectorAssembler, StringIndexer

# one hot 
indexer = StringIndexer(inputCol="Ethnicity", outputCol="Ethnicity_indexed")
onehot_encoder= OneHotEncoder(inputCol="Ethnicity_indexed", outputCol="Ethnicity_encoded")
# scaling
numeric_assembler = VectorAssembler(inputCols=numerical_features, outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="numeric_features")

In [13]:
pipeline = Pipeline(stages=[indexer, onehot_encoder])
pipeline_model = pipeline.fit(train_set)
df_transformed = pipeline_model.transform(train_set)

df_transformed.take(5)

[Row(StudentID=3321, Age=17, Gender=1, Ethnicity='Caucasian', ParentalEducation='High School', StudyTimeWeekly=9.90635293867818, Absences=25, Tutoring=0, ParentalSupport='High', Extracurricular=1, Sports=1, Music=0, Volunteering=0, GPA=1.08238995034159, Ethnicity_indexed=0.0, Ethnicity_encoded=SparseVector(3, {0: 1.0})),
 Row(StudentID=1160, Age=18, Gender=0, Ethnicity='Caucasian', ParentalEducation='High School', StudyTimeWeekly=4.68115550034998, Absences=20, Tutoring=0, ParentalSupport='High', Extracurricular=0, Sports=0, Music=0, Volunteering=1, GPA=1.18954876961879, Ethnicity_indexed=0.0, Ethnicity_encoded=SparseVector(3, {0: 1.0})),
 Row(StudentID=2644, Age=16, Gender=0, Ethnicity='Other', ParentalEducation='Some College', StudyTimeWeekly=6.84531185579492, Absences=13, Tutoring=0, ParentalSupport=None, Extracurricular=0, Sports=0, Music=1, Volunteering=0, GPA=1.85267174103724, Ethnicity_indexed=3.0, Ethnicity_encoded=SparseVector(3, {})),
 Row(StudentID=2321, Age=18, Gender=0, Eth

faire une selection des colonnes dans les tp 