# Spark Pipeline: Predicting a student's GPA from their performance
DEMBELE Mathilda, MARSOT Elouan


## 1. Starting the Spark session

In [1]:
# Library initialisation and Spark session start-up.

# findspark locates the local Spark installation and makes pyspark importable.
# It must run BEFORE the first `import pyspark`; calling it afterwards (as was
# done previously) is too late to help the import succeed.
import findspark
findspark.init()

# Spark core
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession

# Spark ML (DataFrame-based API) -- some examples
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Spark MLlib (RDD-based API)
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

# Start the Spark session used by the rest of the notebook, or reuse the
# active one if it already exists.
spark = (
    SparkSession.builder
    .appName("GPAPredictor")
    .getOrCreate()
)
 


## 2. Data reading

In [2]:
"""
DATASET DESCRIPTION
- TRAIN : 1531 samples
- TEST : 384 samples
- each row in the dataset stands for one student
- each column is a performance-related feature of that student

14 columns (13 descriptive columns + the GPA target) :
- StudentID : int, a four-digit unique number
- Age : int
- Gender : binary, 0 for a man, 1 for a woman
- Ethnicity : categorical (Caucasian, Asian, African American, Other)
- ParentalEducation : categorical (High School, Bachelor, Some College, Higher)
- StudyTimeWeekly : float, number of hours per week
- Absences : int
- Tutoring : binary, 1 if yes, 0 otherwise
- ParentalSupport : categorical (Low, Moderate, High, Very High)
- Extracurricular : binary
- Sports : binary
- Music : binary
- Volunteering : binary

- GPA : float (from 0 to 4), the value to predict

"""

# Relative paths of the train / test CSV files.
fileNameTrain = "datasets/train.csv"
fileNameTest = "datasets/test.csv"

# Load both files: the first line is the header and Spark infers column types.
train_set = spark.read.options(header=True, inferSchema=True).csv(fileNameTrain)
test_set = spark.read.options(header=True, inferSchema=True).csv(fileNameTest)



In [3]:
# Check that the train and test sets share the same schema.
train_set.printSchema()
test_set.printSchema()

# Make the visual check explicit: same column names and types, in the same order.
assert train_set.dtypes == test_set.dtypes, "train and test sets have different schemas"

# Only the last expression of a cell is displayed, so the train sample has to be
# printed explicitly; as a bare expression its result was silently discarded.
print(train_set.take(1))
test_set.take(1)

root
 |-- StudentID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- ParentalEducation: string (nullable = true)
 |-- StudyTimeWeekly: double (nullable = true)
 |-- Absences: integer (nullable = true)
 |-- Tutoring: integer (nullable = true)
 |-- ParentalSupport: string (nullable = true)
 |-- Extracurricular: integer (nullable = true)
 |-- Sports: integer (nullable = true)
 |-- Music: integer (nullable = true)
 |-- Volunteering: integer (nullable = true)
 |-- GPA: double (nullable = true)

root
 |-- StudentID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Ethnicity: string (nullable = true)
 |-- ParentalEducation: string (nullable = true)
 |-- StudyTimeWeekly: double (nullable = true)
 |-- Absences: integer (nullable = true)
 |-- Tutoring: integer (nullable = true)
 |-- ParentalSupport: string (nullable = true)
 |-- Extracurricul

[Row(StudentID=2340, Age=16, Gender=1, Ethnicity='Other', ParentalEducation='Higher', StudyTimeWeekly=5.04404804318662, Absences=25, Tutoring=1, ParentalSupport='Moderate', Extracurricular=1, Sports=0, Music=0, Volunteering=0, GPA=0.886889415770466)]