In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1716515774201_0008,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Obtain the Spark session for this analysis, creating it only if none exists.
# NOTE(review): the first cell's output reports "SparkSession available as
# 'spark'", so getOrCreate() presumably returns that existing session; memory
# settings given here may not take effect on an already-running application —
# confirm against the cluster configuration.
spark = (
    SparkSession.builder
    .appName("Online Education Analysis")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.pyspark.python", "python3")
    .config("spark.pyspark.virtualenv.enabled", "true")
    .config("spark.pyspark.virtualenv.type", "native")
    .config("spark.pyspark.virtualenv.bin.path", "/usr/bin/virtualenv")
    .getOrCreate()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def prepare_data(df):
    """Add the `score` and `label` columns needed by the ML steps.

    Args:
        df: Spark DataFrame containing an `is_correct` column.

    Returns:
        A new DataFrame with two extra columns:
        `score` - `is_correct` cast to double;
        `label` - 1 where `score >= 0.6`, otherwise 0 (rows whose score is
        null do not satisfy the condition and therefore get 0).
    """
    # Cast to double rather than int: an integer cast truncates fractional
    # values (e.g. 0.75 -> 0) before the 0.6 threshold is applied, so only
    # values >= 1 could ever be labelled 1. For boolean or 0/1 input the
    # resulting labels are the same as with the integer cast.
    scored = df.withColumn('score', col('is_correct').cast('double'))
    return scored.withColumn('label', when(col('score') >= 0.6, 1).otherwise(0))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
def train_models(df, seed=42):
    """Train logistic regression and random forest classifiers and score them.

    Args:
        df: Spark DataFrame with the numeric columns `total_sec_taken`,
            `total_attempt_cnt`, `used_hint_cnt` and a `label` column.
        seed: Random seed used for the train/test split and for the random
            forest, so repeated runs give the same models. Defaults to 42,
            the seed the split has always used.

    Returns:
        Tuple `(lr_model, rf_model, lr_accuracy, rf_accuracy)` where the
        accuracies are measured on the 30% hold-out split.
    """
    feature_cols = ["total_sec_taken", "total_attempt_cnt", "used_hint_cnt"]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    data = assembler.transform(df).select('features', 'label')

    # 70/30 split; the fixed seed keeps the partitioning repeatable.
    train_data, test_data = data.randomSplit([0.7, 0.3], seed=seed)

    lr = LogisticRegression(featuresCol='features', labelCol='label')
    # Pass the seed explicitly: without it the forest falls back to a default
    # seed that is not guaranteed to be the same in every Python session,
    # which makes the reported accuracy hard to reproduce.
    rf = RandomForestClassifier(
        featuresCol='features', labelCol='label', numTrees=10, seed=seed
    )

    lr_model = lr.fit(train_data)
    rf_model = rf.fit(train_data)

    # One evaluator is shared by both models so the metric is identical.
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='accuracy'
    )
    lr_accuracy = evaluator.evaluate(lr_model.transform(test_data))
    rf_accuracy = evaluator.evaluate(rf_model.transform(test_data))

    return lr_model, rf_model, lr_accuracy, rf_accuracy

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
def cross_validate_model(df, seed=42):
    """Tune a logistic regression model with 3-fold cross-validation.

    Args:
        df: Spark DataFrame with the numeric columns `total_sec_taken`,
            `total_attempt_cnt`, `used_hint_cnt` and a `label` column.
        seed: Random seed used for the train/test split and for the fold
            assignment, so repeated runs select the same model. Defaults to
            42, the seed the split has always used.

    Returns:
        Tuple `(cv_model, assembler)`: the fitted CrossValidatorModel and the
        VectorAssembler whose input columns line up with the model's
        coefficients.
    """
    feature_cols = ["total_sec_taken", "total_attempt_cnt", "used_hint_cnt"]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    data = assembler.transform(df).select('features', 'label')

    # Only the 70% training portion is used for tuning; the remaining 30% is
    # deliberately left untouched here.
    train_data, _ = data.randomSplit([0.7, 0.3], seed=seed)

    lr = LogisticRegression(featuresCol='features', labelCol='label')
    # 3 x 3 grid over regularisation strength and L1/L2 mixing.
    param_grid = (
        ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1, 1.0])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build()
    )

    evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='accuracy'
    )
    # Seed the validator explicitly so the fold assignment (and therefore the
    # averaged metrics and the chosen model) is repeatable across sessions.
    crossval = CrossValidator(
        estimator=lr,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=3,
        seed=seed,
    )

    cv_model = crossval.fit(train_data)

    return cv_model, assembler

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Read the processed student-performance data (Parquet) from S3.
student_performance_path = "s3://kz-final-project/student_performance/"
student_performance = spark.read.parquet(student_performance_path)

# Add the score/label columns, then fit both classifiers and measure their
# accuracy on the hold-out split.
processed_df = prepare_data(student_performance)
lr_model, rf_model, lr_accuracy, rf_accuracy = train_models(processed_df)

# Report the hold-out accuracy of each model.
print("Logistic Regression Model Accuracy:", lr_accuracy)
print("Random Forest Model Accuracy:", rf_accuracy)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Logistic Regression Model Accuracy: 0.9965807900136785
Random Forest Model Accuracy: 0.9965807900136785

In [13]:
# Run the cross-validated grid search; the assembler comes back alongside the
# model so its input column names can be matched to the coefficients.
cv_model, assembler = cross_validate_model(processed_df)

# Pair every input feature with the corresponding coefficient of the best
# logistic regression found by the search.
best_model = cv_model.bestModel
features = assembler.getInputCols()
coefficients = best_model.coefficients
coef_list = coefficients.toArray().tolist()
feature_importance = {name: weight for name, weight in zip(features, coef_list)}

# The largest entry of avgMetrics is reported as the best model's
# cross-validated accuracy.
print("Best Model Accuracy after Cross-validation:", max(cv_model.avgMetrics))
print("Feature Importances: ", feature_importance)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Best Model Accuracy after Cross-validation: 0.94260342694473
Feature Importances:  {'total_sec_taken': 0.0, 'total_attempt_cnt': -2.1948865183659922, 'used_hint_cnt': -2.709100276215105}