In [237]:
import pandas as pd
import numpy as np
import joblib

In [238]:
# Load the persisted model and the saved list of training column names.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artefacts that come from a trusted source.
model=joblib.load('xgb_model.pkl')
trained_columns=joblib.load('trained_columns.pkl')


In [239]:
# Raw attributes of the single student to score; every column holds a one-element list.
# NOTE(review): 'Data Science', '2024A', 'London' and 'quizzing' do not appear among the
# one-hot column names printed further down in this notebook (code_module_AAA..GGG,
# code_presentation_2013B..2014J, 'region_London Region', 'activity_type_quiz') —
# confirm how the encoder treats categories it has not seen.
new_student_data = {
    'id_student': [987654321],
    'sum-click': [55],
    'score': [15],
    'code_module': ['Data Science'],
    'code_presentation': ['2024A'],
    'date_registration': [-35],
    'gender': ['M'],
    'region': ['London'],
    'highest_education': ['High School'],
    'imd_band': ['10-20%'],
    'age_band': ['20-25'],
    'num_of_prev_attempts': [2],
    'studied_credits': [120],
    'disability': ['N'],
    'module_presentation_length': [30],
    'id_assessment': [654321],
    'assessment_type': ['CMA'],
    'date': [20],
    'weight': [15],
    'date_submitted': [18],
    'is_banked': [1],
    'activity_type': ['quizzing'],
}
new_student_df = pd.DataFrame(new_student_data)
student_id = new_student_data['id_student']

# Click total ('sum') and number of click rows ('count') for each
# student / module / presentation / activity-type combination.
activity_keys = ['id_student', 'code_module', 'code_presentation', 'activity_type']
activity_summary = new_student_df.groupby(activity_keys, as_index=False)['sum-click'].agg(['sum', 'count'])

# Attach the aggregates to every row, then discard the raw click column they replace.
new_student_df = new_student_df.merge(activity_summary, how='left', on=activity_keys)
new_student_df = new_student_df.drop(columns=['sum-click'])

def feature_engineering(new_df, reference_df):
    """Derive the engineered feature columns on ``new_df`` and return it.

    The columns are written onto ``new_df`` itself (the frame is modified in
    place), in a fixed order, and that same object is returned.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Rows to enrich. The code reads the columns ``id_student``, ``sum``,
        ``date``, ``date_registration``, ``score``, ``date_submitted``,
        ``module_presentation_length``, ``weight``, ``code_presentation``,
        ``id_assessment``, ``num_of_prev_attempts``, ``is_banked`` and
        ``code_module``.
    reference_df : pandas.DataFrame
        Frame with ``code_module`` and ``score`` columns; its mean score per
        module becomes ``difficulty_score``.

    Returns
    -------
    pandas.DataFrame
        ``new_df`` with the derived columns appended.
    """

    def _range_per_gap(values):
        # Spread between the largest and smallest value, divided by the number
        # of gaps between non-null observations (at least 1).
        gaps = max(values.count() - 1, 1)
        return (values.max() - values.min()) / gaps

    def _spread(values):
        # np.std of the group; a group with a single row gets 0.
        if len(values) > 1:
            return np.std(values)
        return 0

    def _first_to_last_slope(values):
        # Change from the first to the last row per step between non-null values.
        observed = values.count()
        if observed > 1:
            return (values.iloc[-1] - values.iloc[0]) / (observed - 1)
        return 0

    def _banked_share(flags):
        # Missing flags count as 0 in the numerator; the denominator counts non-null flags.
        return flags.fillna(0).sum() / flags.count()

    # --- engagement and timing -------------------------------------------
    new_df['engagement_dropoff'] = new_df.groupby('id_student')['sum'].transform(_range_per_gap)

    elapsed = new_df['date'] - new_df['date_registration']
    new_df['time_since_registration'] = elapsed
    new_df['performance_by_registration'] = new_df['score'] / (elapsed + 1)

    # First row of each student has no predecessor, so its diff is filled with 0.
    new_df['learning_pace'] = new_df.groupby('id_student')['date_submitted'].diff().fillna(0)
    new_df['module_engagement_rate'] = new_df['sum'] / new_df['module_presentation_length']
    new_df['engagement_consistency'] = (
        new_df.groupby('id_student')['sum'].transform(_spread).fillna(0)
    )

    # --- scores and assessments ------------------------------------------
    running_score = new_df.groupby('id_student')['score'].cumsum()
    new_df['cumulative_score'] = running_score.fillna(new_df['score'])

    # A missing weight is treated as 1.
    new_df['weighted_score'] = new_df['score'] * new_df['weight'].fillna(1)
    new_df['completed_assessments_ratio'] = 1 / new_df['module_presentation_length']

    presentation_weight = new_df.groupby('code_presentation')['weight'].transform('sum').fillna(1)
    new_df['assessment_weight_ratio'] = new_df['weight'] / presentation_weight

    new_df['assessment_engagement_score'] = new_df['sum'] * new_df['id_assessment']
    new_df['average_score'] = new_df.groupby('id_student')['score'].transform('mean')

    # --- student history -------------------------------------------------
    new_df['repeat_student'] = new_df['num_of_prev_attempts'].apply(
        lambda attempts: 1 if attempts > 0 else 0
    )
    new_df['improvement_rate'] = new_df.groupby('id_student')['score'].transform(_first_to_last_slope)
    new_df['banked_assessment_ratio'] = new_df.groupby('id_student')['is_banked'].transform(_banked_share)

    # --- course difficulty -----------------------------------------------
    # Modules that are absent from the reference data map to 0.
    mean_score_by_module = reference_df.groupby('code_module')['score'].mean()
    new_df['difficulty_score'] = new_df['code_module'].map(mean_score_by_module).fillna(0)

    return new_df

# Load the reference data; feature_engineering uses its mean score per code_module
# to fill the 'difficulty_score' column.
reference_df = pd.read_csv('reference.csv')

# Add the engineered columns (feature_engineering writes them onto the frame it is given).
new_student_df = feature_engineering(new_student_df, reference_df)

# List the resulting column names.
print(new_student_df.columns)


Index(['id_student', 'score', 'code_module', 'code_presentation',
       'date_registration', 'gender', 'region', 'highest_education',
       'imd_band', 'age_band', 'num_of_prev_attempts', 'studied_credits',
       'disability', 'module_presentation_length', 'id_assessment',
       'assessment_type', 'date', 'weight', 'date_submitted', 'is_banked',
       'activity_type', 'sum', 'count', 'engagement_dropoff',
       'time_since_registration', 'performance_by_registration',
       'learning_pace', 'module_engagement_rate', 'engagement_consistency',
       'cumulative_score', 'weighted_score', 'completed_assessments_ratio',
       'assessment_weight_ratio', 'assessment_engagement_score',
       'average_score', 'repeat_student', 'improvement_rate',
       'banked_assessment_ratio', 'difficulty_score'],
      dtype='object')


In [240]:
# Preview the first rows of the engineered frame.
new_student_df.head()

Unnamed: 0,id_student,score,code_module,code_presentation,date_registration,gender,region,highest_education,imd_band,age_band,...,cumulative_score,weighted_score,completed_assessments_ratio,assessment_weight_ratio,assessment_engagement_score,average_score,repeat_student,improvement_rate,banked_assessment_ratio,difficulty_score
0,987654321,15,Data Science,2024A,-35,M,London,High School,10-20%,20-25,...,15,225,0.033333,1.0,35987655,15.0,1,0,1.0,0.0


In [241]:
# new_student_df['sum']

0    55
Name: sum, dtype: int64

In [242]:
# Columns kept as model input, in the order they reach the scaling/encoding cell.
# NOTE(review): presumably this selection and order must mirror what was used at
# training time — confirm against the training code.
required_columns = [
    'id_student', 'code_module', 'code_presentation', 'gender', 'region', 'highest_education', 
    'imd_band', 'age_band', 'disability', 'activity_type', 'assessment_type', 
    'sum', 'count', 'score', 'weighted_score', 'module_engagement_rate', 'engagement_consistency', 
    'improvement_rate', 'engagement_dropoff', 'banked_assessment_ratio', 'cumulative_score', 
    'completed_assessments_ratio', 'time_since_registration', 'learning_pace', 'repeat_student', 
    'assessment_weight_ratio', 'assessment_engagement_score', 'average_score', 'difficulty_score', 
    'num_of_prev_attempts', 'studied_credits'
]

# Keep only those columns; anything else on the frame (e.g. 'performance_by_registration') is dropped.
new_student_df = new_student_df[required_columns]

# List the remaining column names.
print(new_student_df.columns)

Index(['id_student', 'code_module', 'code_presentation', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'disability',
       'activity_type', 'assessment_type', 'sum', 'count', 'score',
       'weighted_score', 'module_engagement_rate', 'engagement_consistency',
       'improvement_rate', 'engagement_dropoff', 'banked_assessment_ratio',
       'cumulative_score', 'completed_assessments_ratio',
       'time_since_registration', 'learning_pace', 'repeat_student',
       'assessment_weight_ratio', 'assessment_engagement_score',
       'average_score', 'difficulty_score', 'num_of_prev_attempts',
       'studied_credits'],
      dtype='object')


In [243]:
# Inspect the module code on the frame that has not been scaled or encoded yet.
new_student_df['code_module']

0    Data Science
Name: code_module, dtype: object

In [244]:
# Load the saved preprocessing artefacts.
# NOTE(review): label_encoder is loaded but not used in this cell.
scaler = joblib.load('scaler.pkl')
encoder = joblib.load('encoder.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Split the columns by dtype: int64/float64 go to the scaler, object columns to the encoder.
numeric_features = new_student_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = new_student_df.select_dtypes(include=['object']).columns

# Transform each group with its artefact and place the two results side by side.
scaled_features = scaler.transform(new_student_df[numeric_features])
encoded_features = encoder.transform(new_student_df[categorical_features])
final_features = np.hstack((scaled_features, encoded_features))

# Rebuild a labelled frame: numeric names first, then the encoder's output names.
new_student_scaled = pd.DataFrame(
    final_features,
    index=new_student_df.index,
    columns=[*numeric_features, *encoder.get_feature_names_out(categorical_features)],
)
print(new_student_scaled.columns)


Index(['id_student', 'sum', 'count', 'score', 'weighted_score',
       'module_engagement_rate', 'engagement_consistency', 'improvement_rate',
       'engagement_dropoff', 'banked_assessment_ratio', 'cumulative_score',
       'completed_assessments_ratio', 'time_since_registration',
       'learning_pace', 'repeat_student', 'assessment_weight_ratio',
       'assessment_engagement_score', 'average_score', 'difficulty_score',
       'num_of_prev_attempts', 'studied_credits', 'code_module_AAA',
       'code_module_BBB', 'code_module_CCC', 'code_module_DDD',
       'code_module_EEE', 'code_module_FFF', 'code_module_GGG',
       'code_presentation_2013B', 'code_presentation_2013J',
       'code_presentation_2014B', 'code_presentation_2014J', 'gender_F',
       'gender_M', 'region_East Anglian Region', 'region_East Midlands Region',
       'region_Ireland', 'region_London Region', 'region_North Region',
       'region_North Western Region', 'region_Scotland',
       'region_South East Region

In [245]:
# Compare the saved training column names with the columns of the prepared frame.
# NOTE(review): the recorded output reports only 'completion_status' as missing —
# presumably the target column; confirm.
missing_columns = set(trained_columns).difference(new_student_scaled.columns)
extra_columns = set(new_student_scaled.columns).difference(trained_columns)
print(f"Colonnes manquantes : {missing_columns}")
print(f"Colonnes supplémentaires : {extra_columns}")


Colonnes manquantes : {'completion_status'}
Colonnes supplémentaires : set()


In [253]:
# print("Statistiques des nouvelles données :\n", new_student_scaled.describe())


Statistiques des nouvelles données :
             sum     count    score  weighted_score  module_engagement_rate  \
count  1.000000  1.000000  1.00000        1.000000                 1.00000   
mean  -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   
std         NaN       NaN      NaN             NaN                     NaN   
min   -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   
25%   -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   
50%   -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   
75%   -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   
max   -0.669902 -0.800652 -3.70351       -1.062088                 1.98802   

       engagement_consistency  improvement_rate  engagement_dropoff  \
count                1.000000          1.000000            1.000000   
mean                -0.810924          0.051692           -0.808938   
std                       NaN               Na

In [247]:
# Remove the identifier column before the frame is passed to the model.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps this cell safe
# to re-run: a second execution no longer raises KeyError once 'id_student' is gone.
new_student_scaled = new_student_scaled.drop(columns='id_student', errors='ignore')

In [248]:
# Count missing values per column of the prepared frame.
print(new_student_scaled.isnull().sum())


sum                       0
count                     0
score                     0
weighted_score            0
module_engagement_rate    0
                         ..
activity_type_quiz        0
activity_type_resource    0
assessment_type_CMA       0
assessment_type_Exam      0
assessment_type_TMA       0
Length: 75, dtype: int64


In [249]:
# Score the prepared row: predicted label, plus column 1 of predict_proba,
# which is reported below as the dropout probability.
prediction = model.predict(new_student_scaled)
probability = model.predict_proba(new_student_scaled)[:, 1]

# A predicted label of 1 is reported as "at risk".
if prediction[0] == 1:
    risk_message = 'Student At Risk of abandonning the course'
else:
    risk_message = 'Not At Risk'

print(f"Prediction: {risk_message}")
print(f"Dropout Probability: {probability[0]:.2f}")


Prediction: Student At Risk of abandonning the course
Dropout Probability: 0.51


In [250]:
# Inspect the unscaled cumulative score that the recommendation rules compare against.
new_student_df['cumulative_score']

0    15
Name: cumulative_score, dtype: int64

In [251]:
def categorize(row):
    """Return the risk segment for one student row.

    The checks run in priority order and the first match wins:
    ``module_engagement_rate < 0.5`` -> 'Low Engagement',
    ``cumulative_score < 50`` -> 'Low Performance',
    ``num_of_prev_attempts > 1`` -> 'High Difficulty', otherwise 'General'.

    The row is only read. The previous version stored scratch flags on the row
    it received from ``DataFrame.apply``; pandas documents callbacks that mutate
    the passed object as unsupported.
    """
    if row['module_engagement_rate'] < 0.5:
        return 'Low Engagement'
    if row['cumulative_score'] < 50:
        return 'Low Performance'
    if row['num_of_prev_attempts'] > 1:
        return 'High Difficulty'
    return 'General'


def recommend(segment):
    """Return the generic advice text for a segment label produced by ``categorize``."""
    if segment == 'Low Engagement':
        return (
            "Your current engagement level suggests that active participation could significantly improve your learning experience. "
            "Join study groups and engage in online forums."
        )
    elif segment == 'Low Performance':
        return (
            "Your recent performance indicates that you may benefit from focused support. "
            "We recommend reviewing foundational concepts through remedial classes."
        )
    elif segment == 'High Difficulty':
        return (
            "The challenges you're facing suggest that a tailored approach might be beneficial. "
            "Consider scheduling personalized tutoring sessions."
        )
    # Any other label (including 'General') gets the default message.
    return (
        "Great job maintaining steady progress! Keep utilizing available resources like discussion forums, peer collaboration, "
        "and instructor feedback to sustain your momentum."
    )


def personalized_recommendation(row):
    """Return a follow-up tip for one student row; the first matching rule wins."""
    if row['module_engagement_rate'] < 0.5 and row['engagement_consistency'] < 0.5:
        return "Increase engagement with interactive content and schedule reminders to study."
    elif row['cumulative_score'] < 50 and row['num_of_prev_attempts'] > 1:
        return "Focus on targeted remedial topics and request tutoring support."
    elif row['difficulty_score'] > 0.8 and row['learning_pace'] < 0.4:
        return "Reduce workload and attend time management workshops."
    elif row['disability'] == 'Y':
        return "Utilize accessibility tools and request extended deadlines if needed."
    return "Maintain steady progress and seek mentorship for continued success."


# Score the prepared row: predicted label, plus column 1 of predict_proba,
# which is reported below as the dropout probability.
prediction = model.predict(new_student_scaled)
probability = model.predict_proba(new_student_scaled)[:, 1]

print(f"Prediction: {'Student At Risk of Abandoning the Course' if prediction[0] == 1 else 'This student is not at risk of abandoning the course.'}")
print(f"Dropout Probability: {probability[0]:.2f}")

if prediction[0] == 1:
    # The rules are applied to new_student_df, i.e. the frame that was not
    # scaled or encoded, so the thresholds compare against the original values.
    new_student_df['segment'] = new_student_df.apply(categorize, axis=1)
    new_student_df['recommendations'] = new_student_df['segment'].apply(recommend)
    new_student_df['personalized_recommendation'] = new_student_df.apply(personalized_recommendation, axis=1)
    new_student_df['final_recommendation'] = (
        new_student_df['recommendations'] + " Additionally: " + new_student_df['personalized_recommendation']
    )

    # Report the first (here: only) row.
    print(f"Student ID: {new_student_df['id_student'].iloc[0]}")
    print(f"Segment: {new_student_df['segment'].iloc[0]}")
    print(f"Final Recommendation: {new_student_df['final_recommendation'].iloc[0]}")

else:
    print(" ")


Prediction: Student At Risk of Abandoning the Course
Dropout Probability: 0.51
Student ID: 987654321
Segment: Low Performance
Final Recommendation: Your recent performance indicates that you may benefit from focused support. We recommend reviewing foundational concepts through remedial classes. Additionally: Focus on targeted remedial topics and request tutoring support.


In [252]:
# import shap
# explainer = shap.Explainer(model)
# shap_values = explainer(new_student_scaled)
# shap.summary_plot(shap_values, new_student_scaled)
