# Step 1: Import the required libraries


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Step 2: Load the data

In [None]:
from google.colab import files
import io
import pandas as pd

# File name this notebook expects. Colab can store a repeat upload under a
# suffixed name (for example "digital_learning_analytics_100k (1).csv"), in
# which case the returned dict is keyed by that new name instead.
EXPECTED_FILENAME = 'digital_learning_analytics_100k.csv'

# This will prompt you to select the file from your local computer
uploaded = files.upload()

# Resolve which uploaded entry to read: the expected name when present,
# otherwise the only file that was uploaded. Anything else is ambiguous
# (nothing uploaded, or several unrelated files), so fail with a message that
# lists what was actually received instead of a bare KeyError.
if EXPECTED_FILENAME in uploaded:
    uploaded_name = EXPECTED_FILENAME
elif len(uploaded) == 1:
    uploaded_name = next(iter(uploaded))
    print(f"Using uploaded file {uploaded_name!r}")
else:
    raise KeyError(
        f"Expected an upload named {EXPECTED_FILENAME!r} but received: {sorted(uploaded)}"
    )

# Load the data into a DataFrame from the in-memory bytes of the upload
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

print("Dataset loaded successfully!")

Saving digital_learning_analytics_100k.csv to digital_learning_analytics_100k.csv
Dataset loaded successfully!


# Step 3: Display the dataset structure


In [None]:
# Banner first, then a rich rendering of the first five rows of the frame.
print("--- Step 3: Preview ---")
display(df.head(n=5))


--- Step 3: Preview ---


Unnamed: 0,learner_id,age,gender,education_level,country,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,...,time_to_mastery_hours,mastery_score,learning_efficiency_score,enrollment_date,last_activity_date,total_learning_hours,engagement_consistency,log_total_learning_hours,skill_improvement,engagement_index
0,LRN00000001,-0.503406,2,1,Australia,4,0,1.31413,5,85.0,...,21.0,44.2,21.05,2022-07-03,2022-07-16,34.2,0.527,3.561046,22.2,109.285714
1,LRN00000002,-0.113831,1,1,United States,2,7,0.592327,1,54.2,...,22.4,13.0,5.8,2024-09-24,2024-10-17,34.9,0.441,3.580737,9.9,61.942857
2,LRN00000003,-1.126727,1,3,India,0,6,1.128098,0,53.4,...,70.5,43.2,6.13,2022-03-25,2022-04-04,37.8,0.494,3.65842,15.4,15.257143
3,LRN00000004,-0.581321,0,0,South Africa,3,5,0.718828,5,60.3,...,39.1,44.6,11.41,2025-01-13,2025-02-04,41.7,0.493,3.754199,22.6,60.3
4,LRN00000005,1.210725,1,4,Germany,5,5,-1.119164,4,31.8,...,52.6,35.1,6.67,2024-09-27,2024-11-01,36.9,0.305,3.634951,12.8,18.171429


# Step 4: Check rows and columns


In [None]:
# Report the frame's dimensions: number of index entries (rows) and of column labels.
print(f"\n--- Step 4: Shape ---\nRows: {len(df.index)}, Columns: {len(df.columns)}")



--- Step 4: Shape ---
Rows: 100000, Columns: 46


# Step 5: Data types


In [None]:
# Print a banner, then the per-column summary (non-null counts and dtypes).
# verbose=None is the pandas default, spelled out here for clarity.
print("\n--- Step 5: Info ---")
df.info(verbose=None)


--- Step 5: Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 46 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   learner_id                        100000 non-null  object 
 1   age                               100000 non-null  float64
 2   gender                            100000 non-null  int64  
 3   education_level                   100000 non-null  int64  
 4   country                           100000 non-null  object 
 5   employment_status                 100000 non-null  int64  
 6   prior_online_courses              100000 non-null  int64  
 7   digital_literacy_score            100000 non-null  float64
 8   app_category                      100000 non-null  int64  
 9   daily_app_minutes                 100000 non-null  float64
 10  session_count_weekly              100000 non-null  int64  
 11  app_completion_rate            

# Step 6: Missing values


In [None]:
# Count the missing entries in every column and print them under a banner.
print(
    "\n--- Step 6: Missing Values ---\n",
    df.isna().sum(),
)



--- Step 6: Missing Values ---
 learner_id                          0
age                                 0
gender                              0
education_level                     0
country                             0
employment_status                   0
prior_online_courses                0
digital_literacy_score              0
app_category                        0
daily_app_minutes                   0
session_count_weekly                0
app_completion_rate                 0
in_app_quiz_score                   0
gamification_engagement             0
skill_pre_score                     0
skill_post_score                    0
essay_topic_category                0
essay_word_count                    0
essay_grammar_errors                0
essay_vocabulary_richness           0
essay_coherence_score               0
human_grader_score                  0
automated_score                     0
mooc_platform                       0
course_category                     0
course_duration_w

# Step 7: Handle missing values (Median Imputation)


In [None]:
# Columns whose missing entries are replaced by that column's own median.
num_cols_with_nan = [
    'gamification_engagement',
    'essay_vocabulary_richness',
    'essay_coherence_score',
    'forum_posts',
    'peer_review_given',
    'content_recommendations_followed',
    'learning_efficiency_score',
]

# Frame-level form of the per-column fill: `median()` yields one value per
# listed column, and `fillna` matches those values to columns by label.
df[num_cols_with_nan] = df[num_cols_with_nan].fillna(df[num_cols_with_nan].median())

# Step 8: Remove duplicates


In [None]:
# Drop rows that repeat an earlier row across all columns, keeping the first
# occurrence (the pandas defaults, written out explicitly).
df = df.drop_duplicates(subset=None, keep='first')


# Step 10: Feature Transformation (Log transform)

In [None]:
# log(1 + x) of the total learning hours, stored as a new column.
df['log_total_learning_hours'] = df['total_learning_hours'].pipe(np.log1p)

# Step 14: Derived Features

In [None]:
# Gain between the post- and pre-assessment skill scores.
df['skill_improvement'] = df['skill_post_score'].sub(df['skill_pre_score'])

# Daily app minutes times weekly session count, divided by 7.
df['engagement_index'] = df['daily_app_minutes'].mul(df['session_count_weekly']).div(7)


# Step 11: Encode Categorical Variables

In [None]:
# One fitted encoder per column, keyed by column name. Every call to
# fit_transform refits the encoder, so a single shared LabelEncoder would only
# remember the classes of the last column and the other columns' integer codes
# could not be mapped back to labels. With this dict,
# label_encoders[col].inverse_transform(codes) recovers the original values.
label_encoders = {}
categorical_cols = [
    'gender', 'education_level', 'employment_status', 'app_category',
    'essay_topic_category', 'mooc_platform', 'course_category', 'learning_path_type'
]
for col in categorical_cols:
    # `le` stays bound to the most recently fitted encoder after the loop, as
    # it was before, for any later cell that refers to it.
    le = LabelEncoder()
    # Values are cast to str before encoding, so the codes follow the sorted
    # order of the string forms.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# NOTE: run this cell once per freshly loaded frame. Re-running it on columns
# that are already encoded would fit the encoders on the strings '0', '1', ...
# instead of the original labels.

# Store the completion flag as integers.
# NOTE(review): presumably a boolean column in the raw data; confirm.
df['course_completed'] = df['course_completed'].astype(int)

# Step 12 & 13: Standardization and Normalization

In [None]:
# Standardization: learn mean/variance of the two columns, then rescale them.
scaler_std = StandardScaler().fit(df[['age', 'digital_literacy_score']])
df[['age', 'digital_literacy_score']] = scaler_std.transform(df[['age', 'digital_literacy_score']])

# Normalization: learn min/max of the quiz score, then map it onto the scaler's
# default output range.
scaler_norm = MinMaxScaler().fit(df[['in_app_quiz_score']])
df[['in_app_quiz_score']] = scaler_norm.transform(df[['in_app_quiz_score']])

# Step 15: Final Processed Dataset

In [None]:
# Banner first, then a rich rendering of the first five processed rows.
print("\n--- Step 15: Processed Data ---")
display(df.head(n=5))


--- Step 15: Processed Data ---


Unnamed: 0,learner_id,age,gender,education_level,country,employment_status,prior_online_courses,digital_literacy_score,app_category,daily_app_minutes,...,time_to_mastery_hours,mastery_score,learning_efficiency_score,enrollment_date,last_activity_date,total_learning_hours,engagement_consistency,log_total_learning_hours,skill_improvement,engagement_index
0,LRN00000001,-0.503406,2,1,Australia,4,0,1.31413,5,85.0,...,21.0,44.2,21.05,2022-07-03,2022-07-16,34.2,0.527,3.561046,22.2,109.285714
1,LRN00000002,-0.113831,1,1,United States,2,7,0.592327,1,54.2,...,22.4,13.0,5.8,2024-09-24,2024-10-17,34.9,0.441,3.580737,9.9,61.942857
2,LRN00000003,-1.126727,1,3,India,0,6,1.128098,0,53.4,...,70.5,43.2,6.13,2022-03-25,2022-04-04,37.8,0.494,3.65842,15.4,15.257143
3,LRN00000004,-0.581321,0,0,South Africa,3,5,0.718828,5,60.3,...,39.1,44.6,11.41,2025-01-13,2025-02-04,41.7,0.493,3.754199,22.6,60.3
4,LRN00000005,1.210725,1,4,Germany,5,5,-1.119164,4,31.8,...,52.6,35.1,6.67,2024-09-27,2024-11-01,36.9,0.305,3.634951,12.8,18.171429


# Step 16: Final Check

In [None]:
# Final sanity check: report whether any cell in the frame is still missing.
print("\n--- Step 16: Readiness Check ---")
print(f"Any missing values left? {df.isna().to_numpy().any()}")


--- Step 16: Readiness Check ---
Any missing values left? False
