In [4]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# import data
# The CSV location is machine-specific, so it can be overridden through the
# ADAPTABILITY_CSV_PATH environment variable. When the variable is not set,
# the original download location is used, so existing runs behave as before.
import os

DEFAULT_CSV_PATH = "C:\\Users\\Maria Computer\\Downloads\\archive (7)\\students_adaptability_level_online_education.csv"
data = pd.read_csv(os.environ.get("ADAPTABILITY_CSV_PATH", DEFAULT_CSV_PATH))
# show data
data

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,Boy,21-25,University,Non Government,No,Yes,Low,Mid,Wifi,4G,3-6,No,Tab,Moderate
1,Girl,21-25,University,Non Government,No,Yes,High,Mid,Mobile Data,4G,1-3,Yes,Mobile,Moderate
2,Girl,16-20,College,Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Moderate
3,Girl,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
4,Girl,16-20,School,Non Government,No,Yes,Low,Poor,Mobile Data,3G,0,No,Mobile,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,Girl,16-20,College,Non Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Low
1201,Girl,16-20,College,Non Government,No,No,High,Mid,Wifi,4G,3-6,No,Mobile,Moderate
1202,Boy,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,3G,1-3,No,Mobile,Moderate
1203,Girl,16-20,College,Non Government,No,No,Low,Mid,Wifi,4G,1-3,No,Mobile,Low


In [5]:
# Separate the column to predict from the input columns.
target_column = 'Adaptivity Level'
# Everything except the target is kept as a candidate feature.
features = data.drop(columns=[target_column])
# The target itself, one label per row.
labels = data.loc[:, target_column]

In [6]:
# Column groups for encoding.
# Nominal columns have no natural order and are one-hot encoded later on.
nominal_columns = [
    'Gender',
    'Institution Type',
    'IT Student',
    'Location',
    'Load-shedding',
    'Financial Condition',
    'Internet Type',
    'Network Type',
    'Self Lms',
    'Device',
]

# Ordinal columns, each paired with its categories listed from lowest to highest.
ordinal_category_orders = {
    'Age': ['1-5', '6-10', '11-15', '16-20', '21-25', '26-30'],
    'Education Level': ['School', 'College', 'University'],
    'Class Duration': ['0', '1-3', '3-6'],
}
ordinal_columns = list(ordinal_category_orders)
age_order = ordinal_category_orders['Age']
education_level_order = ordinal_category_orders['Education Level']
class_duration_order = ordinal_category_orders['Class Duration']

In [7]:
# Setting up the ColumnTransformer
# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the new name first and fall back to
# the old one on releases that do not accept it yet.
try:
    nominal_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    nominal_encoder = OneHotEncoder(sparse=False, drop='first')

# Ordinal columns are encoded following the explicit category orders;
# nominal columns are one-hot encoded (dense output, first level dropped).
# Columns listed in neither group are not handed to either encoder.
preprocessor = ColumnTransformer(transformers=[
    ('ordinal', OrdinalEncoder(categories=[age_order, education_level_order, class_duration_order]), ordinal_columns),
    ('nominal', nominal_encoder, nominal_columns)
])

In [8]:
# apply processing
# Fit the encoders on the feature table and encode it in a single step.
# NOTE(review): the encoders are fitted on the full dataset, i.e. before the
# train/test split that follows. Consider fitting on the training split only
# so that nothing is learned from the test rows.
processed_features = preprocessor.fit_transform(features)

In [9]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
split_parts = train_test_split(
    processed_features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# feature scaling
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# building the model
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Fixed seed for the randomised learners so that re-running this cell
# reproduces the same scores (same value as the train/test split seed).
RANDOM_STATE = 42

# List of machine learning models to compare, as (display name, estimator) pairs
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Support Vector Machine", SVC()),
    ("Decision Tree", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=RANDOM_STATE))
]

# Training and evaluating each model on the same scaled train/test split
for name, model in models:
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on the held-out test rows
    predictions = model.predict(X_test_scaled)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)

    # Count mislabeled data (element-wise comparison of predictions and labels)
    mislabeled = np.sum(predictions != y_test)

    print(f"{name}:\n Accuracy: {accuracy:.4f}, Mislabeled points: {mislabeled} out of {len(y_test)}\n")


Logistic Regression:
 Accuracy: 0.7261, Mislabeled points: 66 out of 241

K-Nearest Neighbors:
 Accuracy: 0.7676, Mislabeled points: 56 out of 241

Support Vector Machine:
 Accuracy: 0.8216, Mislabeled points: 43 out of 241

Decision Tree:
 Accuracy: 0.8672, Mislabeled points: 32 out of 241

Random Forest:
 Accuracy: 0.8921, Mislabeled points: 26 out of 241

Gradient Boosting:
 Accuracy: 0.8340, Mislabeled points: 40 out of 241



In [12]:
# Creating The Model
# This section sets up the Random Forest model and the parameters for hyperparameter tuning.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
# (3 * 4 * 3 * 3 * 2 = 216 parameter combinations)
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required at a leaf node
    'bootstrap': [True, False]        # Method of selecting samples for training each tree
}

# Create a base model.
# The seed is fixed so that the tuning result (best parameters and scores)
# is the same on every run instead of varying with the forest's randomness.
rf = RandomForestClassifier(random_state=42)

In [13]:
# Hyperparameter tuning: instantiate the grid search over `param_grid`
# with 3-fold cross-validation, all CPU cores, and progress messages enabled.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [14]:
# Fit the grid search to the scaled training data.
# Every parameter combination is cross-validated, so this is the slowest cell
# in the notebook (the recorded run reports 648 fits).
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [15]:
# Predicting on the held-out test set

# Take the best estimator found by the grid search and use it for predictions.
best_grid = grid_search.best_estimator_

# Report the winning parameter combination
print("Best parameters found: ", grid_search.best_params_)

# Predictions for the scaled test features
predictions = best_grid.predict(X_test_scaled)


Best parameters found:  {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [16]:
# evaluating the model
from sklearn.metrics import accuracy_score, classification_report

# Score the tuned model's test-set predictions: per-class report first,
# then the overall accuracy.
class_report = classification_report(y_true=y_test, y_pred=predictions)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(f"Random Forest Classifier Accuracy (with hyperparameter tuning): {accuracy:.4f}")
print("Classification Report:\n", class_report)


Random Forest Classifier Accuracy (with hyperparameter tuning): 0.8921
Classification Report:
               precision    recall  f1-score   support

        High       0.88      0.65      0.75        23
         Low       0.89      0.94      0.92       103
    Moderate       0.90      0.90      0.90       115

    accuracy                           0.89       241
   macro avg       0.89      0.83      0.85       241
weighted avg       0.89      0.89      0.89       241



In [17]:
# model saving
import joblib

# Persist the tuned Random Forest (the best estimator from the grid search).
filename = 'finalized_random_forest_model.sav'
joblib.dump(best_grid, filename)

# The forest was trained on encoded and scaled features, so the fitted encoder
# and scaler are saved next to it. Without them the saved model cannot be
# applied to raw rows in a fresh session.
preprocessor_filename = 'finalized_preprocessor.sav'
scaler_filename = 'finalized_scaler.sav'
joblib.dump(preprocessor, preprocessor_filename)
joblib.dump(scaler, scaler_filename)

print("Model saved!")

Model saved!


In [18]:
# Load the model from disk.
# `filename` and the `joblib` import come from the model-saving cell, so that
# cell has to run first.
# NOTE(review): joblib.load unpickles the file - only load files you trust.
loaded_model = joblib.load(filename)
print("Model loaded.")

# Now you can use loaded_model to make new predictions.
# The model was fitted on X_train_scaled, so new raw rows must first go through
# the fitted `preprocessor` and `scaler`. For example:
# new_predictions = loaded_model.predict(scaler.transform(preprocessor.transform(new_data)))

Model loaded.
