In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from xgboost import XGBClassifier

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ojasv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
df = pd.read_excel(r"C:\Users\ojasv\smartedurecommendor\Student Details (Responses).xlsx")
df.head()


Unnamed: 0,Student Name,SRN,Program,YEAR OF STUDY,Age,Sex,Education [Mother's Education],Education [Father's Education],"Mother""s Occupation",Father's Occupation,...,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,SEM-1,SEM-2,SEM-3,SEM-4,SEM-5
0,Kalluru Harsha Vardhan Reddy,R22DE061,MSC,I,23,Male,Schooling,Schooling,Home Maker,Own Business,...,6,Overweight,126/83,77,4200,7.8,8.2,9.0,6.6,8.0
1,Neha Valmiki,R22DG034,MSC,II,22,Female,Schooling,Schooling,Home Maker,Public Service,...,8,Normal,125/80,75,10000,6.9,7.5,8.0,0.0,0.0
2,kommandi Bhavan,R22DG070,MSC,III,24,Male,Schooling,Graduate,Home Maker,Own Business,...,8,Normal,125/80,75,10000,6.0,7.0,8.0,,
3,Kalluru Harsha Vardhan Reddy,R22DE061,MCA,III,23,Male,Schooling,12th class,Home Maker,Own Business,...,8,Obese,140/90,85,3000,6.0,7.0,7.8,8.2,
4,Kancham Reddy Akhila,R22DE062,MCA,III,23,Female,Graduate,Graduate,Private Sector,Own Business,...,8,Obese,140/90,85,3000,9.0,9.2,9.4,9.0,


In [30]:
for col in df.columns:
    if df[col].dtype == 'object':
        if not df[col].mode().empty:
            df[col] = df[col].fillna(df[col].mode()[0])
    else:
        if df[col].notna().any():
            df[col] = df[col].fillna(df[col].mean())



In [31]:
# Normalize numeric features
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Encode categorical features
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])


In [32]:
# Averaging semester scores
df['Overall Performance'] = df[['SEM-1', 'SEM-2', 'SEM-3', 'SEM-4', 'SEM-5']].mean(axis=1)


In [34]:
def categorize(score):
    if score >= 0.75:
        return 2  # Strong
    elif score >= 0.5:
        return 1  # Average
    else:
        return 0  # Weak

df['Performance Category'] = df['Overall Performance'].apply(categorize)


In [35]:
!pip install xgboost




In [36]:
# Convert all column names to strings
X = df.drop('Performance Category', axis=1)
y = df['Performance Category']
X.columns = X.columns.astype(str)


In [37]:
import re

# Replace everything except letters, numbers, and underscores
X.columns = [re.sub(r'[^\w]', '_', col) for col in X.columns]


In [38]:
print(X.columns.tolist())


['Student_Name', 'SRN', 'Program', 'YEAR_OF_STUDY', 'Age', 'Sex', 'Education__Mother_s_Education_', 'Education__Father_s_Education_', 'Mother_s_Occupation', 'Father_s_Occupation', 'Number_of_Siblings', 'Parents_Relationship_Status____________', 'Medium_of_study', 'Marks_scored_during_previous_study_PU_12th_UG_etc__', 'Accommodation_type__', 'Transportation_to_the_university___1__Bus__2__Private_car_taxi__3__bicycle__4__Other_', 'Daily_Travel_time__in_min__to_and_fro______________________________________', 'Weekly_study_time__________________', 'Participation__Reading_frequency_Scientific_Non_scientific_journals__research_articles__', 'Participation__Taking_notes_in_classes___', 'Participation__Listening_in_classes__', 'Participation__Participation_in_Discussion_about_taught_topics_in_class_', 'collaboration_level', 'Preparation_for_IA_and_SEE_exams_', 'How_do_you_prepare_for_examinations_', 'How_often_do_you_attend_classes_', 'If_you_miss_classes__what_is_the_primary_reason_', 'Extracu

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
from xgboost import XGBClassifier

model = XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)


In [41]:
y_pred = model.predict(X_test)

In [42]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Confusion Matrix:
 [[ 9  0  0]
 [ 0 92  0]
 [ 0  0 10]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        92
           2       1.00      1.00      1.00        10

    accuracy                           1.00       111
   macro avg       1.00      1.00      1.00       111
weighted avg       1.00      1.00      1.00       111



In [43]:
joblib.dump(model, "edu_model.pkl")


['edu_model.pkl']

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

study_materials = [
    "Basic concepts of Data Structures and Algorithms",
    "Intermediate Python practice problems and quizzes",
    "Advanced Machine Learning techniques and research papers",
    "Beginner guide to Time Management and Focus",
    "High-level readings on Data Science Trends",
    "Intro to DBMS with examples",
    "Practice quiz on Operating Systems",
    "Research paper on Deep Learning in Education"
]

queries = {
    0: "beginner topics and concepts",   # Weak
    1: "practice quiz and tests",        # Average
    2: "advanced readings and research"  # Strong
}

def recommend_materials(category, top_n=3):
    query = queries.get(category, "beginner topics")
    docs = [query] + study_materials
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(docs)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    top_indices = cosine_sim.argsort()[-top_n:][::-1]
    return [study_materials[i] for i in top_indices]


In [45]:
student_pred = model.predict([X_test.iloc[0]])[0]
recs = recommend_materials(student_pred)

print("📘 Recommended Materials:")
for rec in recs:
    print("•", rec)


📘 Recommended Materials:
• Basic concepts of Data Structures and Algorithms
• Beginner guide to Time Management and Focus
• Intermediate Python practice problems and quizzes


In [46]:
from sklearn.preprocessing import LabelEncoder

def encode_dataframe(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df

X = encode_dataframe(X)


In [47]:
predictions = model.predict(X)


In [48]:
import joblib

# After model training
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "edu_model.pkl")

# VERY IMPORTANT: Save the exact column order used in training
joblib.dump(X_train.columns.tolist(), "model_features.pkl")


['model_features.pkl']

In [49]:
# Step 1: Train the model
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)

# Step 2: Save model the correct way
import joblib
joblib.dump(model, "edu_model.pkl")  # ✅ This saves the full scikit-style model

# Step 3: Also save training feature names (optional but helpful for UI)
joblib.dump(X_train.columns.tolist(), "model_features.pkl")  # optional


['model_features.pkl']

In [50]:
import joblib
joblib.dump(model, "edu_model.pkl")

['edu_model.pkl']

In [51]:
from xgboost import XGBClassifier
import joblib

# Assuming model is already trained
model = XGBClassifier()
model.fit(X_train, y_train)

# ✅ Correct way to save model (this will work in Streamlit)
joblib.dump(model, "edu_model.pkl")   # DO NOT use model.save_model()


['edu_model.pkl']

In [52]:
# Sanity test before upload
model_loaded = joblib.load("edu_model.pkl")
print(model_loaded.predict(X_train[:5]))  # If this works, Streamlit will too


[2 1 1 1 1]


In [53]:
print(X_train.columns.tolist())


['Student_Name', 'SRN', 'Program', 'YEAR_OF_STUDY', 'Age', 'Sex', 'Education__Mother_s_Education_', 'Education__Father_s_Education_', 'Mother_s_Occupation', 'Father_s_Occupation', 'Number_of_Siblings', 'Parents_Relationship_Status____________', 'Medium_of_study', 'Marks_scored_during_previous_study_PU_12th_UG_etc__', 'Accommodation_type__', 'Transportation_to_the_university___1__Bus__2__Private_car_taxi__3__bicycle__4__Other_', 'Daily_Travel_time__in_min__to_and_fro______________________________________', 'Weekly_study_time__________________', 'Participation__Reading_frequency_Scientific_Non_scientific_journals__research_articles__', 'Participation__Taking_notes_in_classes___', 'Participation__Listening_in_classes__', 'Participation__Participation_in_Discussion_about_taught_topics_in_class_', 'collaboration_level', 'Preparation_for_IA_and_SEE_exams_', 'How_do_you_prepare_for_examinations_', 'How_often_do_you_attend_classes_', 'If_you_miss_classes__what_is_the_primary_reason_', 'Extracu