<a href="https://colab.research.google.com/github/mahmoudalashwall/M.Recommendation/blob/main/recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Skill dimensions of a student/track profile. The order of this list is the
# order of the components in every capability vector built from it.
CAPABILITIES = [
    "Programming", "Algorithms", "Math", "Theory",
    "Data", "Systems", "Hardware", "AI",
    "UX", "Security", "Graphics", "Biology",
]


In [None]:
# Reference profile of each track: capability name -> weight. The weights of
# every track add up to 1.0; a capability that is not listed has no weight.
TRACK_PROFILES = {
    "Artificial Intelligence": dict(
        AI=0.35, Math=0.20, Algorithms=0.20, Data=0.15, Programming=0.10
    ),
    "Systems": dict(
        Systems=0.35, Programming=0.25, Hardware=0.20, Algorithms=0.10, Security=0.10
    ),
    "Theory": dict(
        Math=0.45, Theory=0.35, Algorithms=0.20
    ),
    "Human-Computer Interaction": dict(
        UX=0.40, Programming=0.25, Data=0.15, Theory=0.10, Graphics=0.10
    ),
    "Visual Computing": dict(
        Graphics=0.40, Math=0.25, AI=0.20, Programming=0.15
    ),
    "Computer Engineering": dict(
        Hardware=0.40, Systems=0.30, Programming=0.15, Security=0.15
    ),
    "Information Track": dict(
        Data=0.40, Programming=0.25, Math=0.15, UX=0.10, Security=0.10
    ),
    "Computational Biology": dict(
        Biology=0.35, Data=0.25, AI=0.20, Math=0.20
    ),
}


In [None]:
def generate_student_profile(capabilities=None, rng=None):
    """Draw a random capability profile whose weights sum to 1.

    One weight per capability is sampled from a flat Dirichlet distribution
    (every concentration parameter equals 1).

    Args:
        capabilities: Iterable of capability names used as the keys of the
            result. Defaults to the module-level ``CAPABILITIES``.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) to sample
            from; pass a seeded one to get a reproducible profile. When
            omitted, NumPy's global random state is used.

    Returns:
        dict: capability name -> weight (built-in ``float``), in the order of
        ``capabilities``. Empty when no capability names are given.
    """
    names = list(CAPABILITIES if capabilities is None else capabilities)
    if not names:
        return {}

    # Both the ``np.random`` module and Generator/RandomState objects expose
    # ``dirichlet``, so the sampler can be swapped without branching further.
    sampler = np.random if rng is None else rng
    weights = sampler.dirichlet(np.ones(len(names)))

    # ``tolist`` yields plain Python floats, which display without the
    # ``np.float64(...)`` wrapper.
    return dict(zip(names, weights.tolist()))

# Sample one random student profile; the bare name on the last line makes the
# notebook display it as the cell output.
student = generate_student_profile()
student


{'Programming': np.float64(0.07072438849791603),
 'Algorithms': np.float64(0.24715906387418185),
 'Math': np.float64(0.24785535128766592),
 'Theory': np.float64(0.09795431086310681),
 'Data': np.float64(0.06989744178302701),
 'Systems': np.float64(0.015630544120160596),
 'Hardware': np.float64(0.06555055619030442),
 'AI': np.float64(0.05655297039682653),
 'UX': np.float64(0.009054190134773824),
 'Security': np.float64(0.05958657201833353),
 'Graphics': np.float64(0.046549347428620144),
 'Biology': np.float64(0.013485263405083315)}

In [None]:
def track_to_vector(track_profile, capabilities=None):
    """Turn a sparse track profile into a dense capability vector.

    Args:
        track_profile (dict): capability name -> weight. Names that are not in
            ``capabilities`` are ignored.
        capabilities: Ordered capability names that define the vector layout.
            Defaults to the module-level ``CAPABILITIES``.

    Returns:
        numpy.ndarray: 1-D float array with one entry per capability; a
        capability missing from ``track_profile`` contributes 0.0.
    """
    names = CAPABILITIES if capabilities is None else capabilities
    # An explicit float dtype keeps the result floating point even when no
    # capability matches (a list of only int zeros would give an int array).
    return np.array([track_profile.get(cap, 0.0) for cap in names], dtype=float)


In [None]:
def recommend_tracks(student_caps, track_profiles, top_k=3):
    """Rank tracks by cosine similarity to a student's capability vector.

    Args:
        student_caps (dict): capability name -> value; must contain every name
            in ``CAPABILITIES``.
        track_profiles (dict): track name -> profile dict (capability -> weight).
        top_k (int | None): maximum number of tracks to return; ``None`` returns
            all of them.

    Returns:
        list[tuple]: ``(track name, similarity)`` pairs sorted from most to
        least similar, at most ``top_k`` long. Empty when ``track_profiles``
        is empty.

    Raises:
        KeyError: if ``student_caps`` lacks one of the ``CAPABILITIES``.
        ValueError: if ``top_k`` is negative.
    """
    # A negative slice bound would silently drop tracks from the tail instead
    # of limiting the result, so reject it up front.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if not track_profiles:
        return []

    student_vec = np.array([student_caps[c] for c in CAPABILITIES]).reshape(1, -1)

    # Stack all track vectors so similarity is computed in a single call
    # rather than once per track.
    track_names = list(track_profiles)
    track_matrix = np.vstack(
        [track_to_vector(track_profiles[name]) for name in track_names]
    )
    similarities = cosine_similarity(student_vec, track_matrix)[0]

    # ``sorted`` is stable, so tracks with equal scores keep dictionary order.
    ranked = sorted(
        zip(track_names, similarities), key=lambda pair: pair[1], reverse=True
    )
    return ranked[:top_k]


In [None]:
# Score every track against the sampled student and list the best matches.
top_tracks = recommend_tracks(student_caps=student, track_profiles=TRACK_PROFILES)

for rank, (track, score) in enumerate(top_tracks, start=1):
    print(f"Rank {rank}: {track} (Similarity = {score:.3f})")


Rank 1: Theory (Similarity = 0.819)
Rank 2: Artificial Intelligence (Similarity = 0.713)
Rank 3: Visual Computing (Similarity = 0.487)


In [None]:
def explain_recommendation(student_caps, track_name, track_profiles=None):
    """Break a track match down into per-capability contributions.

    The contribution of a capability is the student's value for it multiplied
    by the weight the track profile gives it.

    Args:
        student_caps (dict): capability name -> value; a capability the student
            has no entry for counts as 0.
        track_name (str): key of the track to explain.
        track_profiles (dict | None): track name -> profile dict. Defaults to
            the module-level ``TRACK_PROFILES``; pass the same mapping that was
            used for ranking when it differs.

    Returns:
        list[tuple]: ``(capability, contribution)`` pairs for the capabilities
        in the track profile, largest contribution first.

    Raises:
        KeyError: if ``track_name`` is not in the profiles.
    """
    profiles = TRACK_PROFILES if track_profiles is None else track_profiles
    profile = profiles[track_name]

    contributions = [
        (cap, student_caps.get(cap, 0) * weight) for cap, weight in profile.items()
    ]
    # Stable sort: equal contributions keep the profile's own order.
    contributions.sort(key=lambda item: item[1], reverse=True)
    return contributions


In [None]:
# Explain the best-ranked track: entry 0 of top_tracks is a (track, score) pair.
track_name = top_tracks[0][0]
explain = explain_recommendation(student_caps=student, track_name=track_name)

print(f"\nWhy {track_name}?")
for cap, contrib in explain:
    print(f"- {cap}: contribution {contrib:.3f}")



Why Theory?
- Math: contribution 0.112
- Algorithms: contribution 0.049
- Theory: contribution 0.034


Next, we train a machine-learning classifier to predict each student's **track**.

In [None]:
import pandas as pd

# The CSV was saved together with its row index, which otherwise comes back as
# a stray "Unnamed: 0" column; read that first column as the index instead.
df = pd.read_csv("students_1000_capabilities_tracks.csv", index_col=0)
df.head()


Unnamed: 0,Student_ID,cap_Programming,cap_Algorithms,cap_Math,cap_Theory,cap_Data,cap_Systems,cap_Hardware,cap_AI,cap_UX,cap_Security,cap_Graphics,cap_Biology,Track_1,Track_2,Track_3
0,1,0.034,0.2182,0.0955,0.0662,0.0123,0.0123,0.0043,0.1458,0.0666,0.0893,0.0015,0.254,Computational Biology,Artificial Intelligence,Theory
1,2,0.2816,0.0376,0.0316,0.0319,0.0572,0.1173,0.0892,0.0543,0.1492,0.0237,0.0545,0.0719,Human-Computer Interaction,Systems,Computer Engineering
2,3,0.0461,0.1163,0.0168,0.0546,0.0679,0.0036,0.0707,0.0141,0.0051,0.2249,0.2549,0.125,Visual Computing,Computer Engineering,Computational Biology
3,4,0.0457,0.0129,0.1452,0.073,0.0164,0.0861,0.0044,0.3023,0.0377,0.1368,0.047,0.0924,Artificial Intelligence,Computational Biology,Visual Computing
4,5,0.0519,0.0134,0.2291,0.0979,0.184,0.1477,0.0597,0.1672,0.0061,0.0143,0.003,0.0258,Theory,Artificial Intelligence,Computational Biology


In [None]:
# Feature columns of the dataset: the capability names with a "cap_" prefix.
# NOTE(review): this rebinds CAPABILITIES, which generate_student_profile,
# track_to_vector and recommend_tracks read expecting the unprefixed names.
# Calling those after this cell pairs "cap_*" names with the unprefixed keys
# of TRACK_PROFILES.
CAPABILITIES = [
    f"cap_{name}"
    for name in (
        "Programming", "Algorithms", "Math", "Theory",
        "Data", "Systems", "Hardware", "AI",
        "UX", "Security", "Graphics", "Biology",
    )
]

# Feature matrix (the twelve capability columns) and target (primary track).
X = df.loc[:, CAPABILITIES]
y = df["Track_1"]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the track-name vocabulary, then encode the target as integer codes.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

print("Classes:", le.classes_)


Classes: ['Artificial Intelligence' 'Computational Biology' 'Computer Engineering'
 'Human-Computer Interaction' 'Information Track' 'Systems' 'Theory'
 'Visual Computing']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the students for evaluation. Stratifying on the encoded
# label keeps the track proportions the same in both splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: 300 trees with depth capped at 10, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=10,
).fit(X_train, y_train)

clf


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the random forest on the held-out students.
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred, target_names=le.classes_),
)


Accuracy: 0.745

Classification Report:
                             precision    recall  f1-score   support

   Artificial Intelligence       0.75      0.67      0.71        18
     Computational Biology       0.94      0.74      0.83        23
      Computer Engineering       0.93      0.84      0.88        31
Human-Computer Interaction       0.86      0.69      0.77        26
         Information Track       0.56      0.64      0.60        22
                   Systems       0.65      0.89      0.76        19
                    Theory       0.63      0.79      0.70        34
          Visual Computing       0.78      0.67      0.72        27

                  accuracy                           0.74       200
                 macro avg       0.76      0.74      0.74       200
              weighted avg       0.77      0.74      0.75       200



In [None]:
from xgboost import XGBClassifier
# Gradient-boosted alternative trained on the same split. Row and column
# subsampling (0.8 each) with a small learning rate; fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
xgb = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    num_class=len(le.classes_),
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
).fit(X_train, y_train)

xgb


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the XGBoost model on the same held-out students.
# Note that y_pred now holds the XGBoost predictions, replacing the forest's.
y_pred = xgb.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=le.classes_)
)


Accuracy: 0.79
                            precision    recall  f1-score   support

   Artificial Intelligence       0.72      0.72      0.72        18
     Computational Biology       0.89      0.74      0.81        23
      Computer Engineering       0.90      0.84      0.87        31
Human-Computer Interaction       1.00      0.69      0.82        26
         Information Track       0.58      0.86      0.69        22
                   Systems       0.82      0.95      0.88        19
                    Theory       0.75      0.79      0.77        34
          Visual Computing       0.80      0.74      0.77        27

                  accuracy                           0.79       200
                 macro avg       0.81      0.79      0.79       200
              weighted avg       0.81      0.79      0.79       200



In [None]:
def format_prediction(results):
    """Convert ranked ``(track, probability)`` pairs into display records.

    Args:
        results: Iterable of ``(track, prob)`` pairs, best first, where
            ``prob`` is a fraction between 0 and 1 (Python or NumPy scalar).

    Returns:
        list[dict]: one record per pair with keys ``"rank"`` (1-based position),
        ``"track"`` and ``"probability"`` (percentage rounded to two decimals,
        as a built-in ``float``).
    """
    return [
        {
            "rank": rank,
            "track": track,
            # Cast before scaling: rounding a NumPy float32 would keep the
            # NumPy type, and converting it afterwards would expose
            # single-precision noise (e.g. 40.18000030517578).
            "probability": round(float(prob) * 100, 2),
        }
        for rank, (track, prob) in enumerate(results, start=1)
    ]


In [None]:

# Explain one student from the dataset: for the student's labelled primary
# track, report the random forest's probability for that track, the cosine
# similarity to the track's reference profile, and the capabilities where the
# student is furthest above / below that profile.

# Capability names without the "cap_" prefix, i.e. the keys used inside
# TRACK_PROFILES.
ORIGINAL_CAPABILITIES_NAMES = [
    "Programming", "Algorithms", "Math", "Theory",
    "Data", "Systems", "Hardware", "AI",
    "UX", "Security", "Graphics", "Biology"
]

# 0-based position of the student's row in X / df.
student_index = 0
student_vector_existing = X.iloc[student_index]

# The track explained below is the labelled Track_1 of the student, not the
# classifier's own top prediction.
student_tracks_true = [df.iloc[student_index]["Track_1"]]

# Selecting with a list keeps a one-row DataFrame, so the classifier receives
# the same column names it was fitted with.
y_prob_all_classes = clf.predict_proba(X.iloc[[student_index]])[0]
# predict_proba columns follow clf.classes_ (encoded labels); decode them to
# track names instead of assuming they line up with le.classes_ by position.
track_probs_map = dict(zip(le.inverse_transform(clf.classes_), y_prob_all_classes))

top_tracks = student_tracks_true
top_probs = [track_probs_map.get(t, 0) for t in top_tracks]

# Track profiles as a (track x capability) table whose columns use the
# "cap_"-prefixed dataset names, in the same order as the student vector.
# Built with comprehensions so no loop variable leaks into the notebook
# namespace (a module-level loop here would overwrite `track_name`).
track_profiles_df = pd.DataFrame(
    [
        {f"cap_{name}": profile.get(name, 0) for name in ORIGINAL_CAPABILITIES_NAMES}
        for profile in TRACK_PROFILES.values()
    ],
    index=list(TRACK_PROFILES),
)

results = []
student_values = student_vector_existing.to_numpy()
student_vector_for_similarity = student_values.reshape(1, -1)

for track, prob in zip(top_tracks, top_probs):
    if track not in track_profiles_df.index:
        # No reference profile for this track: only the probability is known.
        results.append({
            "track": track,
            "prob": prob,
            "similarity": None,
            "strengths": [],
            "weaknesses": []
        })
        continue

    profile_values = track_profiles_df.loc[track].to_numpy()
    similarity = cosine_similarity(
        student_vector_for_similarity, profile_values.reshape(1, -1)
    )[0, 0]

    # Gap = profile weight - student value, per capability (positional, both
    # vectors share the column order). Positive: the profile asks for more
    # than the student has; negative: the student exceeds the profile.
    gap = pd.Series(profile_values - student_values, index=student_vector_existing.index)

    results.append({
        "track": track,
        "prob": prob,
        "similarity": similarity,
        # Three most negative gaps: student furthest above the profile.
        "strengths": gap.nsmallest(3).index.tolist(),
        # Three largest gaps: student furthest below the profile.
        "weaknesses": gap.nlargest(3).index.tolist()
    })

print(f"=== student from data set number {student_index + 1} ===")
for rec in results:
    print(f"Track: {rec['track']}")
    print(f"Probability: {rec['prob']:.3f}")
    print(f"Cosine similarity: {rec['similarity']:.3f}" if rec['similarity'] is not None else "Cosine similarity: N/A")
    print(f"Strengths: {rec['strengths']}")
    print(f"Weaknesses: {rec['weaknesses']}")
    print("-"*40)


=== student from data set number 1)===
Track: Computational Biology
Probability: 0.593
Cosine similarity: 0.679
Strengths: ['cap_Algorithms', 'cap_Security', 'cap_UX']
Weaknesses: ['cap_Data', 'cap_Math', 'cap_Biology']
----------------------------------------


In [None]:
def predict_track(student_caps, top_k=3, model=None):
    """
    Predict best tracks for a student based on capability profile.

    Args:
        student_caps (dict): capability -> value, keyed by the feature names
            the classifier was trained on.
        top_k (int): number of tracks to return (at least 1).
        model: fitted classifier exposing ``predict_proba`` and ``classes_``
            whose classes are the labels encoded by ``le``. Defaults to the
            module-level ``clf``.

    Returns:
        dict: ``"top_tracks"`` holds the ``top_k`` most probable tracks as
        formatted by ``format_prediction`` (best first) and
        ``"recommended_track"`` holds the name of the most probable track.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
        KeyError: if ``student_caps`` lacks a required capability.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    estimator = clf if model is None else model

    # Prefer the column names recorded at fit time; they stay correct even if
    # the CAPABILITIES global has been rebound since training.
    fitted_names = getattr(estimator, "feature_names_in_", None)
    feature_names = list(CAPABILITIES if fitted_names is None else fitted_names)

    missing = [name for name in feature_names if name not in student_caps]
    if missing:
        raise KeyError(f"student_caps is missing capabilities: {missing}")

    # build input vector; a model fitted on a DataFrame gets a one-row
    # DataFrame with the same columns, any other model gets a plain array
    row = [[student_caps[name] for name in feature_names]]
    x = np.array(row) if fitted_names is None else pd.DataFrame(row, columns=feature_names)

    # predict probabilities
    probs = estimator.predict_proba(x)[0]

    # rank
    top_idx = probs.argsort()[::-1][:top_k]
    # Probability columns follow estimator.classes_ (encoded labels), so decode
    # through the label encoder rather than indexing le.classes_ by position.
    track_names = le.inverse_transform(np.asarray(estimator.classes_)[top_idx])
    raw_results = [(str(name), probs[i]) for name, i in zip(track_names, top_idx)]

    return {
        "top_tracks": format_prediction(raw_results),
        "recommended_track": raw_results[0][0]
    }


In [None]:
# Hand-written example student, keyed by the "cap_"-prefixed feature names.
# NOTE(review): these values add up to 1.20, whereas the dataset rows shown by
# df.head() each add up to 1 - confirm whether the input should be normalised.
student = dict(
    cap_Programming=0.18,
    cap_Algorithms=0.12,
    cap_Math=0.22,
    cap_Theory=0.15,
    cap_Data=0.15,
    cap_Systems=0.20,
    cap_Hardware=0.04,
    cap_AI=0.09,
    cap_UX=0.02,
    cap_Security=0.02,
    cap_Graphics=0.01,
    cap_Biology=0.00,
)


In [None]:
# Classifier-based recommendation for the example student.
result = predict_track(student)

# The heading starts with U+1F393 (graduation cap). It is written as an escape
# so the literal cannot be corrupted when the notebook file is re-encoded.
print("\n\U0001F393 AI Career Recommendation")

for r in result["top_tracks"]:
    print(f"Rank {r['rank']}: {r['track']} ({r['probability']}%)")







ðŸŽ“ AI Career Recommendation
Rank 1: Systems (40.18%)
Rank 2: Theory (23.55%)
Rank 3: Information Track (16.89%)


