<a href="https://colab.research.google.com/github/nipunnirmal21/Z-score-based-university-course-recommendation-system/blob/main/Z_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load the admissions dataset used by every later cell.
# If the file is missing, print a hint and re-raise so execution stops at this
# cell with a normal traceback. Calling exit() here would end the
# interpreter/kernel session instead of surfacing the error.
try:
    df = pd.read_csv('university_dataset.csv')
except FileNotFoundError:
    print("Error: 'university_dataset.csv' not found. Please ensure it's in the correct location.")
    raise
else:
    print("Dataset loaded successfully!")
    print(f"\nTotal data points: {len(df)}")

Dataset loaded successfully!

Total data points: 1575


In [4]:
# Normalise the categorical text columns (trim surrounding whitespace, lower-case)
# so later equality filters and one-hot column names are consistent.
for text_column in ('Stream', 'District', 'Degree Program'):
    df[text_column] = df[text_column].str.strip().str.lower()

In [5]:
# Distinct values of each categorical column, as plain Python lists.
unique_streams, unique_districts, unique_degree_programs = (
    df[column].unique().tolist()
    for column in ('Stream', 'District', 'Degree Program')
)

In [6]:
# One-hot encode 'Stream'; categories unseen at fit time are encoded as all zeros.
encoder_stream = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_streams = encoder_stream.fit(df[['Stream']]).transform(df[['Stream']])
encoded_stream_df = pd.DataFrame(
    data=encoded_streams,
    columns=encoder_stream.get_feature_names_out(['Stream']),
)

In [7]:
# One-hot encode 'District'; categories unseen at fit time are encoded as all zeros.
encoder_district = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_districts = encoder_district.fit(df[['District']]).transform(df[['District']])
encoded_district_df = pd.DataFrame(
    data=encoded_districts,
    columns=encoder_district.get_feature_names_out(['District']),
)

In [8]:
# Feature matrix: the numeric Z-score followed by the stream and district
# one-hot columns, joined side by side on the row index.
df_processed = pd.concat(
    objs=[df[['Min Z-score']], encoded_stream_df, encoded_district_df],
    axis='columns',
)


In [11]:
def recommend_courses_rule_based(z_score, stream, district, dataset):
    """
    Filter the admission data down to programs a student qualifies for.

    A row is kept when its 'Stream' and 'District' equal the (trimmed,
    lower-cased) arguments and its 'Min Z-score' does not exceed the
    student's Z-score.

    Args:
        z_score (float): The student's Z-score.
        stream (str): The student's A/L stream (e.g., 'physical science').
        district (str): The student's district (e.g., 'colombo').
        dataset (pd.DataFrame): Admission data with 'Degree Program', 'Stream',
            'District' and 'Min Z-score' columns.

    Returns:
        pd.DataFrame: Matching rows, highest 'Min Z-score' first, restricted to
        the four columns listed above.
    """
    wanted_stream = stream.strip().lower()
    wanted_district = district.strip().lower()

    # Combine the three eligibility conditions into a single boolean mask.
    is_eligible = (
        dataset['Stream'].eq(wanted_stream)
        & dataset['District'].eq(wanted_district)
        & dataset['Min Z-score'].le(z_score)
    )

    output_columns = ['Degree Program', 'Stream', 'District', 'Min Z-score']
    ranked = dataset.loc[is_eligible].sort_values(by='Min Z-score', ascending=False)
    return ranked[output_columns]


In [12]:
# Integer-encode the target: each distinct degree program gets a category code,
# and the dict maps a code back to its program name.
df['Program_ID'] = df['Degree Program'].astype('category').cat.codes
program_id_to_name = {
    code: name
    for code, name in enumerate(df['Degree Program'].astype('category').cat.categories)
}

# Features are the Z-score plus one-hot stream/district columns; target is the program code.
X, y = df_processed, df['Program_ID']

print("\n--- ML Model Training Status ---")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# 80/20 split, stratified on the target so class proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Fixed seed so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)


--- ML Model Training Status ---
Features (X) shape: (1575, 29)
Target (y) shape: (1575,)
Training set size: 1260
Testing set size: 315


In [13]:
# Score the fitted tree on the held-out split: fraction of exactly matching program codes.
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Decision Tree Classifier Accuracy: {accuracy:.2f}")

Decision Tree Classifier Accuracy: 0.84


In [23]:
def recommend_courses_ml_based(z_score, stream, district, ml_model, encoder_stream, encoder_district, program_id_to_name, df_full_data):
    """
    Recommends university degree programs using a trained ML model.

    The student's inputs are turned into a single-row feature frame (Z-score
    plus one-hot stream/district columns), the model predicts a program ID,
    and the dataset is then filtered to rows of that program which match the
    student's stream and district and whose 'Min Z-score' the student meets.
    If that filter is empty, all programs in the student's stream and district
    with a sufficient Z-score are returned instead.

    The feature column layout is taken from the model itself
    (``ml_model.feature_names_in_``) rather than from any notebook-level
    variable, so the function depends only on its arguments.

    Args:
        z_score (float): The student's Z-score.
        stream (str): The student's A/L stream (e.g., 'physical science').
        district (str): The student's district (e.g., 'colombo').
        ml_model: The trained machine learning model (e.g., DecisionTreeClassifier).
            Must provide ``predict``; ``feature_names_in_`` is used when present.
        encoder_stream (OneHotEncoder): The encoder fitted on 'Stream' column.
        encoder_district (OneHotEncoder): The encoder fitted on 'District' column.
        program_id_to_name (dict): Mapping from numerical Program ID to Program Name.
        df_full_data (pd.DataFrame): The original full dataset for filtering by Z-score.

    Returns:
        pd.DataFrame: Recommended courses ('Degree Program', 'Stream',
        'District', 'Min Z-score'), highest 'Min Z-score' first.
    """
    output_columns = ['Degree Program', 'Stream', 'District', 'Min Z-score']

    # Normalize inputs
    stream = stream.strip().lower()
    district = district.strip().lower()

    # Names of every one-hot column the two encoders produce.
    all_encoded_cols = encoder_stream.get_feature_names_out(['Stream']).tolist() + \
                       encoder_district.get_feature_names_out(['District']).tolist()

    # Column order the model expects. Prefer what the model recorded at fit time.
    # Otherwise fall back to Z-score first, then stream columns, then district
    # columns (assumed to match how the training matrix was assembled).
    fitted_feature_names = getattr(ml_model, 'feature_names_in_', None)
    if fitted_feature_names is not None:
        training_columns = list(fitted_feature_names)
    else:
        training_columns = ['Min Z-score'] + all_encoded_cols

    # Single-row input: all one-hot columns start at zero.
    student_input_df = pd.DataFrame(0, index=[0], columns=all_encoded_cols)
    student_input_df['Min Z-score'] = z_score  # Add Z-score as a feature

    # Set the appropriate one-hot encoded columns to 1
    stream_col = f'Stream_{stream}'
    if stream_col in student_input_df.columns:
        student_input_df[stream_col] = 1
    else:
        print(f"Warning: Stream '{stream}' not found in training data. This might affect prediction accuracy.")

    district_col = f'District_{district}'
    if district_col in student_input_df.columns:
        student_input_df[district_col] = 1
    else:
        print(f"Warning: District '{district}' not found in training data. This might affect prediction accuracy.")

    # Match the model's column set and order, filling any missing column with 0.
    student_input_df = student_input_df.reindex(columns=training_columns, fill_value=0)

    # Predict the program ID using the trained ML model
    predicted_program_id = ml_model.predict(student_input_df)[0]

    # Convert the predicted program ID back to the program name
    predicted_program_name = program_id_to_name.get(predicted_program_id, "Unknown Program")

    print(f"\nML Model Predicted Program ID: {predicted_program_id}")
    print(f"ML Model Predicted Program Name: {predicted_program_name}")

    # Rows the student is eligible for, regardless of program. Both the primary
    # result and the fallback are drawn from this set.
    eligible_courses = df_full_data[
        (df_full_data['Stream'] == stream) &
        (df_full_data['District'] == district) &
        (df_full_data['Min Z-score'] <= z_score)
    ]

    # Keep only the predicted program, so the recommendation is actually offered
    # for this stream/district and the student's Z-score is sufficient.
    recommended_courses = eligible_courses[
        eligible_courses['Degree Program'] == predicted_program_name
    ]

    if recommended_courses.empty:
        print(f"\nML model predicted '{predicted_program_name}', but no exact match found in the dataset for your stream and district with sufficient Z-score.")
        # Fallback: Recommend any course in the student's stream and district with sufficient Z-score
        print("Falling back to recommending based on Z-score within your stream and district:")
        recommended_courses = eligible_courses

    recommended_courses = recommended_courses.sort_values(by='Min Z-score', ascending=False)

    return recommended_courses[output_columns]

In [26]:
# Demonstration: ask the ML-based recommender about one sample student.
example_z_score = 1.7
example_stream = 'physical science'
example_district = 'colombo'

ml_recommended_courses = recommend_courses_ml_based(
    z_score=example_z_score,
    stream=example_stream,
    district=example_district,
    ml_model=dt_classifier,  # the trained Decision Tree Classifier
    encoder_stream=encoder_stream,
    encoder_district=encoder_district,
    program_id_to_name=program_id_to_name,
    df_full_data=df,  # the original full dataset
)

print("\n--- ML-Based Recommended Courses ---")
if ml_recommended_courses.empty:
    print("No ML-based recommendations found for the given criteria.")
else:
    display(ml_recommended_courses)


ML Model Predicted Program ID: 21
ML Model Predicted Program Name: bsc engineering (electrical)

--- ML-Based Recommended Courses ---


Unnamed: 0,Degree Program,Stream,District,Min Z-score
91,bsc engineering (electrical),physical science,colombo,1.7
