In [27]:
import ast
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources used for tokenizing, stop-word removal and lemmatizing
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Read the raw student records
df = pd.read_csv("Comprehensive_Student_Dataset.csv")

# Text Preprocessing: shared helpers used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a free-text field into a space-joined string of lemmas.

    Steps: strip punctuation, lowercase, tokenize with NLTK, drop English
    stop words, lemmatize each remaining token.

    Parameters
    ----------
    text : object
        The raw cell value. Anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # The flag must be passed by keyword: the 4th positional parameter of
    # re.sub is `count`, so a positional re.UNICODE (integer value 32) would
    # cap the number of substitutions instead of setting a flag.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered on the surface form, before lemmatization.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)

text_columns = ["Extracurricular Activities", "Skills_x", "Career Aspirations", "Social Interests", "Desired Activities"]
for col in text_columns:
    # Fill missing cells BEFORE casting to str: astype(str) on its own turns a
    # NaN cell into the literal text "nan", which would then survive
    # preprocessing as a bogus token instead of becoming an empty string.
    df[col] = df[col].fillna('').astype(str).apply(preprocess_text)

# Handling Missing Values
numerical_cols = ['Age', 'GPA', 'Internships', 'Work Experience', 'Year of Study_x', 'Year of Study_y']
categorical_cols = ['Gender', 'Major_x', 'Learning Style', 'Major_y', 'Major']

# Numeric gaps are filled with the column mean
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

# Categorical gaps get an explicit 'Unknown' category
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')

# Encoding Categorical Variables
# One encoder per column, kept in a dict, so each integer code can still be
# mapped back to its label later (a single shared encoder only remembers the
# last column it was fitted on).
label_encoders = {}
for col in ['Gender', 'Learning Style']:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

df = pd.get_dummies(df, columns=['Major_x', 'Major_y', 'Major'])

# Scaling Numerical Features
# NOTE(review): the scaler is fitted on the full frame, before any train/test
# split, so test rows influence the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

print(df.head())

   Unnamed: 0       Name       Age  Gender       GPA  \
0           0  Student_1  1.600736       1 -0.260731   
1           1  Student_2  0.733128       2  1.608790   
2           2  Student_3  1.166932       1 -1.362413   
3           3  Student_4 -1.435890       0 -1.095339   
4           4  Student_5  0.299325       0  0.540492   

            Extracurricular Activities       Career Interest  Internships  \
0  music band robotics club basketball               Finance    -0.320806   
1               debate club music band            Counseling     1.324352   
2  debate club volunteering basketball               Finance     0.501773   
3                        robotics club  Software Development     1.324352   
4  robotics club basketball music band     Mechanical Design    -0.320806   

  Job Preferences  Work Experience  ... Major_y_Business Administration  \
0         On-site        -1.265596  ...                           False   
1          Hybrid         0.025828  ...           

[nltk_data] Downloading package punkt to /Users/tanya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tanya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tanya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Feature Engineering

# 1. Weighted sum of course performance based on learning style
# Per-course weights keyed by the integer learning-style code.
# NOTE(review): the style names in the trailing comments assume a particular
# code-to-label order (and no extra category such as 'Unknown') — confirm
# against the encoder that produced the 'Learning Style' column.
_LEARNING_STYLE_COURSE_WEIGHTS = {
    0: {'Computer Science': 0.8, 'History': 0.2, 'Physics': 0.7, 'Mathematics': 0.9, 'Engineering': 0.7},   # Auditory
    1: {'Computer Science': 0.9, 'History': 0.9, 'Physics': 0.8, 'Mathematics': 0.7, 'Engineering': 0.6},   # Kinesthetic
    2: {'Computer Science': 0.7, 'History': 0.6, 'Physics': 0.9, 'Mathematics': 0.8, 'Engineering': 0.9},   # Reading/Writing
    3: {'Computer Science': 0.6, 'History': 0.7, 'Physics': 0.6, 'Mathematics': 0.6, 'Engineering': 0.6},   # Visual
}


def calculate_weighted_course_performance(row):
    """Average of a student's course scores, each scaled by a learning-style weight.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Course Performance']`` (a string holding a dict
        literal of ``{course: score}``) and ``row['Learning Style']`` (an
        integer code).

    Returns
    -------
    float or int
        Sum of ``weight * score`` over the courses present in the weight
        table, divided by the number of such courses. Returns ``0`` when no
        course matches or when anything goes wrong (the error is printed).
    """
    try:
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the data file. literal_eval only accepts Python literals, so a
        # non-literal cell now fails here and falls through to the except branch.
        course_performance = ast.literal_eval(row['Course Performance'])

        # Codes outside the table wrap around rather than raising.
        style_code = row['Learning Style'] % len(_LEARNING_STYLE_COURSE_WEIGHTS)
        style_weights = _LEARNING_STYLE_COURSE_WEIGHTS[style_code]

        # Courses without a weight are skipped and do not count in the divisor.
        weighted_scores = [
            style_weights[course] * performance
            for course, performance in course_performance.items()
            if course in style_weights
        ]

        if weighted_scores:
            return sum(weighted_scores) / len(weighted_scores)
        return 0  # Return 0 if no courses are found
    except Exception as e:
        # Deliberately best-effort: one bad row must not abort df.apply.
        print(f"Error in calculate_weighted_course_performance: {e}")
        return 0

# Score every row and store the result as a new feature column
df['Weighted_Course_Performance'] = df.apply(
    func=calculate_weighted_course_performance,
    axis=1,
)

# 2. Extract information from "Courses Taken"
def safe_eval(x):
    """Parse a string containing a Python literal; return ``[]`` on failure.

    Uses ``ast.literal_eval`` rather than ``eval`` so text from the data file
    is only ever parsed as a literal (list, dict, number, string, ...) and is
    never executed as code.

    Parameters
    ----------
    x : object
        Usually a string such as ``"['Math', 'Physics']"``.

    Returns
    -------
    object
        The parsed literal, or an empty list when ``x`` is not a valid literal.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, NameError, TypeError):
        # literal_eval reports non-literal input (names, calls, None) as ValueError
        return []

def _count_courses(raw_courses):
    """Return the length of the parsed 'Courses Taken' value, or 0 for non-string cells."""
    if not isinstance(raw_courses, str):
        return 0
    return len(safe_eval(raw_courses))


df['Number_of_Courses_Taken'] = df['Courses Taken'].apply(_count_courses)


# Function to safely evaluate the string and calculate average performance
def calculate_average_performance(row):
    """Mean of the scores stored in ``row['Course Performance']``.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Course Performance']``, a string holding a dict
        literal of ``{course: score}``.

    Returns
    -------
    float
        The arithmetic mean of the dict's values, or ``np.nan`` when the cell
        cannot be parsed, is not a dict, or is an empty dict.
    """
    try:
        # SECURITY: literal_eval instead of eval — cell contents are parsed as
        # a literal only and never executed.
        performance_dict = ast.literal_eval(row['Course Performance'])
        # A non-dict literal has no .values(); an empty dict has no mean.
        if not isinstance(performance_dict, dict) or not performance_dict:
            return np.nan
        return np.mean(list(performance_dict.values()))
    except (ValueError, SyntaxError, NameError, TypeError):
        return np.nan  # Handle cases where evaluation fails

df['Average_Course_Performance'] = df.apply(
    func=calculate_average_performance,
    axis=1,
)

# 3. Interaction feature between "Major" and "Career Interest"
# Collect every column whose name contains 'Major_'
major_cols = [name for name in df.columns if 'Major_' in name]


def _join_active_majors(major_row):
    """Join, with '_', the labels of the entries in the row that equal 1."""
    active = major_row[major_row == 1]
    return '_'.join(active.index)


df['Combined_Major'] = df[major_cols].apply(_join_active_majors, axis=1)

# Interaction feature: combined major label, an underscore, then the career interest
combined_major_prefix = df['Combined_Major'] + '_'
df['Major_Career_Interaction'] = combined_major_prefix + df['Career Interest']

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Content-Based Filtering

# 1. Activity Profiles & User Profiles
# Combine user features into a single string
# Text fields that together describe a student
user_profile_cols = ['Combined_Major', 'Skills_x', 'Career Aspirations', 'Social Interests']


def _join_profile_fields(profile_row):
    """Concatenate the profile fields of one row, separated by single spaces."""
    return ' '.join(profile_row)


df['user_profile'] = df[user_profile_cols].apply(_join_profile_fields, axis=1)

# A single vectorizer so activities and profiles share one vocabulary
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from activity texts followed by profile texts
activity_texts = df['Desired Activities'].astype(str).tolist()
profile_texts = df['user_profile'].astype(str).tolist()
tfidf_vectorizer.fit(activity_texts + profile_texts)

# Project both text sources into the shared TF-IDF space
activity_tfidf_matrix = tfidf_vectorizer.transform(df['Desired Activities'])
user_tfidf_matrix = tfidf_vectorizer.transform(df['user_profile'])


# 2. Calculate Similarity
# Rows correspond to user profiles, columns to activity descriptions
cosine_sim_users_activities = cosine_similarity(user_tfidf_matrix, activity_tfidf_matrix)

def recommend_activities_content_based(student_name, cosine_sim=cosine_sim_users_activities, df=df, top_n=5):
    """Recommend activities for a student using content-based filtering.

    Parameters
    ----------
    student_name : str
        Value to look up in ``df['Name']``; the first matching row is used.
    cosine_sim : 2-D array-like
        Similarity scores; row ``i`` is read for the student at row position
        ``i`` of ``df``, and column ``j`` is taken to refer to row position
        ``j`` of ``df``.
    df : pandas.DataFrame
        Frame providing the ``'Name'`` and ``'Desired Activities'`` columns.
    top_n : int, default 5
        Number of activities to return.

    Returns
    -------
    list
        The ``'Desired Activities'`` values of the ``top_n`` highest-scoring
        columns, best first. Ties keep their original column order.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``Name == student_name``.
    """
    # Work with the row POSITION, not the index label: cosine_sim is a plain
    # array, so an index label only lines up when df has a default RangeIndex.
    name_matches = (df['Name'] == student_name).to_numpy()
    if not name_matches.any():
        raise IndexError(f"No student named {student_name!r} found in df['Name']")
    row_pos = int(name_matches.argmax())  # position of the first match

    # Rank activity columns by similarity, highest first
    ranked = sorted(enumerate(cosine_sim[row_pos]), key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the best top_n activities
    activity_positions = [position for position, _score in ranked[:top_n]]

    return df['Desired Activities'].iloc[activity_positions].values.tolist()

# Demo: print the recommendations for one student
student_name = "Student_1"
recommended_activities = recommend_activities_content_based(student_name=student_name)
print(f"Recommended activities for {student_name}: {recommended_activities}")

Recommended activities for Student_1: ['hackathons dance club', 'community service debate society', 'debate society dance club', 'music club', 'community service hackathons']


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [32]:
# Prepare data for modeling
# Combine relevant features for the model
base_feature_cols = ['Age', 'GPA', 'Internships', 'Work Experience', 'Year of Study_x',
                     'Year of Study_y', 'Gender', 'Learning Style', 'Weighted_Course_Performance',
                     'Number_of_Courses_Taken', 'Average_Course_Performance']

# Add every 'Major_' column except 'Major_Career_Interaction', which is a string feature
major_feature_cols = [col for col in df.columns
                      if 'Major_' in col and col != 'Major_Career_Interaction']
feature_cols = base_feature_cols + major_feature_cols

X = df[feature_cols]

# Encode the 'Career Interest' column as the integer target
label_encoder = LabelEncoder()
df['Career Interest Encoded'] = label_encoder.fit_transform(df['Career Interest'])
y = df['Career Interest Encoded']

# Split data into training and testing sets
# NOTE(review): test_size=0.65 leaves only 35% of the rows for training —
# confirm this is intended and not a swapped 0.35.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.65, random_state=164)

# Work on explicit copies so the column assignments below do not write into
# slices of X (which triggers pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Impute NaNs in numeric features. The fill values come from the TRAINING
# split only and are reused for the test split, so no test-set statistics
# leak into preprocessing and both splits are imputed consistently.
numeric_features = X_train.select_dtypes(include=np.number).columns
train_means = X_train[numeric_features].mean()
X_train[numeric_features] = X_train[numeric_features].fillna(train_means)
X_test[numeric_features] = X_test[numeric_features].fillna(train_means)


# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

           0       0.30      0.18      0.22        17
           1       0.11      0.20      0.14        10
           2       0.10      0.27      0.15        11
           3       0.17      0.08      0.11        12
           4       1.00      0.07      0.12        15

    accuracy                           0.15        65
   macro avg       0.34      0.16      0.15        65
weighted avg       0.37      0.15      0.15        65

