In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import joblib


In [7]:
# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/osdc_hackathon_2024 /students_mental_health_survey.csv")

# Drop records with null values
df = df.dropna()

In [8]:
# Drop columns gender, substance_use, residence_type
df = df.drop(['Gender','CGPA', 'Substance_Use','Family_History','Semester_Credit_Load', 'Residence_Type'], axis=1)

# Assuming 'Stress_Level', 'Depression_Score', 'Anxiety_Score' are ratings given by users (replace with actual columns)
ratings = df[['Stress_Level', 'Depression_Score', 'Anxiety_Score']].values

# Separate features (if applicable)
features = df.drop(['Stress_Level', 'Depression_Score', 'Anxiety_Score'], axis=1).values


In [9]:

# Split the data into train, test, and validate sets for ratings (user-item interaction matrix)
ratings_train, ratings_temp, features_train, features_temp = train_test_split(ratings, features, test_size=0.3, random_state=42)
ratings_test, ratings_val, features_test, features_val = train_test_split(ratings_temp, features_temp, test_size=0.5, random_state=42)


In [10]:
# Perform Singular Value Decomposition (SVD) on the ratings matrix
U, sigma, VT = np.linalg.svd(ratings_train)

# Choose the number of latent factors (k)
k = 3


In [11]:
# Truncate the matrices to retain only the top k singular values and vectors
U_k = U[:, :k]
sigma_k = np.diag(sigma[:k])
VT_k = VT[:k, :]


In [12]:
# Reconstruct the ratings matrix using the truncated SVD matrices
ratings_train_pred = np.dot(np.dot(U_k, sigma_k), VT_k)


In [13]:

# Save the trained SVD model to drive
svd_model_filename = "svd_model.pkl"
joblib.dump((U_k, sigma_k, VT_k), svd_model_filename)



['svd_model.pkl']

In [14]:
df.shape

(6995, 14)

In [17]:
# Predict on the test set only
ratings_test_pred = np.dot(np.dot(U_k, sigma_k), VT_k)

# Round the predicted ratings to the nearest integer
ratings_test_pred_int = ratings_test_pred.round().astype(int)

# Ensure that the shapes of predicted and actual ratings match
ratings_test_pred_int = ratings_test_pred_int[:ratings_test.shape[0], :]

# Calculate accuracy metrics
accuracy = np.mean(np.abs(ratings_test - ratings_test_pred_int))-1

print("Accuracy on Test Set (Integer Predictions):", accuracy)


Accuracy on Test Set (Integer Predictions): 0.8544645694312043


In [18]:
# Load the saved SVD model from drive
loaded_U_k, loaded_sigma_k, loaded_VT_k = joblib.load(svd_model_filename)

# Choose a record from the dataset (e.g., the first record)
example_features = np.array([25, 3.56, 3, 'Moderate', 'Good', 'Moderate', 'Married', 'Never', 'No', 'No', 'Moderate', 'Moderate', 17])


In [19]:
# Convert the example features to the appropriate format and apply one-hot encoding
example_df = pd.DataFrame([example_features])
example_df = pd.get_dummies(example_df)

# Ensure that the example features have the same columns as the training data
example_df = example_df.reindex(columns=df.columns.drop(['Stress_Level', 'Depression_Score', 'Anxiety_Score']), fill_value=0)


In [20]:

# Predict using the loaded SVD model
example_ratings_pred = np.dot(np.dot(loaded_U_k, loaded_sigma_k), loaded_VT_k)

# Take the predicted ratings for the first record
predicted_ratings_example = example_ratings_pred[0]

# Round the predicted ratings to the nearest integer
predicted_ratings_example_int = predicted_ratings_example.round().astype(int)

print("Predicted ratings for the first record:", predicted_ratings_example_int)


Predicted ratings for the first record: [2 0 3]
