In [20]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [21]:
# Location of the raw CSV. Set the XTERN_DATA_PATH environment variable to point
# at your own copy; the fallback is the original author's local path.
DATA_PATH = os.environ.get("XTERN_DATA_PATH", "/Users/natelo/Downloads/XTern_AI_Data.csv")
data = pd.read_csv(DATA_PATH)

# Convert the raw data to numeric codes that the machine learning model can process.
# `data` keeps the raw values; `converted_data` holds the encoded copy.
converted_data = data.copy()

# 'Year' has a natural order, so it gets an explicit ordinal mapping.
year_mapping = {'Year 1': 1, 'Year 2': 2, 'Year 3': 3, 'Year 4': 4}

# The remaining categorical columns are label-encoded in order of first
# appearance in the file. These codes are arbitrary identifiers: they carry no
# ordinal meaning and depend on the row order of the CSV.
unique_majors = data['Major'].unique()
unique_universities = data['University'].unique()
unique_orders = data['Order'].unique()

major_mapping = {value: index for index, value in enumerate(unique_majors)}
uni_mapping = {value: index for index, value in enumerate(unique_universities)}
order_mapping = {value: index for index, value in enumerate(unique_orders)}

converted_data['Year'] = data['Year'].map(year_mapping)
converted_data['Major'] = data['Major'].map(major_mapping)
converted_data['University'] = data['University'].map(uni_mapping)
converted_data['Order'] = data['Order'].map(order_mapping)

# Series.map() silently yields NaN for labels missing from the dict. Fail loudly
# when a 'Year' label is present in the data but not covered by year_mapping
# (genuinely missing values are left untouched, as before).
unmapped_years = data.loc[
    converted_data['Year'].isna() & data['Year'].notna(), 'Year'
].unique()
if len(unmapped_years) > 0:
    raise ValueError(
        f"Unrecognised 'Year' labels (not in year_mapping): {list(unmapped_years)}"
    )

In [22]:
# Seed shared by the train/test split and the random forest so runs are reproducible.
RANDOM_STATE = 52

# NOTE(review): 'Order' was label-encoded with arbitrary integer codes (see the
# encoding cell), so regressing on it treats unordered categories as numbers.
# A classifier with accuracy-style metrics is probably the better fit; the
# regressor is kept here so the existing metrics and pickled model stay comparable.
features = converted_data[['University', 'Major', 'Year', 'Time']]
y = converted_data['Order']

# Splitting data in training and testing sets. 80% training, 20% testing.
# The split happens BEFORE scaling so the test rows cannot influence the
# scaler's mean/variance (fitting the scaler on all rows leaks test data).
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=RANDOM_STATE
)

scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)  # statistics learned from training rows only
X_test = scaler.transform(features_test)        # reuse the training statistics

# Scaled version of the full feature matrix, kept under its original name for
# any later cell that uses `X`; it is transformed with the train-only scaler.
X = scaler.transform(features)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)

# Test analytics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared (R2): {r2}")

# Cross-validation. The scaler lives inside the pipeline so that every fold
# fits it on that fold's training portion only, instead of on all rows.
cv_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
)
scores = cross_val_score(cv_pipeline, features, y, cv=5, scoring='neg_mean_squared_error')
mse_scores = -scores  # scikit-learn reports negated MSE so that higher is better
print("Cross-Validation Scores (Mean Squared Error):", mse_scores)
print("Mean MSE:", mse_scores.mean())

Mean Squared Error: 5.037719171576576
Mean Absolute Error: 1.5914834405257916
R-squared (R2): 0.41241796701885736
Cross-Validation Scores (Mean Squared Error): [4.88999315 4.92735459 4.72780535 4.89377206 5.35770691]
Mean MSE: 4.9593264112851925


In [23]:
# Code to pickle the model (kept commented out; uncomment and set download_path to use it):
# download_path = "/------"  # directory in which the model file should be saved
# filename = 'random_forest_model.pkl'
# file_path = os.path.join(download_path, filename)
# with open(file_path, 'wb') as model_file:
#     pickle.dump(model, model_file)

# # check if the file exists in the directory
# if os.path.exists(file_path):
#     print("File exists!")
# else:
#     print("File does not exist.")

# The attached .pkl file is the model that was trained on the data.
# NOTE: only `model` is pickled. It was fitted on StandardScaler-transformed,
# label-encoded features, so new inputs must be encoded with the same mappings
# and scaled with the same fitted scaler before calling model.predict().
# NOTE: only unpickle files from sources you trust; loading a pickle can execute
# arbitrary code.