In [2]:
# Step 1: Import necessary libraries for model training
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

In [3]:
# Step 2: Load the training dataset
# The CSV location is relative to the notebook's working directory.
movie_data_train_path = Path("source_data/movie_data_train.csv")
movie_data_df = pd.read_csv(filepath_or_buffer=movie_data_train_path)

In [4]:
# Step 3: Categorize 'vote_average' into 'Low', 'Medium', and 'High' based on quantiles
# The 33rd and 67th percentiles of the score are the two interior cut points.
vote_average_thresholds = movie_data_df['vote_average'].quantile([0.33, 0.67]).values
lower_cut, upper_cut = vote_average_thresholds

# Infinite outer edges mean no finite score can fall outside the three bins.
# pd.cut closes each bin on the right by default, so a score exactly equal to
# a cut point lands in the lower category.
category_bin_edges = [float('-inf'), lower_cut, upper_cut, float('inf')]
category_labels = ['Low', 'Medium', 'High']
movie_data_df['vote_average_category'] = pd.cut(
    movie_data_df['vote_average'],
    bins=category_bin_edges,
    labels=category_labels,
)


In [None]:
# Step 4: Drop the original 'vote_average' column
# The features in Step 5 are "every column except the category", so the raw
# score has to be removed here or it would end up among the model inputs.
# NOTE: this rebinds movie_data_df, so re-running the cell on the already
# reduced frame raises KeyError.
movie_data_df = movie_data_df.drop('vote_average', axis=1)

In [None]:
# Step 5: Define features and target variable
# Target: the Low/Medium/High label derived from the vote average.
y = movie_data_df.loc[:, 'vote_average_category']
# Features: every remaining column of the frame.
X = movie_data_df.drop('vote_average_category', axis=1)

In [None]:
# Step 6: Split the data into training and testing sets
# 20% of the rows are held out for evaluation; the fixed seed keeps the
# partition reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Step 7: Train a Random Forest Classifier
# 100 trees with a fixed seed so repeated fits give the same forest.
# NOTE(review): X_train is passed as-is; this presumes every feature column
# is already numeric — confirm against the CSV schema.
clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [None]:
# Step 8: Predict and evaluate the model
# Predict on the held-out rows, then compute the per-class text report and
# the overall accuracy from those predictions.
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Step 9: Output the results
# Accuracy is shown rounded to two decimal places, followed by the full
# per-class report computed in Step 8.
accuracy_line = f"Accuracy: {accuracy:.2f}"
print(accuracy_line)
print("Classification Report:\n", report, sep=" ")

In [None]:
# Step 10: Import libraries for the visual evaluation
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Step 11: Generate and plot the confusion matrix
# Without an explicit `labels` argument, confusion_matrix orders its rows and
# columns by sorted label value ('High', 'Low', 'Medium'), which would not
# match the Low/Medium/High tick labels below. One shared list is therefore
# passed to both the matrix and the heatmap axes so they always agree.
class_labels = ['Low', 'Medium', 'High']
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # cells hold integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for Vote Average Prediction")
plt.show()

# Step 12: Display the classification report for detailed metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))