In [None]:
# Name of the numeric column that every model below predicts.
target_column_name = 'Final Grade'
print(f"The new target variable is '{target_column_name}'. Regression models will use this numerical variable, and classification models will use its categorized version.")

# Count nulls per column and report only the columns that have any.
missing_values = df.isna().sum()
print("\nMissing values per column:", missing_values.loc[missing_values > 0], sep="\n")

# Columns excluded from the modelling frame; names absent from df are skipped
# silently because of errors='ignore'.
columns_to_drop = [
    'Daily alcohol label',
    'going out with friends label',
    'famrel label',
    'Total Grades',
]
df_processed_new = df.drop(columns_to_drop, axis=1, errors='ignore')

# One-hot encode every object-dtype column, dropping the first level of each.
categorical_cols_new = df_processed_new.select_dtypes(include=['object']).columns
df_encoded_new = pd.get_dummies(
    df_processed_new,
    columns=categorical_cols_new,
    drop_first=True,
)

# The target stays numeric here; the features are every remaining column.
y = df_encoded_new[target_column_name]
X = df_encoded_new.drop(columns=target_column_name)

# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split in a single call (one line per split).
print(
    f"\nShape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

# Define the new categorization function for 'Final Grade'
def categorize_final_grades(final_grade, medium_min=10, high_min=15):
    """Bucket a numeric final grade into 'Low', 'Medium', or 'High'.

    Args:
        final_grade: Numeric grade to categorize.
        medium_min: Lowest grade labelled 'Medium'; anything below it is 'Low'.
        high_min: Lowest grade labelled 'High'.

    Returns:
        'Low' when final_grade < medium_min, 'Medium' when
        medium_min <= final_grade < high_min, 'High' otherwise.
        None when the grade is missing (NaN / None / pd.NA), so a missing
        value stays missing instead of being given a class label.
    """
    # A NaN fails every comparison below, so without this guard it would
    # fall through to the final branch and be labelled 'High'.
    if pd.isna(final_grade):
        return None
    if final_grade < medium_min:
        return 'Low'
    # Reaching this point already implies final_grade >= medium_min.
    if final_grade < high_min:
        return 'Medium'
    return 'High'

# Derive class labels from the numeric targets of each split.
y_train_cat, y_test_cat = (
    grades.apply(categorize_final_grades) for grades in (y_train, y_test)
)

# Preview the first few labels of each split.
print("\nCategorized y_train head (Final Grade):", y_train_cat.head(), sep="\n")
print("\nCategorized y_test head (Final Grade):", y_test_cat.head(), sep="\n")

# Show the class balance of each split.
print("\nValue counts for y_train_cat (Final Grade):", y_train_cat.value_counts(), sep="\n")
print("\nValue counts for y_test_cat (Final Grade):", y_test_cat.value_counts(), sep="\n")

The new target variable is 'Final Grade'. Regression models will use this numerical variable, and classification models will use its categorized version.

Missing values per column:
Series([], dtype: int64)

Shape of X_train: (316, 54)
Shape of X_test: (79, 54)
Shape of y_train: (316,)
Shape of y_test: (79,)

Categorized y_train head (Final Grade):
181       Low
194    Medium
173       Low
63       High
253    Medium
Name: Final Grade, dtype: object

Categorized y_test head (Final Grade):
78        Low
371    Medium
248       Low
55       High
390       Low
Name: Final Grade, dtype: object

Value counts for y_train_cat (Final Grade):
Final Grade
Medium    155
Low       102
High       59
Name: count, dtype: int64

Value counts for y_test_cat (Final Grade):
Final Grade
Medium    37
Low       28
High      14
Name: count, dtype: int64


## Build and Evaluate Linear Regression Model for Final Grade


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Fit the linear model on the training split; fit() returns the fitted estimator.
linear_reg_model_final_grade = LinearRegression().fit(X_train, y_train)

# Predict the numeric grade for the held-out rows.
y_pred_linear_reg_final_grade = linear_reg_model_final_grade.predict(X_test)

# Test-set MAE, MSE and R2, evaluated in that order.
(
    mae_linear_reg_final_grade,
    mse_linear_reg_final_grade,
    r2_linear_reg_final_grade,
) = (
    metric(y_test, y_pred_linear_reg_final_grade)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the three metrics, one per line.
print(
    f"Linear Regression (Final Grade) - Mean Absolute Error (MAE): {mae_linear_reg_final_grade:.2f}",
    f"Linear Regression (Final Grade) - Mean Squared Error (MSE): {mse_linear_reg_final_grade:.2f}",
    f"Linear Regression (Final Grade) - R-squared (R2): {r2_linear_reg_final_grade:.2f}",
    sep="\n",
)

Linear Regression (Final Grade) - Mean Absolute Error (MAE): 1.44
Linear Regression (Final Grade) - Mean Squared Error (MSE): 4.23
Linear Regression (Final Grade) - R-squared (R2): 0.77


## Build and Evaluate Logistic Regression Model for Categorized Final Grade



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Fit logistic regression on the categorized target; fit() returns the fitted estimator.
logistic_reg_model_final_grade = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train_cat)

# Predict a class label for every held-out row.
y_pred_logistic_reg_final_grade = logistic_reg_model_final_grade.predict(X_test)

# Accuracy, followed by support-weighted precision / recall / F1.
accuracy_logistic_reg_final_grade = accuracy_score(y_test_cat, y_pred_logistic_reg_final_grade)
(
    precision_logistic_reg_final_grade,
    recall_logistic_reg_final_grade,
    f1_logistic_reg_final_grade,
) = (
    scorer(y_test_cat, y_pred_logistic_reg_final_grade, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Rows are true classes, columns are predicted classes. With no `labels`
# argument scikit-learn sorts the class names, i.e. High, Low, Medium.
conf_matrix_logistic_reg_final_grade = confusion_matrix(y_test_cat, y_pred_logistic_reg_final_grade)

# Report the scalar metrics, then the confusion matrix.
print(
    f"Logistic Regression (Final Grade) - Accuracy: {accuracy_logistic_reg_final_grade:.2f}",
    f"Logistic Regression (Final Grade) - Precision (weighted): {precision_logistic_reg_final_grade:.2f}",
    f"Logistic Regression (Final Grade) - Recall (weighted): {recall_logistic_reg_final_grade:.2f}",
    f"Logistic Regression (Final Grade) - F1-score (weighted): {f1_logistic_reg_final_grade:.2f}",
    sep="\n",
)
print("\nLogistic Regression (Final Grade) - Confusion Matrix:", conf_matrix_logistic_reg_final_grade, sep="\n")

Logistic Regression (Final Grade) - Accuracy: 0.91
Logistic Regression (Final Grade) - Precision (weighted): 0.91
Logistic Regression (Final Grade) - Recall (weighted): 0.91
Logistic Regression (Final Grade) - F1-score (weighted): 0.91

Logistic Regression (Final Grade) - Confusion Matrix:
[[12  0  2]
 [ 0 25  3]
 [ 1  1 35]]


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Linear-kernel support vector classifier with a fixed seed, fitted on the
# categorized target; fit() returns the fitted estimator.
svm_model_final_grade = SVC(kernel='linear', random_state=42).fit(X_train, y_train_cat)

# Predict a class label for every held-out row.
y_pred_svm_final_grade = svm_model_final_grade.predict(X_test)

# Accuracy, followed by support-weighted precision / recall / F1.
accuracy_svm_final_grade = accuracy_score(y_test_cat, y_pred_svm_final_grade)
(
    precision_svm_final_grade,
    recall_svm_final_grade,
    f1_svm_final_grade,
) = (
    scorer(y_test_cat, y_pred_svm_final_grade, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Rows are true classes, columns are predicted classes. With no `labels`
# argument scikit-learn sorts the class names, i.e. High, Low, Medium.
conf_matrix_svm_final_grade = confusion_matrix(y_test_cat, y_pred_svm_final_grade)

# Report the scalar metrics, then the confusion matrix.
print(
    f"SVM (Final Grade) - Accuracy: {accuracy_svm_final_grade:.2f}",
    f"SVM (Final Grade) - Precision (weighted): {precision_svm_final_grade:.2f}",
    f"SVM (Final Grade) - Recall (weighted): {recall_svm_final_grade:.2f}",
    f"SVM (Final Grade) - F1-score (weighted): {f1_svm_final_grade:.2f}",
    sep="\n",
)
print("\nSVM (Final Grade) - Confusion Matrix:", conf_matrix_svm_final_grade, sep="\n")

SVM (Final Grade) - Accuracy: 0.90
SVM (Final Grade) - Precision (weighted): 0.90
SVM (Final Grade) - Recall (weighted): 0.90
SVM (Final Grade) - F1-score (weighted): 0.90

SVM (Final Grade) - Confusion Matrix:
[[11  0  3]
 [ 0 25  3]
 [ 1  1 35]]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Random forest with default hyperparameters and a fixed seed, fitted on the
# categorized target; fit() returns the fitted estimator.
random_forest_model_final_grade = RandomForestClassifier(random_state=42).fit(X_train, y_train_cat)

# Predict a class label for every held-out row.
y_pred_random_forest_final_grade = random_forest_model_final_grade.predict(X_test)

# Accuracy, followed by support-weighted precision / recall / F1.
accuracy_random_forest_final_grade = accuracy_score(y_test_cat, y_pred_random_forest_final_grade)
(
    precision_random_forest_final_grade,
    recall_random_forest_final_grade,
    f1_random_forest_final_grade,
) = (
    scorer(y_test_cat, y_pred_random_forest_final_grade, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Rows are true classes, columns are predicted classes. With no `labels`
# argument scikit-learn sorts the class names, i.e. High, Low, Medium.
conf_matrix_random_forest_final_grade = confusion_matrix(y_test_cat, y_pred_random_forest_final_grade)

# Report the scalar metrics, then the confusion matrix.
print(
    f"Random Forest (Final Grade) - Accuracy: {accuracy_random_forest_final_grade:.2f}",
    f"Random Forest (Final Grade) - Precision (weighted): {precision_random_forest_final_grade:.2f}",
    f"Random Forest (Final Grade) - Recall (weighted): {recall_random_forest_final_grade:.2f}",
    f"Random Forest (Final Grade) - F1-score (weighted): {f1_random_forest_final_grade:.2f}",
    sep="\n",
)
print("\nRandom Forest (Final Grade) - Confusion Matrix:", conf_matrix_random_forest_final_grade, sep="\n")

Random Forest (Final Grade) - Accuracy: 0.86
Random Forest (Final Grade) - Precision (weighted): 0.86
Random Forest (Final Grade) - Recall (weighted): 0.86
Random Forest (Final Grade) - F1-score (weighted): 0.86

Random Forest (Final Grade) - Confusion Matrix:
[[10  0  4]
 [ 0 25  3]
 [ 1  3 33]]


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Single decision tree with default hyperparameters and a fixed seed, fitted on
# the categorized target; fit() returns the fitted estimator.
decision_tree_model_final_grade = DecisionTreeClassifier(random_state=42).fit(X_train, y_train_cat)

# Predict a class label for every held-out row.
y_pred_decision_tree_final_grade = decision_tree_model_final_grade.predict(X_test)

# Accuracy, followed by support-weighted precision / recall / F1.
accuracy_decision_tree_final_grade = accuracy_score(y_test_cat, y_pred_decision_tree_final_grade)
(
    precision_decision_tree_final_grade,
    recall_decision_tree_final_grade,
    f1_decision_tree_final_grade,
) = (
    scorer(y_test_cat, y_pred_decision_tree_final_grade, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

# Rows are true classes, columns are predicted classes. With no `labels`
# argument scikit-learn sorts the class names, i.e. High, Low, Medium.
conf_matrix_decision_tree_final_grade = confusion_matrix(y_test_cat, y_pred_decision_tree_final_grade)

# Report the scalar metrics, then the confusion matrix.
print(
    f"Decision Tree (Final Grade) - Accuracy: {accuracy_decision_tree_final_grade:.2f}",
    f"Decision Tree (Final Grade) - Precision (weighted): {precision_decision_tree_final_grade:.2f}",
    f"Decision Tree (Final Grade) - Recall (weighted): {recall_decision_tree_final_grade:.2f}",
    f"Decision Tree (Final Grade) - F1-score (weighted): {f1_decision_tree_final_grade:.2f}",
    sep="\n",
)
print("\nDecision Tree (Final Grade) - Confusion Matrix:", conf_matrix_decision_tree_final_grade, sep="\n")


Decision Tree (Final Grade) - Accuracy: 0.81
Decision Tree (Final Grade) - Precision (weighted): 0.81
Decision Tree (Final Grade) - Recall (weighted): 0.81
Decision Tree (Final Grade) - F1-score (weighted): 0.81

Decision Tree (Final Grade) - Confusion Matrix:
[[11  0  3]
 [ 0 22  6]
 [ 1  5 31]]


## Summary of Model Performance for Final Grade Prediction

This section summarizes the performance of all built models for predicting the 'Final Grade', both numerically and categorically. The models include Linear Regression (for numerical prediction) and Logistic Regression, SVM, Random Forest, and Decision Tree (for categorical prediction).

### Linear Regression Model Performance (Numerical 'Final Grade')
*   **Mean Absolute Error (MAE)**: 1.44
*   **Mean Squared Error (MSE)**: 4.23
*   **R-squared (R2)**: 0.77

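The Linear Regression model achieved an R-squared of 0.77 on the held-out test set (79 rows), meaning it explains about 77% of the variance in 'Final Grade' for those rows. The MAE of 1.44 means predictions are off by roughly 1.4 grade points on average, and the MSE of 4.23 corresponds to an RMSE of about 2.06 grade points, suggesting reasonably accurate numerical predictions.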

### Classification Models Performance (Categorized 'Final Grade')

| Model                  | Accuracy | Precision (weighted) | Recall (weighted) | F1-score (weighted) |
| :--------------------- | :------- | :------------------- | :---------------- | :------------------ |
| Logistic Regression    | 0.91     | 0.91                 | 0.91              | 0.91                |
| Support Vector Machine | 0.90     | 0.90                 | 0.90              | 0.90                |
| Random Forest          | 0.86     | 0.86                 | 0.86              | 0.86                |
| Decision Tree          | 0.81     | 0.81                 | 0.81              | 0.81                |

### Notable Observations:
*   **Logistic Regression** and **Support Vector Machine (SVM)** models showed the highest performance for classifying 'Final Grade' into 'Low', 'Medium', or 'High' categories, both achieving an accuracy of around 0.90-0.91. This indicates strong predictive power for these classification tasks.
*   **Random Forest** performed slightly lower than Logistic Regression and SVM, with an accuracy of 0.86, but still demonstrated good performance.
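*   The **Decision Tree** model had the lowest performance of the four classifiers, with an accuracy of 0.81. A single tree trained with default settings (no depth limit or pruning) is prone to overfitting, especially with only 316 training rows; the Random Forest, which averages many such trees, reduces that variance and scores higher (0.86). That the two linear models (Logistic Regression and the linear-kernel SVM) score best suggests the class boundaries are close to linear in the features.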
*   Overall, the classification models performed exceptionally well in predicting the categorical 'Final Grade', suggesting that the features are highly predictive of the students' performance levels.

## Summary:

### Data Analysis Key Findings

*   The target variable was successfully changed to 'Final Grade'. Numerical 'Final Grade' was also categorized into 'Low' (\<10), 'Medium' (\>=10 and \<15), and 'High' (\>=15) for classification tasks.
*   **Linear Regression** model for numerical 'Final Grade' achieved an R-squared of 0.77, indicating it explains 77% of the variance in 'Final Grade'. It had a Mean Absolute Error (MAE) of 1.44 and a Mean Squared Error (MSE) of 4.23.
*   For predicting the *categorized* 'Final Grade', the classification models performed as follows:
    *   **Logistic Regression** showed the highest performance with an accuracy of 0.91, precision of 0.91, recall of 0.91, and F1-score of 0.91.
    *   **Support Vector Machine (SVM)** followed closely with an accuracy of 0.90, precision of 0.90, recall of 0.90, and F1-score of 0.90.
    *   **Random Forest** achieved good performance with an accuracy of 0.86, precision of 0.86, recall of 0.86, and F1-score of 0.86.
    *   **Decision Tree** had the lowest performance among classification models, with an accuracy of 0.81, precision of 0.81, recall of 0.81, and F1-score of 0.81.

