In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the semicolon-delimited student performance file into a DataFrame
df = pd.read_csv("student-mat.csv", delimiter=";")

# Print the column/dtype summary, then preview the first five rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [2]:
# Count the null entries in every column and report them
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

# Every count is zero for this dataset, so no imputation is needed before encoding.

Missing values per column:
 school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64


In [3]:
# Label-encode the listed string columns in place on `df`.
# Only the columns named here are converted; any other string columns in `df`
# are left as-is by this cell.
categorical_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian']

# Use a separate encoder per column and keep each one. Re-fitting a single
# shared encoder would discard the category -> code mapping of every column
# except the last, making the integer codes impossible to decode afterwards.
# `categorical_encoders[col].classes_` lists the original labels in code order.
categorical_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    categorical_encoders[col] = le

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# Define a binary classification target: Pass (G3 >= threshold) or Fail (G3 < threshold)
PASS_THRESHOLD = 10

# Guard on 'G3' so the cell can be re-executed: after the first run the grade
# columns are gone and 'Pass' already exists, so there is nothing left to derive.
if 'G3' in df.columns:
    # Vectorized comparison; cast to int64 so the target is 1 (pass) / 0 (fail)
    df['Pass'] = (df['G3'] >= PASS_THRESHOLD).astype('int64')

# Drop the original grade columns so they cannot leak the target into the features.
# errors='ignore' keeps this a no-op when the columns were already removed.
df = df.drop(columns=['G1', 'G2', 'G3'], errors='ignore')

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Pass
0,0,0,18,1,0,0,4,4,0,4,...,no,no,4,3,4,1,1,3,6,0
1,0,0,17,1,0,1,1,1,0,2,...,yes,no,5,3,3,1,1,3,4,0
2,0,0,15,1,1,1,1,1,0,2,...,yes,no,4,3,2,2,3,3,10,1
3,0,0,15,1,0,1,4,2,1,3,...,yes,yes,3,2,2,1,1,5,2,1
4,0,0,16,1,0,1,3,3,2,2,...,no,no,4,3,2,1,2,5,4,1


In [5]:
# Separate the target column from the feature matrix
y = df['Pass']
X = df.drop('Pass', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of both partitions
for split_name, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} data shape:", split_frame.shape)

Training data shape: (316, 30)
Test data shape: (79, 30)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Work on copies so the frames produced by the split stay unmodified
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

# Identify the columns that still hold non-numeric (string/category) values
categorical_cols = X_train_encoded.select_dtypes(include=['object', 'category']).columns

# Fit one encoder per column on the training data only and reuse it for the
# test data, so both partitions share the same category -> code mapping.
# Note: `transform` raises ValueError if the test data contains a label that
# was not present in the training data.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])
    X_test_encoded[col] = le.transform(X_test_encoded[col])  # Use same encoding as train
    label_encoders[col] = le  # Kept so later cells can apply the identical mapping

# Verify no more string values
print("Data types after encoding:")
print(X_train_encoded.dtypes)

# Train the models. Errors are deliberately NOT caught here: if fitting fails,
# the cell must stop so that later cells never run against a missing or stale
# `logreg` / `tree`.

# Logistic Regression (max_iter raised from the default to help convergence)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_encoded, y_train)
print("Logistic Regression trained successfully!")

# Decision Tree; random_state is fixed (same seed as the train/test split) so
# the fitted tree, and therefore its reported metrics, are reproducible.
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(X_train_encoded, y_train)
print("Decision Tree trained successfully!")

Data types after encoding:
school        int32
sex           int32
age           int64
address       int32
famsize       int32
Pstatus       int32
Medu          int64
Fedu          int64
Mjob          int32
Fjob          int32
reason        int32
guardian      int32
traveltime    int64
studytime     int64
failures      int64
schoolsup     int32
famsup        int32
paid          int32
activities    int32
nursery       int32
higher        int32
internet      int32
romantic      int32
famrel        int64
freetime      int64
goout         int64
Dalc          int64
Walc          int64
health        int64
absences      int64
dtype: object
Logistic Regression trained successfully!
Decision Tree trained successfully!


In [15]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder


def evaluate_model(name, model, X_eval, y_true):
    """Print the standard classification metrics for one fitted model.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    model : fitted estimator exposing ``predict``
    X_eval : feature matrix to predict on
    y_true : true labels aligned with ``X_eval``

    Returns
    -------
    tuple
        ``(y_pred, f1)`` - the predictions and their F1 score, so callers can
        compare models without recomputing the metric.
    """
    y_pred = model.predict(X_eval)
    f1 = f1_score(y_true, y_pred)
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1)
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    return y_pred, f1


# Rebuild the encoded test set using ONLY the encoders fitted on the training data
X_test_encoded = X_test.copy()
for col, encoder in label_encoders.items():
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

# A string column without a training-time encoder cannot be encoded consistently.
# Fitting a fresh encoder on the test data would assign codes unrelated to what
# the models were trained on, so fail loudly instead.
unencoded_cols = X_test_encoded.select_dtypes(include=['object', 'category']).columns
if len(unencoded_cols) > 0:
    raise ValueError(
        f"No training-time encoder available for columns: {list(unencoded_cols)}"
    )

# Evaluate both models. Errors are allowed to propagate so that a broken
# evaluation is never mistaken for a result.
y_pred_logreg, f1_logreg = evaluate_model("Logistic Regression", logreg, X_test_encoded, y_test)
print()  # blank line between the two reports
y_pred_tree, f1_tree = evaluate_model("Decision Tree", tree, X_test_encoded, y_test)

# Compare models on F1, reporting a tie explicitly rather than crediting one model
print("\n=== Model Comparison ===")
if f1_logreg > f1_tree:
    print("Logistic Regression performs better (higher F1 score).")
elif f1_tree > f1_logreg:
    print("Decision Tree performs better (higher F1 score).")
else:
    print("Both models achieve the same F1 score.")

=== Logistic Regression ===
Accuracy: 0.759493670886076
F1 Score: 0.8347826086956521
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.44      0.56        27
           1       0.76      0.92      0.83        52

    accuracy                           0.76        79
   macro avg       0.76      0.68      0.70        79
weighted avg       0.76      0.76      0.74        79

Confusion Matrix:
 [[12 15]
 [ 4 48]]

=== Decision Tree ===
Accuracy: 0.7341772151898734
F1 Score: 0.8264462809917356
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.30      0.43        27
           1       0.72      0.96      0.83        52

    accuracy                           0.73        79
   macro avg       0.76      0.63      0.63        79
weighted avg       0.75      0.73      0.69        79

Confusion Matrix:
 [[ 8 19]
 [ 2 50]]

=== Model Comparison ===
Logistic Regression performs b