# Project 2: DROP OUT CLASSIFIER
(by: Martin Marsal, Benedikt Allmendinger, Christian Diegmann; Heilbronn University, Germany, November 2024)


# Preprocessing Data

- Nullen raus (braucht nochmal Diskussionsbedarf, Nullen werden bei manchen Features nämlich gebraucht)
- Long Floats auf die zweite Nachkommastelle kürzen (done)
- Daten normalisieren? Dadurch wird das Training schneller, ohne dass Zusammenhänge verloren gehen
- Biased Features raus? nö, alle wichtig
- Ausreißer raus

In [None]:
import pandas as pd

In [None]:
# Load the raw student records and prepare them for modelling.
# Produces `df` with stripped column names, a rounded 2nd-semester grade
# and a numeric `Target` column (2 = Dropout, 1 = Enrolled, 0 = Graduate).
df = pd.read_csv('student_data.csv')

# Strip any leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Round the 2nd-semester grade to two decimal places
df['Curricular units 2nd sem (grade)'] = df['Curricular units 2nd sem (grade)'].round(2)

# Mapping of the target labels to a level of risk
target_to_risk = {'Dropout': 2, 'Enrolled': 1, 'Graduate': 0}

# Series.map turns every label missing from the mapping (including NaN) into
# NaN without complaint; fail here with a clear message instead of letting
# the NaN targets reach the estimators further down.
unmapped_targets = df.loc[~df['Target'].isin(list(target_to_risk)), 'Target']
if not unmapped_targets.empty:
    raise ValueError(
        "Unexpected values in 'Target' column: "
        f"{sorted(unmapped_targets.astype(str).unique())}"
    )
df['Target'] = df['Target'].map(target_to_risk)

# Preview the first rows instead of dumping the whole frame
df.head()

In [None]:
# Build the train/test feature matrices and target vectors from `df`:
# seeded shuffle -> 80/20 positional split -> joint one-hot encoding -> X/y.

# Seeded shuffle of all rows; the old index is discarded
shuffle_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows form the training set, the rest the test set
train_size = int(0.8 * len(shuffle_df))
train_df, test_df = shuffle_df.iloc[:train_size], shuffle_df.iloc[train_size:]

# Show both splits
print(train_df)
print(test_df)

# Columns with object/category dtype are treated as categorical
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:", categorical_cols)

# Encode train and test stacked together so both end up with the same dummy columns
combined_df = pd.get_dummies(
    pd.concat([train_df, test_df], axis=0),
    columns=categorical_cols,
)

# Undo the stacking: the training rows are still the first `train_size` rows
train_df = combined_df.iloc[:train_size, :]
test_df = combined_df.iloc[train_size:, :]

# Features are every column except 'Target'; 'Target' is the label
X_train, y_train = train_df.drop(columns='Target'), train_df['Target']
X_test, y_test = test_df.drop(columns='Target'), test_df['Target']

# Train at least four machine learning algorithms

## Model 1 probabilistic

In [None]:
# Code Cell

## Model 2 tree based - B

In [None]:
# Decision tree: evaluate an unconstrained baseline on the test split, then
# cross-validate a depth-limited variant on the training data.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyperparameters; fixed seed for reproducibility
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predictions of the baseline on the held-out test set
y_pred = clf.predict(X_test)

# Test-set metrics of the baseline
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Constrained tree: limited depth and a minimum number of samples per split
clf = DecisionTreeClassifier(random_state=42, max_depth=5, min_samples_split=10)
clf.fit(X_train, y_train)

# 5-fold cross-validation of the constrained tree on the training data
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.2f}")


## Model 3 distance-based  - M

In [None]:
# Select the number of neighbours k for k-NN.
# k is chosen by 5-fold cross-validation on the training data only. Picking
# the k that scores best on the test split would tune the model on the very
# data used to report its performance and give an optimistic estimate, so the
# test set is scored exactly once, for the selected k.
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate k values: 1 to 50
k_values = range(1, 51)
accuracies = []  # mean cross-validated accuracy per candidate k

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    accuracies.append(fold_scores.mean())

# Smallest k among those with the highest mean CV accuracy
# (list.index returns the first occurrence of the maximum)
best_k = k_values[accuracies.index(max(accuracies))]
print(f"The best k value is: {best_k} with mean cross-validated accuracy of {max(accuracies) * 100:.2f}%")

# Refit on the full training set with the selected k and score the test set once
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy for k={best_k}: {acc * 100:.2f}%")

# Plot k vs mean cross-validated accuracy
fig, ax = plt.subplots()
ax.plot(k_values, accuracies, marker='o')
ax.set_xlabel("k (Number of Neighbors)")
ax.set_ylabel("Mean 5-fold CV accuracy")
ax.set_title("Accuracy for Different k Values")
plt.show()

In [None]:
# k-NN with a fixed k: cross-validate on the training data, then fit on the
# full training set, evaluate on the test set and draw the confusion matrix.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Classifier with a fixed number of neighbours
k = 7
knn = KNeighborsClassifier(n_neighbors=k)

# 5-fold cross-validated accuracy on the training data
cv_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')

# Report the per-fold scores together with their mean and spread
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {np.mean(cv_scores) * 100:.2f}%")
print(f"Standard deviation of cross-validation accuracy: {np.std(cv_scores) * 100:.2f}%")

# Fit on the entire training set and predict the test set
y_pred = knn.fit(X_train, y_train).predict(X_test)

# Test-set metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as a shaded grid with the count written in each cell
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(conf_matrix, cmap="Blues", alpha=0.6)
for (i, j), count in np.ndenumerate(conf_matrix):
    # matshow puts rows on the y axis and columns on the x axis
    ax.text(x=j, y=i, s=count, va='center', ha='center', color="black")

# Axis labels and title
ax.set(xlabel="Predicted Labels", ylabel="True Labels", title="Confusion Matrix")
plt.show()

## Model 4 Ensemble method - C

In [None]:
# Code Cell

## Discussion

Are all models equally well suited for this task? Discuss your conclusion.

In [None]:
# Code Cell

# Evaluation

Evaluate the four models using k-fold cross validation and give at least accuracy (mean and standard deviation) and
confusion matrix for the trained models. Is one of the models significantly better than the others?

In [None]:
# Code Cell

## Model 1 probabilistic

In [None]:
# Code Cell

## Model 2 tree based - B

In [None]:
# Code Cell

## Model 3 distance-based  - M

In [None]:
# Code Cell

## Model 4 Ensemble method - C

In [None]:
# Code Cell

# Pick your favorite model. 
Which features were most relevant for the students’ success?


In [None]:
# Code Cell

# Save your favorite model as a pickle file (see https://scikit-learn.org/stable/model_persistence.html). Call the file “best_model.pkl”.


The submission consists of two files:
1. A Jupyter Notebook containing the preprocessing, the training, and the evaluation of
your models.
2. A pickle-file “best_model.pkl”

In [None]:
# Code Cell