## Load Data 


In [1]:

import pandas as pd

# Load your dataset (replace 'your_file.csv' with the actual file path)
data = pd.read_csv('fine_data_set-elt.csv')

# Check the first few rows and the columns of the dataset
print(data.head())
print(data.columns)  # This will print out all the column names in your DataFrame

# Fill missing values for numerical columns
data['score'] = data['score'].fillna(data['score'].mean())

# If 'assessment_date' is a numerical column, this is fine; if it's a date, handle it differently
data['assessment_date'] = data['assessment_date'].fillna(data['assessment_date'].mean())

# Check if 'category_column' exists before filling missing values
category_column_name = 'your_actual_category_column_name'  # Replace this with the correct name
if category_column_name in data.columns:
    data[category_column_name] = data[category_column_name].fillna(data[category_column_name].mode()[0])
else:
    print(f"Column '{category_column_name}' does not exist in the DataFrame.")


   id_student gender                region      highest_education imd_band  \
0       11391      M   East Anglian Region       HE Qualification  90-100%   
1       28400      F              Scotland       HE Qualification   20-30%   
2       31604      F     South East Region  A Level or Equivalent   50-60%   
3       32885      F  West Midlands Region     Lower Than A Level   50-60%   
4       38053      M                 Wales  A Level or Equivalent   80-90%   

  age_band  num_of_prev_attempts  studied_credits disability final_result  \
0     55<=                     0              240          N         Pass   
1    35-55                     0               60          N         Pass   
2    35-55                     0               60          N         Pass   
3     0-35                     0               60          N         Pass   
4    35-55                     0               60          N         Pass   

   id_assessment assessment_type  assessment_date  assessment_weight

# DecisionTree

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load your dataset (replace 'your_file.csv' with the actual file path)
data = pd.read_csv('fine_data_set-elt.csv')

# Fill missing values for numerical columns
data['score'] = data['score'].fillna(data['score'].mean())
data['assessment_date'] = data['assessment_date'].fillna(data['assessment_date'].mean())

# Fill missing values for categorical columns
data['imd_band'] = data['imd_band'].fillna(data['imd_band'].mode()[0])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'final_result', 'assessment_type', 'code_module', 'code_presentation']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Define features and target variable
X = data.drop('final_result', axis=1)
y = data['final_result']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display accuracy and classification report
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)


Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.50      5869
           1       0.39      0.41      0.40      6529
           2       0.69      0.68      0.69     22409
           3       0.37      0.38      0.38      5036

    accuracy                           0.57     39843
   macro avg       0.49      0.49      0.49     39843
weighted avg       0.58      0.57      0.57     39843



# Radndom Forest 

In [5]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming X_train, y_train, X_test, y_test are already defined

# Train a Random Forest Classifier
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = random_forest.predict(X_test)

# Evaluate the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

accuracy_rf, report_rf

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.50      5869
           1       0.39      0.41      0.40      6529
           2       0.69      0.68      0.69     22409
           3       0.37      0.38      0.38      5036

    accuracy                           0.57     39843
   macro avg       0.49      0.49      0.49     39843
weighted avg       0.58      0.57      0.57     39843



# Logistic Regression

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

# Load your dataset
data = pd.read_csv('fine_data_set-elt.csv')

# Fill missing values for numerical columns
data['score'] = data['score'].fillna(data['score'].mean())
data['assessment_date'] = data['assessment_date'].fillna(data['assessment_date'].mean())

# Fill missing values for categorical columns
data['imd_band'] = data['imd_band'].fillna(data['imd_band'].mode()[0])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'final_result', 'assessment_type', 'code_module', 'code_presentation']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Define features and target variable
X = data.drop('final_result', axis=1)
y = data['final_result']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a Logistic Regression Classifier with increased max_iter and balanced class weights
logistic_regression = LogisticRegression(max_iter=2000, class_weight='balanced')
logistic_regression.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logistic_regression.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

# Display accuracy and classification report
print(f"Logistic Regression Accuracy: {accuracy:.2f}")
print("Logistic Regression Classification Report:")
print(report)


Logistic Regression Accuracy: 0.41
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.69      0.43      5869
           1       0.33      0.43      0.37      6529
           2       0.73      0.32      0.44     22409
           3       0.27      0.44      0.33      5036

    accuracy                           0.41     39843
   macro avg       0.41      0.47      0.39     39843
weighted avg       0.54      0.41      0.41     39843



# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest Classifier
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display accuracy and classification report
print(f"Random Forest Accuracy: {accuracy:.2f}")
print("Random Forest Classification Report:")
print(report)


Random Forest Accuracy: 0.68
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.50      0.58      5869
           1       0.61      0.39      0.48      6529
           2       0.70      0.87      0.78     22409
           3       0.55      0.38      0.45      5036

    accuracy                           0.68     39843
   macro avg       0.64      0.54      0.57     39843
weighted avg       0.67      0.68      0.66     39843



# Support Vecotr 

In [11]:
from sklearn.svm import SVC

# Train a Support Vector Classifier
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svc.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display accuracy and classification report
print(f"SVC Accuracy: {accuracy:.2f}")
print("SVC Classification Report:")
print(report)


SVC Accuracy: 0.61
SVC Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.13      0.22      5869
           1       0.60      0.17      0.26      6529
           2       0.61      0.97      0.75     22409
           3       0.62      0.14      0.23      5036

    accuracy                           0.61     39843
   macro avg       0.64      0.35      0.36     39843
weighted avg       0.63      0.61      0.52     39843



# Logistc regrstion , Support Vector,  Random Forest 

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load your dataset
data = pd.read_csv('fine_data_set-elt.csv')

# Fill missing values for numerical columns
data['score'] = data['score'].fillna(data['score'].mean())
data['assessment_date'] = data['assessment_date'].fillna(data['assessment_date'].mean())

# Fill missing values for categorical columns
data['imd_band'] = data['imd_band'].fillna(data['imd_band'].mode()[0])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'final_result', 'assessment_type', 'code_module', 'code_presentation']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Define features and target variable
X = data.drop('final_result', axis=1)
y = data['final_result']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred))

# Random Forest
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

# Support Vector Classifier
svc = SVC(random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(f"SVC Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("SVC Classification Report:")
print(classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Accuracy: 0.56
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5869
           1       0.28      0.00      0.00      6529
           2       0.57      0.98      0.72     22409
           3       0.35      0.07      0.12      5036

    accuracy                           0.56     39843
   macro avg       0.30      0.26      0.21     39843
weighted avg       0.41      0.56      0.42     39843

Random Forest Accuracy: 0.68
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.50      0.58      5869
           1       0.61      0.39      0.48      6529
           2       0.70      0.87      0.78     22409
           3       0.55      0.38      0.45      5036

    accuracy                           0.68     39843
   macro avg       0.64      0.54      0.57     39843
weighted avg       0.67      0.68      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# K-Neighbors

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load your dataset (replace 'your_file.csv' with the actual file path)
data = pd.read_csv('fine_data_set-elt.csv')

# Fill missing values for numerical columns
data['score'] = data['score'].fillna(data['score'].mean())
data['assessment_date'] = data['assessment_date'].fillna(data['assessment_date'].mean())

# Fill missing values for categorical columns
data['imd_band'] = data['imd_band'].fillna(data['imd_band'].mode()[0])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability', 'final_result', 'assessment_type', 'code_module', 'code_presentation']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Define features and target variable
X = data.drop('final_result', axis=1)
y = data['final_result']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a K-Nearest Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can change n_neighbors as needed
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

# Display accuracy and classification report
print(f"KNN Accuracy: {accuracy:.2f}")
print("KNN Classification Report:")
print(report)


KNN Accuracy: 0.75
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.74      0.71      5869
           1       0.67      0.60      0.64      6529
           2       0.81      0.86      0.83     22409
           3       0.57      0.44      0.50      5036

    accuracy                           0.75     39843
   macro avg       0.69      0.66      0.67     39843
weighted avg       0.74      0.75      0.74     39843

