In [6]:
import pandas as pd
import numpy as np

data = pd.read_excel('ml-data.xlsx')

# Extract the relevant columns (E to J)
X = data[['Header_and_Main_Declaration', 'Incomprehensible_Code', 
          'Comprehensible_Code_with_logical_errors', 
          'Comprehensible_code_with_syntax_errors', 
          'Correct_code_and_output']]

y = data['Final_Marks']

# Handle any missing values if they exist
X.fillna(0, inplace=True)

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the models: Perceptron and MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define Perceptron model and hyperparameters
perceptron = Perceptron()
param_dist_perceptron = {
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': np.logspace(-5, 1, 10),
    'max_iter': [500, 1000, 1500, 2000],
    'tol': [1e-3, 1e-4, 1e-5],
}

# Perform RandomizedSearchCV on Perceptron
random_search_perceptron = RandomizedSearchCV(perceptron, param_distributions=param_dist_perceptron,
                                              n_iter=10, cv=5, random_state=42, n_jobs=-1)
random_search_perceptron.fit(X_train, y_train)

# Best hyperparameters for Perceptron
print("Best Perceptron Hyperparameters:", random_search_perceptron.best_params_)

# Define MLPClassifier model and hyperparameters
mlp = MLPClassifier()
param_dist_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': np.logspace(-5, 1, 10),
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 300, 500]
}

# Perform RandomizedSearchCV on MLPClassifier
random_search_mlp = RandomizedSearchCV(mlp, param_distributions=param_dist_mlp, 
                                       n_iter=10, cv=5, random_state=42, n_jobs=-1)
random_search_mlp.fit(X_train, y_train)

# Best hyperparameters for MLP
print("Best MLP Hyperparameters:", random_search_mlp.best_params_)

# Evaluate models on test set
from sklearn.metrics import accuracy_score

y_pred_perceptron = random_search_perceptron.predict(X_test)
y_pred_mlp = random_search_mlp.predict(X_test)

# Print accuracy
print("Perceptron Accuracy:", accuracy_score(y_test, y_pred_perceptron))
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


Best Perceptron Hyperparameters: {'tol': 1e-05, 'penalty': 'l2', 'max_iter': 1500, 'alpha': 4.641588833612782e-05}
Best MLP Hyperparameters: {'solver': 'adam', 'max_iter': 300, 'learning_rate': 'constant', 'hidden_layer_sizes': (150, 100, 50), 'alpha': 4.641588833612782e-05, 'activation': 'tanh'}
Perceptron Accuracy: 0.3342776203966006
MLP Accuracy: 1.0


In [9]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier  # Ensure xgboost is installed
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Extract features (E to I) and target (J column)
X = data[['Header_and_Main_Declaration', 'Incomprehensible_Code', 
          'Comprehensible_Code_with_logical_errors', 
          'Comprehensible_code_with_syntax_errors', 
          'Correct_code_and_output']].copy()  # Use .copy() to avoid SettingWithCopyWarning

y = data['Final_Marks']

# Handle any missing values by reassigning X
X = X.fillna(0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the classifiers (Removed CatBoostClassifier)
classifiers = {
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'Naive Bayes': GaussianNB()
}

# Initialize a dataframe to store results
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Loop over classifiers, fit and predict, and store results
for classifier_name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = classifier.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # Create a temporary DataFrame for this result
    temp_result = pd.DataFrame({
        'Classifier': [classifier_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-Score': [f1]
    })
    
    # Append the result to the results DataFrame using pd.concat
    results = pd.concat([results, temp_result], ignore_index=True)

# Display the results
results


  results = pd.concat([results, temp_result], ignore_index=True)
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score
0,Support Vector Machine,0.968839,0.964508,0.968839,0.966334
1,Decision Tree,0.929178,0.929824,0.929178,0.929137
2,Random Forest,0.929178,0.930304,0.929178,0.929217
3,AdaBoost,0.29745,0.289378,0.29745,0.230668
4,XGBoost,0.929178,0.929816,0.929178,0.929155
5,Naive Bayes,0.470255,0.410575,0.470255,0.429052


In [10]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier  # Ensure xgboost is installed
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Import CatBoost (optional, you can comment this if it's not installed)
try:
    from catboost import CatBoostClassifier
except ImportError:
    CatBoostClassifier = None

# Extract features (E to I) and target (J column)
X = data[['Header_and_Main_Declaration', 'Incomprehensible_Code', 
          'Comprehensible_Code_with_logical_errors', 
          'Comprehensible_code_with_syntax_errors', 
          'Correct_code_and_output']].copy()  # Use .copy() to avoid SettingWithCopyWarning

y = data['Final_Marks']

# Handle any missing values
X = X.fillna(0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the classifiers (with CatBoost if available)
classifiers = {
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'Naive Bayes': GaussianNB()
}

if CatBoostClassifier:
    classifiers['CatBoost'] = CatBoostClassifier(silent=True)

# Initialize a dataframe to store results
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Loop over classifiers, fit and predict, and store results
for classifier_name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = classifier.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # Create a temporary DataFrame for this result
    temp_result = pd.DataFrame({
        'Classifier': [classifier_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-Score': [f1]
    })
    
    # Append the result to the results DataFrame using pd.concat
    results = pd.concat([results, temp_result], ignore_index=True)

# Display the results in a tabular form
results


  results = pd.concat([results, temp_result], ignore_index=True)
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score
0,Support Vector Machine,0.968839,0.964508,0.968839,0.966334
1,Decision Tree,0.929178,0.929538,0.929178,0.9291
2,Random Forest,0.929178,0.9304,0.929178,0.929301
3,AdaBoost,0.29745,0.289378,0.29745,0.230668
4,XGBoost,0.929178,0.929816,0.929178,0.929155
5,Naive Bayes,0.470255,0.410575,0.470255,0.429052


In [11]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting python-dateutil>=2.8.2 (from pandas>=0.24->catboost)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 640.0 kB/s eta 0:02:39
   ---------------------------------------- 0.1/101.7 MB 919.0 kB/s eta 0:01:51
   ---------------------------------------- 0.1/101.7 MB 1.2 MB/s eta 0:01:27
   ---------------------------------------- 0.1/101.7 MB 1.2 MB/s eta 0:01:27
   ---------------------------------------- 0.1/101.7 MB 1.2 MB/s eta 0:01:27
   ---------------------------------------- 0.1/101.7 MB 1.2 MB/s eta 0:01:27
   ---------------------------------------- 0.1/101.7 M

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.75 requires requests_mock, which is not installed.
conda-repo-cli 1.0.75 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.75 requires python-dateutil==2.8.2, but you have python-dateutil 2.9.0.post0 which is incompatible.
onelogin 3.1.6 requires python-dateutil~=2.7.0, but you have python-dateutil 2.9.0.post0 which is incompatible.
onelogin 3.1.6 requires typing-extensions~=4.3.0, but you have typing-extensions 4.12.2 which is incompatible.


In [13]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Extract features (E to I) and target (J column)
X = data[['Header_and_Main_Declaration', 'Incomprehensible_Code', 
          'Comprehensible_Code_with_logical_errors', 
          'Comprehensible_code_with_syntax_errors', 
          'Correct_code_and_output']]

y = data['Final_Marks']

# Handle any missing values by assigning the result to X
X = X.fillna(0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the classifiers
classifiers = {
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(silent=True),
    'Naive Bayes': GaussianNB()
}

# Initialize a dataframe to store results
results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Loop over classifiers, fit and predict, and store results
for classifier_name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = classifier.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # Append the result using pd.concat
    temp_result = pd.DataFrame({
        'Classifier': [classifier_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-Score': [f1]
    })
    
    results = pd.concat([results, temp_result], ignore_index=True)

# Display the results
results


  results = pd.concat([results, temp_result], ignore_index=True)
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score
0,Support Vector Machine,0.968839,0.964508,0.968839,0.966334
1,Decision Tree,0.932011,0.932697,0.932011,0.932049
2,Random Forest,0.929178,0.930304,0.929178,0.929217
3,AdaBoost,0.29745,0.289378,0.29745,0.230668
4,XGBoost,0.929178,0.929816,0.929178,0.929155
5,CatBoost,0.934844,0.935726,0.934844,0.934917
6,Naive Bayes,0.470255,0.410575,0.470255,0.429052
