In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


# 1. Load the categorized questions from the Excel workbook into a DataFrame
excel_file = 'Categorized_mocks.xlsx'
# header=1 takes the column labels from the second spreadsheet row;
# only the first 18 columns are kept.
df = pd.read_excel(excel_file, header=1).iloc[:, :18]
#df.iloc[3:,3:] = df.iloc[3:,3:].fillna(0)
df = df.drop(index=df.index[0])  # Remove the Open-Closed row

# 2. Process labels
# The order of this list matters: it is the order in which label columns
# are checked when a single label is picked for each row.
label_columns = [
    'R2-1',
    'R2_2B', 'R2_2D', 'R2_2SD',
    'R2_3', 'R2_3YN',
    'R2_OP',
    'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV',
    'R2_5', 'R2_6',
]
# Coerce every label cell to a number; non-numeric and missing cells become 0.
df[label_columns] = (
    df[label_columns]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

def find_first_label(row, columns=None):
    """Return the name of the first label column whose value in ``row`` equals 1.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (a DataFrame row as passed by
        ``df.apply(..., axis=1)``, a ``pd.Series`` or a plain dict).
    columns : iterable of str, optional
        Column names to check, in priority order. Defaults to the
        module-level ``label_columns`` list.

    Returns
    -------
    str or None
        The first column in ``columns`` whose value is 1, or ``None`` if no
        column has the value 1. When several columns are set to 1, only the
        earliest one in ``columns`` is returned.
    """
    if columns is None:
        # Fall back to the notebook-wide label list so that existing
        # one-argument calls (e.g. from df.apply) behave as before.
        columns = label_columns
    return next((label for label in columns if row[label] == 1), None)

# Tag every row with its first matching label column, then discard the rows
# for which no label column was set to 1 (their 'label' is None).
df = (
    df
    .assign(label=df.apply(find_first_label, axis=1))
    .dropna(subset=['label'])
)
#print(df[['Question', 'label']].head())


# 3. Hold out 20% of the questions as a test set (seeded, so the split is repeatable)
# NOTE(review): the split is not stratified; rare labels may end up with very
# few test samples. Confirm whether stratify=y is feasible for this data.
X, y = df['Question'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Turn the question text into TF-IDF features.
# The vectorizer is fitted on the training questions only; the test
# questions are transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# Candidate classifiers, keyed by the display name used when reporting results.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear'),
    # Random forests are stochastic; a fixed seed makes the reported scores
    # reproducible from run to run.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Fit each candidate on the training features, predict the held-out set and
# print its accuracy together with a per-label classification report.
for name, model in models.items():
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning for labels that were never predicted
    report = classification_report(y_test, y_pred, zero_division=0)

    print(
        f"Model: {name}",
        f"Accuracy: {accuracy:.4f}",
        "Classification Report:",
        report,
        "-" * 50,
        sep="\n",
    )

# # 5. Train a Logistic Regression model (superseded by the model-comparison loop above)
# logreg = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=1000)
# logreg.fit(X_train_tfidf, y_train)

# # 6. Make predictions on the test set
# y_pred = logreg.predict(X_test_tfidf)

# # 7. Print classification report and accuracy
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Accuracy:", accuracy_score(y_test, y_pred))



Model: Logistic Regression
Accuracy: 0.7569
Classification Report:
              precision    recall  f1-score   support

        R2-1       0.71      0.67      0.69        15
       R2_2B       0.91      0.92      0.92        66
       R2_2D       0.72      0.70      0.71        47
      R2_2SD       0.43      0.23      0.30        13
        R2_3       0.76      0.85      0.80       137
      R2_3YN       0.73      0.74      0.74       149
      R2_4QG       0.00      0.00      0.00         2
      R2_4QL       0.00      0.00      0.00        10
      R2_4QP       0.00      0.00      0.00         1
      R2_4QR       0.00      0.00      0.00         4
      R2_4QV       0.00      0.00      0.00         1
        R2_5       0.74      0.84      0.79        88
       R2_OP       1.00      0.20      0.33        10

    accuracy                           0.76       543
   macro avg       0.46      0.40      0.41       543
weighted avg       0.73      0.76      0.74       543

------------