In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

# 1. Load Data
df = pd.read_csv("/kaggle/input/q2-pp2/Q2_pp2_dataset_complete.csv")

# Identify and remove rare classes (classes with only one sample)
class_counts = df['Q2_Topics'].value_counts()
rare_classes = class_counts[class_counts == 1].index
df = df[~df['Q2_Topics'].isin(rare_classes)]

# 2. Feature Engineering
label_encoder = LabelEncoder()
df['Q2_Topics'] = df['Q2_Topics'].astype(str)
df['Q2_Topic_Encoded'] = label_encoder.fit_transform(df['Q2_Topics'])

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
    remainder='passthrough'
)

X = df[['Year', 'Paper_Session', 'Paper_Varient']]
X.columns = X.columns.astype(str)
X = ct.fit_transform(X)
X = pd.DataFrame(X)
X.columns = X.columns.astype(str)
y = df['Q2_Topic_Encoded']

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 4. Model Training
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # 5. Evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy ({model_name}): {accuracy}")
    print(classification_report(y_test, y_pred, zero_division=0))

    # 6. Prediction for Next Year
    next_year_data = pd.DataFrame({
        'Year': [2024],
        'Paper_Session': ['MJ'],
        'Paper_Varient': ['22']
    })

    X_next_year = next_year_data[['Year', 'Paper_Session', 'Paper_Varient']]
    X_next_year.columns = X_next_year.columns.astype(str)
    X_next_year = ct.transform(X_next_year)
    X_next_year = pd.DataFrame(X_next_year)
    X_next_year.columns = X_next_year.columns.astype(str)

    predicted_topic_encoded = model.predict(X_next_year)
    predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
    print(f"Predicted Topic for Next Year ({model_name}): {predicted_topic}")
    print("-" * 50)

Training Random Forest...
Accuracy (Random Forest): 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0

Predicted Topic for Next Year (Random Forest): ['their use in legal thinking, and their relationship with the Qur’an, consensus (ijma‘) and analogy (qiyas)']
--------------------------------------------------


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
import string

# 1. Load and Preprocess Data
df = pd.read_csv("/kaggle/input/2013-dataset/Q2_dataset_complete2.csv")

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Add more cleaning steps if needed (e.g., stemming, stop word removal)
    return text

df['Q2'] = df['Q2'].apply(preprocess_text)

# 2. Feature Engineering
label_encoder = LabelEncoder()
df['Q2_Topic'] = df['Q2_Topic'].astype(str)
df['Q2_Topic_Encoded'] = label_encoder.fit_transform(df['Q2_Topic'])

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Include bigrams
tfidf_features = tfidf_vectorizer.fit_transform(df['Q2'])

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
    remainder='passthrough'
)

X = pd.concat([pd.DataFrame(tfidf_features.toarray()), df[['Year', 'Paper_Session', 'Paper_Varient']]], axis=1)
X.columns = X.columns.astype(str)
X = ct.fit_transform(X)
X = pd.DataFrame(X)
X.columns = X.columns.astype(str)
y = df['Q2_Topic_Encoded']

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Model Training (Random Forest with Hyperparameter Tuning)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# 5. Evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, zero_division=0))

# 6. Prediction for Next Year
next_year_data = pd.DataFrame({
    'Year': [2024],
    'Paper_Session': ['ON'],
    'Paper_Varient': ['11'],
    'Q2': [""]
})

next_year_data['Q2'] = next_year_data['Q2'].apply(preprocess_text)
next_year_tfidf = tfidf_vectorizer.transform(next_year_data['Q2'])

X_next_year = pd.concat([pd.DataFrame(next_year_tfidf.toarray()), next_year_data[['Year', 'Paper_Session', 'Paper_Varient']]], axis=1)
X_next_year.columns = X_next_year.columns.astype(str)
X_next_year = ct.transform(X_next_year)
X_next_year = pd.DataFrame(X_next_year)
X_next_year.columns = X_next_year.columns.astype(str)

predicted_topic_encoded = best_model.predict(X_next_year)
predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
print(f"Predicted Topic for Next Year: {predicted_topic}")

KeyError: 'Q2'

In [10]:
!pip install imbalanced-learn xgboost

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# 1) Load & Prep
df = pd.read_csv("/kaggle/input/2013-dataset/Q2_dataset_complete2.csv")

# remove classes w only 1 sample
class_counts = df['Q2_Topics'].value_counts()
rare_classes = class_counts[class_counts == 1].index
df = df[~df['Q2_Topics'].isin(rare_classes)]

# encode target
label_encoder = LabelEncoder()
df['Q2_Topics'] = df['Q2_Topics'].astype(str)
df['Q2_Topic_Encoded'] = label_encoder.fit_transform(df['Q2_Topics'])

# separate features/target
numeric_features = ['Year', 'Paper_Varient']
categorical_features = ['Paper_Session']

X = df[numeric_features + categorical_features]
y = df['Q2_Topic_Encoded']

# column transform (OneHot + Scale numeric)
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

X_trans = preprocessor.fit_transform(X)

# 2) SMOTE once on entire dataset
sm = SMOTE(random_state=42, k_neighbors=1)  # if smallest class has at least 2 samples
X_smote, y_smote = sm.fit_resample(X_trans, y)

print("After SMOTE:", X_smote.shape, y_smote.shape)

# 3) Train-test or CV on X_smote, y_smote
models = {
    "XGB": XGBClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=500, random_state=42),
    "GB": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
}

param_grids = {
    "XGB": {
        'n_estimators': [50, 100],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1,0.001]
    },
    "MLP": {
        'hidden_layer_sizes': [(100,), (50,50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.001, 0.01,0.0001]
    },
    "GB": {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1,0.001],
        'max_depth': [3, 5]
    },
    "SVM": {
        'C': [1, 10],
        'kernel': ['linear', 'rbf']
    },
    "RF": {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, None]
    }
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # or 5

for name, model in models.items():
    print(f"\n--- {name} ---")
    if name in param_grids:
        grid = GridSearchCV(
            model,
            param_grids[name],
            cv=cv,
            scoring='accuracy',
            n_jobs=-1
        )
        grid.fit(X_smote, y_smote)
        best_model = grid.best_estimator_
        print("Best Params:", grid.best_params_)
        print("CV Best Score:", grid.best_score_)
    else:
        model.fit(X_smote, y_smote)
        best_model = model
    
    # Evaluate holdout if you want
    X_train, X_test, y_train, y_test = train_test_split(
        X_smote, y_smote, test_size=0.3, random_state=42, stratify=y_smote
    )
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Holdout Accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Next year
    next_data = pd.DataFrame({'Year':[2025], 'Paper_Varient':[12], 'Paper_Session':['MJ']})
    next_data_trans = preprocessor.transform(next_data)
    pred_label = best_model.predict(next_data_trans)
    print("Predicted Topic:", label_encoder.inverse_transform(pred_label))


After SMOTE: (20, 3) (20,)

--- XGB ---
Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
CV Best Score: 0.3968253968253968
Holdout Accuracy: 0.167
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.20      1.00      0.33         1

    accuracy                           0.17         6
   macro avg       0.05      0.25      0.08         6
weighted avg       0.03      0.17      0.06         6

Predicted Topic: ['the revelation of the Qur’an to the Prophet (pbuh) between the years 610 and 632']

--- MLP ---




Best Params: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100,)}
CV Best Score: 0.3968253968253968




Holdout Accuracy: 0.333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.20      1.00      0.33         1
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         1

    accuracy                           0.33         6
   macro avg       0.30      0.50      0.33         6
weighted avg       0.20      0.33      0.22         6

Predicted Topic: ['the revelation of the Qur’an to the Prophet (pbuh) between the years 610 and 632']

--- GB ---
Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
CV Best Score: 0.5
Holdout Accuracy: 0.333
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.20      1.00      0.33         1
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00         1

    accuracy                           