In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

# Dictionary mapping question columns to CSV paths
QUESTION_FILES = {
    "Q2_Topics": "/kaggle/input/paper1-dataset/Q2_prediction_dataset.csv",
    "Q3_Topic": "/kaggle/input/paper1-dataset/Q3_prediction_dataset.csv",
    "Q4_Topic": "/kaggle/input/paper1-dataset/Q4_prediction_dataset.csv",
    "Q5_Topic": "/kaggle/input/paper1-dataset/Q5_prediction_complete.csv"
}

# Store final predictions in a dictionary
next_year_predictions = {}

for question_col, csv_path in QUESTION_FILES.items():
    print(f"\n========== Processing {question_col} ==========")
    
    # 1. Load data specific to this question
    df_question = pd.read_csv(csv_path)

    # 2. Identify and remove rare classes
    class_counts = df_question[question_col].value_counts()
    rare_classes = class_counts[class_counts == 1].index
    df_question = df_question[~df_question[question_col].isin(rare_classes)]

    # Handle case where after removing rare classes, there's insufficient data
    if df_question[question_col].nunique() < 2:
        print(f"Skipping {question_col} — not enough data/classes.")
        continue

    # 3. Encode target
    label_encoder = LabelEncoder()
    df_question[question_col] = df_question[question_col].astype(str)
    df_question[f"{question_col}_encoded"] = label_encoder.fit_transform(df_question[question_col])
    
    # 4. Prepare features (X) and target (y)
    #    Adjust if your CSV columns are named differently
    X = df_question[['Year', 'Paper_Session', 'Paper_Varient']]
    y = df_question[f"{question_col}_encoded"]
    
    # One-hot encoding on Paper_Session
    ct = ColumnTransformer(
        [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
        remainder='passthrough'
    )
    X_transformed = ct.fit_transform(X)

    # 5. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=0.3, random_state=42, stratify=y
    )

    # 6. Model training
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # 7. Evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # 8. Predict for next year
    next_year_data = pd.DataFrame({
        'Year': [2025],
        'Paper_Session': ['MJ'],
        'Paper_Varient': ['12']
    })
    X_next_year = ct.transform(next_year_data)
    predicted_topic_encoded = model.predict(X_next_year)
    predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
    
    next_year_predictions[question_col] = predicted_topic[0]
    print(f"Predicted Topic for 2025 ({question_col}): {predicted_topic[0]}")

    # 9. Save the pipeline objects
    joblib.dump(model, f"rf_model_{question_col}.pkl")
    joblib.dump(ct, f"column_transformer_{question_col}.pkl")
    joblib.dump(label_encoder, f"label_encoder_{question_col}.pkl")

# Print all predictions
print("\n========== Final Next Year Predictions for All Questions ==========")
for question, topic in next_year_predictions.items():
    print(f"{question} => {topic}")



Predicted Topic for 2025 (Q2_Topics): the account of the compilation of the Qur’an under the Rightly Guided Caliphs 

Predicted Topic for 2025 (Q3_Topic): the importance of his actions as examples for Muslim individuals in their personal conduct and relations with others including women and non-Muslims

Predicted Topic for 2025 (Q4_Topic): the main events of his activities in Madina, his leadership of the community there and his conflicts with the Makkans and others

Predicted Topic for 2025 (Q5_Topic): his leading Companions, including the Ten Blessed Companions during his lifetime.

Q2_Topics => the account of the compilation of the Qur’an under the Rightly Guided Caliphs 
Q3_Topic => the importance of his actions as examples for Muslim individuals in their personal conduct and relations with others including women and non-Muslims
Q4_Topic => the main events of his activities in Madina, his leadership of the community there and his conflicts with the Makkans and others
Q5_Topic => h

In [None]:
!pip install transformers datasets imbalanced-learn difflib2 --quiet

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

import torch
from torch.utils.data import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments


###########################################################
# A) CLASSIFICATION (Random Forest) FOR MULTIPLE QUESTIONS
###########################################################

# Example CSV paths for classification
CLASSIFICATION_FILES = {
    "Q2_Topics": "/kaggle/input/paper1-dataset/Q2_prediction_dataset.csv",
    "Q3_Topic": "/kaggle/input/paper1-dataset/Q3_prediction_dataset.csv",
    "Q4_Topic": "/kaggle/input/paper1-dataset/Q4_prediction_dataset.csv",
    "Q5_Topic": "/kaggle/input/paper1-dataset/Q5_prediction_complete.csv"
}

# Dictionary to store the final predicted topic for each question
next_year_predictions = {}

for question_col, csv_path in CLASSIFICATION_FILES.items():
    print(f"\n========== CLASSIFICATION for {question_col} ==========")
    
    df = pd.read_csv(csv_path)

    # 1) Remove rare classes (appear only once)
    class_counts = df[question_col].value_counts()
    rare_classes = class_counts[class_counts == 1].index
    df = df[~df[question_col].isin(rare_classes)]

    if df[question_col].nunique() < 2:
        print(f"Skipping {question_col} — not enough classes after removing rare ones.")
        continue

    # 2) Label encode the target
    label_encoder = LabelEncoder()
    df[question_col] = df[question_col].astype(str)
    df[f"{question_col}_encoded"] = label_encoder.fit_transform(df[question_col])

    # 3) ColumnTransformer for Paper_Session
    ct = ColumnTransformer(
        [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
        remainder='passthrough'
    )

    X = df[['Year', 'Paper_Session', 'Paper_Varient']]
    y = df[f"{question_col}_encoded"]
    X_transformed = ct.fit_transform(X)

    # 4) Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=0.3, random_state=42, stratify=y
    )

    # 5) Train RandomForest
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # 6) Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {question_col}: {accuracy:.3f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    # 7) Predict next year's topic (2025)
    next_year_data = pd.DataFrame({
        'Year': [2025],
        'Paper_Session': ['MJ'],
        'Paper_Varient': ['12']
    })
    X_next_year = ct.transform(next_year_data)
    predicted_topic_encoded = model.predict(X_next_year)
    predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
    
    # 8) Store the predicted topic
    next_year_predictions[question_col] = predicted_topic[0]
    print(f"Predicted Topic for 2025 ({question_col}): {predicted_topic[0]}")

    # (Optional) Save the model pieces
    joblib.dump(model, f"rf_model_{question_col}.pkl")
    joblib.dump(ct, f"column_transformer_{question_col}.pkl")
    joblib.dump(label_encoder, f"label_encoder_{question_col}.pkl")

# Print out the final classification predictions
print("\n========== Final Next Year Predictions for All Questions ==========")
for question, topic in next_year_predictions.items():
    print(f"{question} => {topic}")


###########################################################
# B) GPT-2 QUESTION GENERATION — ONE MODEL PER QUESTION
###########################################################

# Example CSV paths for GPT-2 (adjust to your actual GPT-2 data)
GENERATION_FILES = {
    "Q2_Topics": "/kaggle/input/generation-dataset/Q2_complete_dataset.csv",
    "Q3_Topic": "/kaggle/input/generation-dataset/Q3_dataset_complete.csv",
    "Q4_Topic": "/kaggle/input/generation-dataset/Q4_dataset_complete.csv",
    "Q5_Topic": "/kaggle/input/generation-dataset/Q5_dataset_complete.csv",
}

class TopicQuestionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = []
        for txt in texts:
            enc = tokenizer(
                txt,
                truncation=True,
                max_length=max_length,
                padding="max_length"
            )
            self.encodings.append(enc)

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = self.encodings[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(item["attention_mask"], dtype=torch.long)
        }

def data_collator(batch):
    input_ids = torch.stack([f["input_ids"] for f in batch])
    attention_mask = torch.stack([f["attention_mask"] for f in batch])
    labels = input_ids.clone()
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

final_generated = []

for question_key, gen_csv_path in GENERATION_FILES.items():
    print(f"\n========== GPT-2 Generation for {question_key} ==========")

    # 1) Load dataset for GPT-2 training
    df_gen = pd.read_csv(gen_csv_path)

    # e.g. "Q2_Topics" => topic_col, "Q2" => question_col
    topic_col = question_key
    question_col = question_key.replace("_Topics", "")  # e.g. "Q2_Topics" => "Q2"

    df_gen = df_gen.dropna(subset=[topic_col, question_col])

    # 2) Prepare training text: "Topic: ...\nQuestion: ...\n<|endoftext|>"
    train_texts = []
    for _, row in df_gen.iterrows():
        t_str = str(row[topic_col]).strip()
        q_str = str(row[question_col]).strip()
        combined = f"Topic: {t_str}\nQuestion: {q_str}\n<|endoftext|>"
        train_texts.append(combined)

    # 3) Initialize GPT-2 from scratch for this question
    model_name = "gpt2"  # or "distilgpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Fix the pad token
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    # Create dataset & trainer
    train_dataset = TopicQuestionDataset(train_texts, tokenizer, max_length=500)

    training_args = TrainingArguments(
        output_dir=f"./temp-output-{question_key}",
        overwrite_output_dir=True,
        num_train_epochs=20,   # adjust as needed
        per_device_train_batch_size=2,
        logging_steps=5,
        logging_strategy="steps",
        save_strategy="no",
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
    )

    # 4) Train GPT-2 on this question's data only
    trainer.train()
    print(f"Finished GPT-2 training for {question_key}.")

    # 5) Save the model (optional)
    model_save_path = f"./fine-tuned-gpt2-{question_key}"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    # 6) Generate a question for the predicted topic (from Random Forest)
    #    No fallback, no uniqueness checks
    predicted_topic_for_this_key = next_year_predictions[question_key]

    def generate_question_for_topic(topic, max_length=500, temperature=0.7, top_p=0.9):
        prompt = f"Topic: {topic}\nQuestion:"
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1
        )
        text = tokenizer.decode(output[0], skip_special_tokens=True)
        if "Question:" in text:
            splitted = text.split("Question:")
            return splitted[-1].strip()
        return text.strip()

    final_question = generate_question_for_topic(predicted_topic_for_this_key)

    print(f"--- Predicted Topic (RF) for {question_key}: {predicted_topic_for_this_key}")
    print(f"--- Generated Question: {final_question}")

    final_generated.append({
        "question_key": question_key,
        "predicted_topic": predicted_topic_for_this_key,
        "generated_question": final_question
    })

########################################
# PART C: PRINT ALL GENERATED QUESTIONS
########################################
print("\n========== ALL GENERATED QUESTIONS ==========")
for item in final_generated:
    print(f"Question Key: {item['question_key']}")
    print(f"  Predicted Topic: {item['predicted_topic']}")
    print(f"  Generated Question: {item['generated_question']}")
    print("--------------------------------")


[31mERROR: Could not find a version that satisfies the requirement difflib2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for difflib2[0m[31m
[0m
Accuracy for Q2_Topics: 0.400
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.50      0.50         2
           2       0.50      1.00      0.67         1
           3       0.00      0.00      0.00         1

    accuracy                           0.40         5
   macro avg       0.25      0.38      0.29         5
weighted avg       0.30      0.40      0.33         5

Predicted Topic for 2025 (Q2_Topics): the account of the compilation of the Qur’an under the Rightly Guided Caliphs 

Accuracy for Q3_Topic: 0.167
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss
5,4.8486
10,0.6087
15,0.5155
20,0.3766
25,0.3084
30,0.217
35,0.2438
40,0.2137
45,0.1605
50,0.1777


Finished GPT-2 training for Q2_Topics.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


--- Predicted Topic (RF) for Q2_Topics: the account of the compilation of the Qur’an under the Rightly Guided Caliphs 
--- Generated Question: (a) Write about the ways in which Abu ’ ’Umar was involved in the compilation and preservation of the Qur’an.  (b) ‘The Qur’an is not preserved in written form because it is not preserved in writing.’ Agree or disagree with this statement, giving reasons for your answer.





Step,Training Loss
5,5.2319
10,0.2327
15,0.1572
20,0.0967
25,0.0369
30,0.0326
35,0.0204
40,0.0165
45,0.0157
50,0.0095


Finished GPT-2 training for Q3_Topic.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


--- Predicted Topic (RF) for Q3_Topic: the importance of his actions as examples for Muslim individuals in their personal conduct and relations with others including women and non-Muslims
--- Generated Question: the importance of his actions as examples for Muslim individuals in their personal conduct and relations with others including women and non-Muslims





Step,Training Loss
5,5.0014
10,0.2494
15,0.153
20,0.0877
25,0.0634
30,0.0523
35,0.0408
40,0.0242
45,0.021
50,0.0103
