In [1]:
import pandas as pd
import joblib  # for saving/loading models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

# 1. Load Data
df = pd.read_csv("/kaggle/input/complete-dataset/Q2_dataset_complete.csv")

# Identify and remove rare classes (classes with only one sample)
class_counts = df['Q2_Topics'].value_counts()
rare_classes = class_counts[class_counts == 1].index
df = df[~df['Q2_Topics'].isin(rare_classes)]

# 2. Feature Engineering
label_encoder = LabelEncoder()
df['Q2_Topics'] = df['Q2_Topics'].astype(str)
df['Q2_Topic_Encoded'] = label_encoder.fit_transform(df['Q2_Topics'])

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
    remainder='passthrough'
)

X = df[['Year', 'Paper_Session', 'Paper_Varient']]
X.columns = X.columns.astype(str)
X_transformed = ct.fit_transform(X)  # fit_transform for training data
y = df['Q2_Topic_Encoded']

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, random_state=42, stratify=y
)

# 4. Model Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 5. Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print(classification_report(y_test, y_pred, zero_division=0))

# 6. Prediction for Next Year (example)
next_year_data = pd.DataFrame({
    'Year': [2025],
    'Paper_Session': ['MJ'],
    'Paper_Varient': ['12']
})
# We must transform it with the same ColumnTransformer
X_next_year = ct.transform(next_year_data)
predicted_topic_encoded = model.predict(X_next_year)
predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
print(f"Predicted Topic for 2025: {predicted_topic[0]}")

# 7. SAVE EVERYTHING
# We'll save: the trained model, the column transformer, and the label encoder
joblib.dump(model, "rf_model.pkl")            # saves the RandomForest
joblib.dump(ct, "column_transformer.pkl")     # saves the ColumnTransformer
joblib.dump(label_encoder, "label_encoder.pkl")


Accuracy: 0.400
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.50      0.50         2
           2       0.50      1.00      0.67         1
           3       0.00      0.00      0.00         1

    accuracy                           0.40         5
   macro avg       0.25      0.38      0.29         5
weighted avg       0.30      0.40      0.33         5

Predicted Topic for 2025: the account of the compilation of the Qur’an under the Rightly Guided Caliphs 


['label_encoder.pkl']

In [15]:
!pip install transformers datasets imbalanced-learn difflib2 --quiet

import pandas as pd
import torch
import difflib
from torch.utils.data import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments
)

#############################
# 1) Load & Combine Dataset
#############################
df = pd.read_csv("/kaggle/input/fazul-dataset/Q2_dataset_fazul.csv")
df = df.dropna(subset=["Q2_Topic", "Q2"])  # remove any empty rows

# We'll store all existing questions in a set for uniqueness checks
existing_questions = set(df["Q2"].str.strip())

# For GPT-2 training, combine topic and question into a single string
train_texts = []
for _, row in df.iterrows():
    topic_str = str(row["Q2_Topic"]).strip()
    question_str = str(row["Q2"]).strip()
    combined = f"Topic: {topic_str}\nQuestion: {question_str}\n<|endoftext|>"
    train_texts.append(combined)

#############################
# 2) Custom Dataset
#############################
class TopicQuestionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = []
        for txt in texts:
            enc = tokenizer(
                txt,
                truncation=True,
                max_length=max_length,
                padding="max_length"
            )
            self.encodings.append(enc)

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = self.encodings[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"])
        }

#############################
# 3) Setup GPT-2
#############################
model_name = "gpt2"  # or "distilgpt2" for smaller
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 pad fix
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

#############################
# 4) Create Dataset & Collator
#############################
train_dataset = TopicQuestionDataset(train_texts, tokenizer, max_length=250)

def data_collator(batch):
    input_ids = torch.stack([f["input_ids"] for f in batch])
    attention_mask = torch.stack([f["attention_mask"] for f in batch])
    labels = input_ids.clone()  # causal language modeling
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

#############################
# 5) TrainingArguments & Trainer
#############################
training_args = TrainingArguments(
    output_dir="./temp-output",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=5,
    logging_strategy="steps",
    save_strategy="no",   # no checkpoint saving
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

#############################
# 6) Fine-Tune GPT-2
#############################
trainer.train()
print("Training completed.")

#############################
# 7) Fuzzy Matching Helper
#############################
def is_too_similar(new_q, existing_set, threshold=0.8):
    """
    Returns True if 'new_q' is >= threshold similarity
    with any question in 'existing_set'.
    Using difflib.SequenceMatcher ratio.
    """
    for q in existing_set:
        ratio = difflib.SequenceMatcher(None, new_q, q).ratio()
        if ratio >= threshold:
            return True
    return False

#############################
# 8) Generate Unique Question
#############################
def generate_unique_question_for_topic(
    topic,
    tokenizer=tokenizer,
    model=model,
    existing_set=existing_questions,
    max_length=250,
    temperature=0.7,
    top_p=0.9,
    fuzzy_threshold=0.5,
    max_tries=5
):
    """
    Generates a new question for the given 'topic' by prompting GPT-2 with:
    "Topic: {topic}\nQuestion:"
    Skips if the question is exactly or too similar to existing dataset questions.
    Tries up to 'max_tries'.
    """
    for attempt in range(max_tries):
        # 1) Prepare prompt
        prompt = f"Topic: {topic}\nQuestion:"
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        input_ids = input_ids.to(model.device)

        # 2) Generate
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1
        )

        text = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # 3) Extract the question portion
        if "Question:" in text:
            splitted = text.split("Question:")
            gen_question = splitted[-1].strip()
        else:
            gen_question = text.strip()

        # # 4) Check for duplicates or near-duplicates
        # if gen_question in existing_set:
        #     print("Regenerating (exact match in dataset)...")
        #     continue
        # if is_too_similar(gen_question, existing_set, threshold=fuzzy_threshold):
        #     print("Regenerating (too similar to dataset)...")
        #     continue

        # If we get here, it's a new question
        return gen_question

    return "No unique question found after multiple attempts."

#############################
# 9) Example Usage
#############################
new_question = generate_unique_question_for_topic(
    topic=predicted_topic,
    max_length=250,
    temperature=0.7,
    top_p=0.9,
    fuzzy_threshold=0.4,
    max_tries=5
)
print(f"For topic: '{predicted_topic}'\nNew question:\n{new_question}")


[31mERROR: Could not find a version that satisfies the requirement difflib2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for difflib2[0m[31m
[0m



Step,Training Loss
5,4.3298
10,1.0879
15,0.8348
20,0.6823
25,0.5651
30,0.4376
35,0.3735
40,0.3298
45,0.2722
50,0.2544


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Training completed.
For topic: '['the account of the compilation of the Qur’an under the Rightly Guided Caliphs ']'
New question:
(a) Write about the different ways in which the Qur’an was compiled after the Prophet’s death. (b) How can Muslims today use the Qur’an to develop a closer connection with God?
