In [1]:
import pandas as pd
import joblib  # for saving/loading models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

# 1. Load Data
df = pd.read_csv("/kaggle/input/paper1-dataset/Q5_prediction_complete.csv")

# Identify and remove rare classes (classes with only one sample)
class_counts = df['Q5_Topic'].value_counts()
rare_classes = class_counts[class_counts == 1].index
df = df[~df['Q5_Topic'].isin(rare_classes)]

# 2. Feature Engineering
label_encoder = LabelEncoder()
df['Q5_Topic'] = df['Q5_Topic'].astype(str)
df['Q5_Topic_Encoded'] = label_encoder.fit_transform(df['Q5_Topic'])

ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), ['Paper_Session'])],
    remainder='passthrough'
)

X = df[['Year', 'Paper_Session', 'Paper_Varient']]
X.columns = X.columns.astype(str)
X_transformed = ct.fit_transform(X)  # fit_transform for training data
y = df['Q5_Topic_Encoded']

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, random_state=42, stratify=y
)

# 4. Model Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 5. Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
print(classification_report(y_test, y_pred, zero_division=0))

# 6. Prediction for Next Year (example)
next_year_data = pd.DataFrame({
    'Year': [2025],
    'Paper_Session': ['MJ'],
    'Paper_Varient': ['12']
})
# We must transform it with the same ColumnTransformer
X_next_year = ct.transform(next_year_data)
predicted_topic_encoded = model.predict(X_next_year)
predicted_topic = label_encoder.inverse_transform(predicted_topic_encoded)
print(f"Predicted Topic for 2025: {predicted_topic[0]}")

# 7. SAVE EVERYTHING
# We'll save: the trained model, the column transformer, and the label encoder
joblib.dump(model, "rf_model.pkl")            # saves the RandomForest
joblib.dump(ct, "column_transformer.pkl")     # saves the ColumnTransformer
joblib.dump(label_encoder, "label_encoder.pkl")


Accuracy: 0.133
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         1
           2       0.33      0.33      0.33         3
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.33      0.25      0.29         4

    accuracy                           0.13        15
   macro avg       0.10      0.08      0.09        15
weighted avg       0.16      0.13      0.14        15

Predicted Topic for 2025: his leading Companions, including the Ten Blessed Companions during his lifetime.


['label_encoder.pkl']

In [2]:
!pip install transformers datasets imbalanced-learn difflib2 --quiet

import pandas as pd
import torch
import difflib
from torch.utils.data import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments
)

#############################
# 1) Load & Combine Dataset
#############################
df = pd.read_csv("/kaggle/input/generation-dataset/Q5_dataset_complete.csv")
df = df.dropna(subset=["Q5_Topic", "Q5"])  # remove any empty rows

# We'll store all existing questions in a set for uniqueness checks
existing_questions = set(df["Q5"].str.strip())

# For GPT-2 training, combine topic and question into a single string
train_texts = []
for _, row in df.iterrows():
    topic_str = str(row["Q5_Topic"]).strip()
    question_str = str(row["Q5"]).strip()
    combined = f"Topic: {topic_str}\nQuestion: {question_str}\n<|endoftext|>"
    train_texts.append(combined)

#############################
# 2) Custom Dataset
#############################
class TopicQuestionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = []
        for txt in texts:
            enc = tokenizer(
                txt,
                truncation=True,
                max_length=max_length,
                padding="max_length"
            )
            self.encodings.append(enc)

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        item = self.encodings[idx]
        return {
            "input_ids": torch.tensor(item["input_ids"]),
            "attention_mask": torch.tensor(item["attention_mask"])
        }


model_name = "gpt2"  # or "distilgpt2" for smaller
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 pad fix
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id


train_dataset = TopicQuestionDataset(train_texts, tokenizer, max_length=250)

def data_collator(batch):
    input_ids = torch.stack([f["input_ids"] for f in batch])
    attention_mask = torch.stack([f["attention_mask"] for f in batch])
    labels = input_ids.clone()  # causal language modeling
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


training_args = TrainingArguments(
    output_dir="./temp-output",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=5,
    logging_strategy="steps",
    save_strategy="no",   # no checkpoint saving
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

#############################
# 6) Fine-Tune GPT-2
#############################
trainer.train()
print("Training completed.")


# def is_too_similar(new_q, existing_set, threshold=0.8):
#     """
#     Returns True if 'new_q' is >= threshold similarity
#     with any question in 'existing_set'.
#     Using difflib.SequenceMatcher ratio.
#     """
#     for q in existing_set:
#         ratio = difflib.SequenceMatcher(None, new_q, q).ratio()
#         if ratio >= threshold:
#             return True
#     return False

def generate_unique_question_for_topic(
    topic,
    tokenizer=tokenizer,
    model=model,
    existing_set=existing_questions,
    max_length=300,
    temperature=0.5,
    top_p=0.9,
    fuzzy_threshold=0.5,
    max_tries=5
):
    """
    Generates a new question for the given 'topic' by prompting GPT-2 with:
    "Topic: {topic}\nQuestion:"
    Skips if the question is exactly or too similar to existing dataset questions.
    Tries up to 'max_tries'.
    """
    for attempt in range(max_tries):
        # 1) Prepare prompt
        prompt = f"Topic: {topic}\nQuestion:"
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        input_ids = input_ids.to(model.device)

        # 2) Generate
        output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1
        )

        text = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # 3) Extract the question portion
        if "Question:" in text:
            splitted = text.split("Question:")
            gen_question = splitted[-1].strip()
        else:
            gen_question = text.strip()

    
        return gen_question

    return "No unique question found after multiple attempts."


new_question = generate_unique_question_for_topic(
    topic=predicted_topic,
    max_length=300,
    temperature=0.7,
    top_p=0.9,
    fuzzy_threshold=0.4,
    max_tries=5
)
print(f"For topic: '{predicted_topic}'\nNew question:\n{new_question}")


[31mERROR: Could not find a version that satisfies the requirement difflib2 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for difflib2[0m[31m
[0m

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss
5,4.4438
10,1.0971
15,0.8452
20,0.742
25,0.574
30,0.5141
35,0.4222
40,0.445
45,0.3296
50,0.3063


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Training completed.
For topic: '['his leading Companions, including the Ten Blessed Companions during his lifetime.']'
New question:
(a) Write about the lives of the Companions Khalid ibn al-Khattab and Abu Sufyan ibn Harb. [10] (b) How can Muslims follow the example of the Prophet’s wives?
