In [5]:
!pip install transformers datasets peft accelerate evaluate



# **Import libraries**

In [6]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer ,AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import evaluate
import pandas as pd
import re
import evaluate

# **Load Data**

In [7]:
# Load the two raw question sets.
# df1: interview questions with role / category / difficulty columns.
# df2: question / answer pairs (relabelled as "data scientist" questions further down).
# NOTE(review): both paths are hard-coded Colab upload locations; the files must
# be uploaded to /content before this cell can run.
df1 = pd.read_csv('/content/full_interview_questions_dataset.csv')
df2 = pd.read_csv('/content/dataset (1).csv')

# **Data Exploration**

In [8]:
# Preview the first rows of the interview-questions frame.
df1.head()

Unnamed: 0,question,role,category,difficulty
0,Explain the difference between list and tuple.,Software Engineer,Technical,medium
1,Explain the difference between list and array.,Software Engineer,Technical,medium
2,Explain the difference between list and hashmap.,Software Engineer,Technical,medium
3,Explain the difference between list and recurs...,Software Engineer,Technical,easy
4,Explain the difference between list and inheri...,Software Engineer,Technical,hard


In [9]:
# Preview the first rows of the question/answer frame.
df2.head()

Unnamed: 0,question,answer
0,What is supervised machine learning? 👶,Supervised learning is a type of machine learn...
1,What is regression? Which models can you use t...,Regression is a part of supervised ML. Regress...
2,What is linear regression? When do we use it? 👶,Linear regression is a model that assumes a li...
3,What are the main assumptions of linear regres...,There are several assumptions of linear regres...
4,What’s the normal distribution? Why do we care...,The normal distribution is a continuous probab...


In [10]:
# Report (rows, columns) for each raw frame, one per line.
for frame in (df1, df2):
    print(frame.shape)

(660, 4)
(166, 2)


In [11]:
# Per-column missing-value counts for both frames, separated by a row of asterisks.
print(df1.isnull().sum(), "*"*10, df2.isnull().sum(), sep="\n")

question      0
role          0
category      0
difficulty    0
dtype: int64
**********
question    0
answer      0
dtype: int64


In [12]:
# Distribution of roles in df1, to see how balanced the role labels are.
df1['role'].value_counts()

Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
Software Engineer,600
HR,60


# **Data modification**

In [13]:
# df2 has no role column; label every one of its rows as "data scientist".
role = ["data scientist"] * len(df2)
df2['role'] = role

In [14]:
# Keep only the columns both frames share (question, role).
# Rebinding instead of inplace=True, together with errors='ignore', makes the
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns have already been removed.
df1 = df1.drop(columns=['difficulty', 'category'], errors='ignore')
df2 = df2.drop(columns='answer', errors='ignore')

In [15]:
# Confirm both frames now expose the same columns before concatenating them.
print(df1.columns, df2.columns, sep="\n")

Index(['question', 'role'], dtype='object')
Index(['question', 'role'], dtype='object')


In [16]:
# Stack both question sets into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it the row labels of df1
# and df2 overlap, so label-based lookups such as df.loc[0] would return two rows.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,question,role
0,Explain the difference between list and tuple.,Software Engineer
1,Explain the difference between list and array.,Software Engineer
2,Explain the difference between list and hashmap.,Software Engineer
3,Explain the difference between list and recurs...,Software Engineer
4,Explain the difference between list and inheri...,Software Engineer


In [17]:
# Sanity check: the row count should equal len(df1) + len(df2), with two columns.
df.shape

(826, 2)

# **Data preprocessing**

In [18]:
def clean_text(text):
    """Normalise a raw question string.

    Applies three substitutions in order: HTML-like tags become a space; every
    character other than ASCII letters, digits, whitespace and ``. , ? !``
    becomes a space; runs of whitespace collapse to a single space. The result
    is then stripped and lower-cased.

    Args:
        text: The raw question as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    substitutions = (
        (r"<.*?>", " "),
        (r"[^a-zA-Z0-9\s\.\,\?\!]", " "),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

In [19]:
# Element-wise cleaning of every raw question into a new column.
df['clean_question'] = df['question'].map(clean_text)

In [20]:
# Training text template: "role : <role> Question : <cleaned question>".
df['Text'] = [
    "role : " + role_name + " Question : " + question
    for role_name, question in zip(df['role'], df['clean_question'])
]

# **Fine-tuning**

In [21]:
# Only the assembled "Text" column is needed for language-model fine-tuning.
# preserve_index=False stops the pandas index from being stored as an extra
# "__index_level_0__" column in the Hugging Face dataset.
dataset = Dataset.from_pandas(df[["Text"]], preserve_index=False)

In [22]:
# Base checkpoint used for both the tokenizer and (later) the causal LM.
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" tokenization in preprocess_function has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [23]:
def preprocess_function(examples):
    """Tokenize training text and build causal-LM labels.

    Each text is terminated with the tokenizer's EOS token so the model sees an
    explicit end marker after every question, then padded/truncated to 128
    tokens. Labels are the input ids with every padded position replaced by
    -100, the label value the Hugging Face causal-LM loss ignores. Masking is
    driven by the attention mask rather than by token id because the pad token
    is the EOS token: comparing ids would also hide the real EOS terminator.

    Args:
        examples: Mapping with a "Text" entry. Either a list of strings (as
            passed by ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added "labels" entry shaped like
        "input_ids".
    """
    def _mask_padding(token_ids, attention):
        # Keep real tokens as targets; ignore padded positions in the loss.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention)]

    raw = examples["Text"]
    is_single = isinstance(raw, str)
    if is_single:
        texts = raw + tokenizer.eos_token
    else:
        texts = [text + tokenizer.eos_token for text in raw]

    inputs = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128
    )

    if is_single:
        inputs["labels"] = _mask_padding(inputs["input_ids"], inputs["attention_mask"])
    else:
        inputs["labels"] = [
            _mask_padding(token_ids, attention)
            for token_ids, attention in zip(inputs["input_ids"], inputs["attention_mask"])
        ]
    return inputs

In [24]:
# Hold out 10% of the examples for evaluation; the fixed seed keeps the split reproducible.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)

# Tokenize the train split first, then the held-out split, in batches.
train_dataset, eval_dataset = (
    dataset_split[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)

# Base causal language model that the LoRA adapter is attached to later.
model = AutoModelForCausalLM.from_pretrained(model_name)

Map:   0%|          | 0/826 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [25]:
# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
# NOTE(review): this rebinds `model` to the PEFT-wrapped model, so re-running
# the cell wraps an already wrapped model; re-run the model-loading cell first.
model = get_peft_model(model, peft_config)

args = TrainingArguments(
    output_dir="./lora-job-qa",
    per_device_train_batch_size=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    report_to="none",
    # Mixed-precision training only when a CUDA device is present.
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated for Trainer.__init__ (it triggers a
    # FutureWarning); `processing_class=` is the replacement keyword.
    processing_class=tokenizer
)

  trainer = Trainer(


In [27]:
# Run fine-tuning with the arguments configured on `trainer`
# (checkpoints go to the configured output_dir).
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the held-out eval_dataset passed to the Trainer.
trainer.evaluate()

# **Generating questions**

In [None]:
def generate_questions(job_title, num_questions=5):
    """Sample interview questions for a role from the fine-tuned model.

    The prompt follows the template the training text was built with
    ("role : <role> Question : <question>"), so the model is asked to continue
    text in the format it was fine-tuned on. The prompt ends right after the
    colon, with no trailing space, so the next token lines up with how the
    training text was tokenized.

    Args:
        job_title: Role to condition on, e.g. "Software Engineer". It is
            inserted verbatim, so it should match the spelling and casing of
            the roles in the training data.
        num_questions: Number of sequences to sample.

    Returns:
        A list of ``num_questions`` decoded strings. For a decoder-only model,
        ``generate`` returns the prompt tokens followed by the continuation,
        so each string begins with the prompt.
    """
    prompt = f"role : {job_title} Question :"

    # Leave training mode so the LoRA dropout is disabled while sampling.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=50,
        num_return_sequences=num_questions,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        # Passed explicitly so generate() does not have to guess the pad id
        # (and warn about falling back to the EOS id).
        pad_token_id=tokenizer.pad_token_id
    )

    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

# Use the role exactly as it is spelled in the training data ("Software Engineer"):
# roles were inserted into the training text without lower-casing, so a
# differently cased title is a prompt the model was never fine-tuned on.
print(generate_questions("Software Engineer"))

In [None]:
rouge = evaluate.load("rouge")

# Generate the predictions in this cell: no earlier cell stores the sampled
# questions in a variable, so `generated_questions` would otherwise be undefined.
generated_questions = generate_questions("Software Engineer")

# The training-text column is named "Text" (capital T); df["text"] raises KeyError.
# NOTE(review): references are simply the first len(generated_questions) rows of
# df, paired by position, so the scores are only a rough indication. Consider
# restricting references to the same role as the prompt.
references = df["Text"].tolist()[:len(generated_questions)]

results = rouge.compute(predictions=generated_questions, references=references)
print("ROUGE Scores:", results)