<a href="https://colab.research.google.com/github/mmilannaik/BigOCheatSheet/blob/master/NLP_3_Synthetic_NER_from_Housing_BERT%20NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏗️ Generate Synthetic NER Data from Bangalore Housing CSV
This notebook takes structured property data (area_type, location, size, society) and turns it into:
- Natural-language style sentences (e.g., "Looking for a 2 BHK in Indiranagar")
- CoNLL-style NER training format

**Target Labels:**
- `B-LOCALITY`, `B-CONFIG`, `B-SOCIETY`

# Configurations

In [3]:
# 1. Install the Kaggle CLI
!pip install kaggle --quiet

# 2. Upload your Kaggle API token
#    • On Kaggle: Account → Create New API Token → download kaggle.json
#    • In Colab:
from google.colab import files
files.upload()   # select your kaggle.json

# 3. Configure the CLI
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
!kaggle datasets download -d ameythakur20/bangalore-house-prices
!unzip bangalore-house-prices.zip

Dataset URL: https://www.kaggle.com/datasets/ameythakur20/bangalore-house-prices
License(s): unknown
Archive:  bangalore-house-prices.zip
  inflating: bengaluru_house_prices.csv  


# Data Preparation

In [5]:
# Step 1: Load structured data
import pandas as pd

df = pd.read_csv('/content/bengaluru_house_prices.csv')

In [6]:
df.shape

(13320, 9)

In [7]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [8]:
df = df[['area_type', 'location', 'size', 'society']].dropna()
df = df[df['location'].str.strip().astype(bool)]
df.head()

Unnamed: 0,area_type,location,size,society
0,Super built-up Area,Electronic City Phase II,2 BHK,Coomee
1,Plot Area,Chikka Tirupathi,4 Bedroom,Theanmp
3,Super built-up Area,Lingadheeranahalli,3 BHK,Soiewre
5,Super built-up Area,Whitefield,2 BHK,DuenaTa
6,Super built-up Area,Old Airport Road,4 BHK,Jaades


In [9]:
# Step 2: Generate synthetic sentences + NER tags
synthetic_data = []

for _, row in df.iterrows():
    locality = row['location'].strip()
    config = row['size'].strip()
    society = row['society'].strip()
    base = f"Looking for a {config} in {locality}"
    if society:
        base += f" near {society} society."
    else:
        base += "."

    tokens = base.replace(',', '').replace('.', '').split()
    tags = []
    for token in tokens:
        if token in config.split():
            tags.append("B-CONFIG" if not tags or tags[-1] != "B-CONFIG" else "I-CONFIG")
        elif token in locality.split():
            tags.append("B-LOCALITY" if not tags or tags[-1] != "B-LOCALITY" else "I-LOCALITY")
        elif society and token in society.split():
            tags.append("B-SOCIETY" if not tags or tags[-1] != "B-SOCIETY" else "I-SOCIETY")
        else:
            tags.append("O")
    synthetic_data.append(list(zip(tokens, tags)))

In [10]:
# Step 3: Split and export CoNLL-format text files
from sklearn.model_selection import train_test_split
from pathlib import Path

train_data, test_data = train_test_split(synthetic_data, test_size=0.2, random_state=42)

def write_conll(data, file_path):
    with open(file_path, 'w') as f:
        for sentence in data:
            for token, tag in sentence:
                f.write(f"{token} {tag}\n")
            f.write("\n")

write_conll(train_data, "/content/synthetic_ner_train.txt")
write_conll(test_data, "/content/synthetic_ner_test.txt")

print("✅ Export complete: synthetic_ner_train.txt and synthetic_ner_test.txt")

✅ Export complete: synthetic_ner_train.txt and synthetic_ner_test.txt


# Bert Starting

In [11]:
# Step 1: Install dependencies
!pip install transformers datasets evaluate seqeval -q

In [2]:
import torch
torch.cuda.is_available()

True

In [17]:
from datasets import Dataset
from pathlib import Path

# def read_conll(filepath):
#     tokens, tags = [], []
#     sentence_tokens, sentence_tags = [], []
#     with open(filepath, 'r') as file:
#         for line in file:
#             if line.strip() == '':
#                 if sentence_tokens:
#                     tokens.append(sentence_tokens)
#                     tags.append(sentence_tags)
#                     sentence_tokens, sentence_tags = []
#             else:
#                 token, tag = line.strip().split()
#                 sentence_tokens.append(token)
#                 sentence_tags.append(tag)
#     return {'tokens': tokens, 'ner_tags': tags}

def read_conll(filepath):
    tokens = []
    tags = []
    sentence_tokens, sentence_tags = [], []

    with open(filepath, 'r') as file:
        for line_num, line in enumerate(file, 1):
            line = line.strip()

            if line == '':
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    tags.append(sentence_tags)
                    sentence_tokens, sentence_tags = [], []
            else:
                parts = line.split()
                if len(parts) != 2:
                    raise ValueError(f"Line {line_num} is malformed: {line!r}")
                token, tag = parts
                sentence_tokens.append(token)
                sentence_tags.append(tag)

    return {'tokens': tokens, 'ner_tags': tags}


train_path = '/content/synthetic_ner_test.txt'
test_path = '/content/synthetic_ner_test.txt'

train_data = read_conll(train_path)
test_data = read_conll(test_path)

# Build label list
unique_labels = sorted(set(tag for sent in train_data['ner_tags'] for tag in sent))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Map labels to IDs
train_data['labels'] = [[label2id[tag] for tag in tags] for tags in train_data['ner_tags']]
test_data['labels'] = [[label2id[tag] for tag in tags] for tags in test_data['ner_tags']]

train_ds = Dataset.from_dict({'tokens': train_data['tokens'], 'ner_tags': train_data['labels']})
test_ds = Dataset.from_dict({'tokens': test_data['tokens'], 'ner_tags': test_data['labels']})

# Tokenization

In [18]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    word_ids = tokenized.word_ids()
    aligned_labels = []
    prev_word_idx = None
    for idx in word_ids:
        if idx is None:
            aligned_labels.append(-100)
        elif idx != prev_word_idx:
            aligned_labels.append(example['ner_tags'][idx])
        else:
            aligned_labels.append(-100)
        prev_word_idx = idx
    tokenized['labels'] = aligned_labels
    return tokenized

train_ds = train_ds.map(tokenize_and_align_labels)
test_ds = test_ds.map(tokenize_and_align_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/1561 [00:00<?, ? examples/s]

Map:   0%|          | 0/1561 [00:00<?, ? examples/s]

In [19]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import evaluate

model = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label2id), id2label=id2label, label2id=label2id)
args = TrainingArguments(
    output_dir='bert-realestate-ner',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    num_train_epochs=3,
    logging_dir='./logs'
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = evaluate.load('seqeval')

def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(axis=-1)
    true_preds = [[id2label[p] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(preds, labels)]
    true_labels = [[id2label[l] for (p, l) in zip(pred, lab) if l != -100] for pred, lab in zip(preds, labels)]
    return metric.compute(predictions=true_preds, references=true_labels)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'