In [33]:
! pip install -U spacy -q

In [34]:
!python -m spacy info

[1m

spaCy version    3.7.4                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.58+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1)        



In [35]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
import json
f = open('test_data.json')
TRAIN_DATA = json.load(f)

In [38]:
TRAIN_DATA

{'classes': ['DATE',
  'PERSON',
  'PLACE',
  'CRIME NAME',
  'CRIME ACT SECTION',
  'ORGANISATION',
  'ENTITY',
  'AGE',
  'TIME',
  'POLICE RANK',
  'POLICE STATION',
  'DAY',
  'YEAR'],
 'annotations': [['Mangaluru: In a case of display of brutality, a purported video which captured a daughter-in-law brutally beating her octogenarian father-in-law with a steel walking stick has sent shockwaves across social media platforms.\r',
   {'entities': [[0, 9, 'PLACE'],
     [35, 44, 'CRIME NAME'],
     [97, 113, 'CRIME NAME'],
     [152, 171, 'ENTITY']]}],
  ['The footage, which has swiftly gone viral, showcases an act of abuse that has sparked widespread condemnation and calls for justice among the civil society in Mangaluru.\r',
   {'entities': [[4, 11, 'ENTITY'],
     [56, 68, 'CRIME NAME'],
     [159, 168, 'PLACE']]}],
  ['\r', {'entities': []}],
  ['In the incident caught on CCTV, a 87-year-old man, Pa dmanabha Suvarna, was subjected to vicious beating by his daughter-in-law, Uma Shank

In [39]:
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 214/214 [00:00<00:00, 2512.90it/s]


In [40]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [41]:
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     53.85    0.00    0.00    0.00    0.00
  4     200       2237.31   3445.96   68.36   71.23   65.72    0.68
 11     400        370.95   1128.00   94.06   94.30   93.81    0.94
 19     600        210.68    361.03   98.84   98.97   98.71    0.99
 29     800        343.28    215.85   99.74   99.74   99.74    1.00
 41    1000         43.52     21.35  100.00  100.00  100.00    1.00
 56    1200        101.45     47.04  100.00  100.00  100.00    1.00
 75    1400         83.19     23.29  100.00  100.00  100.00    1.00
 98    1600        259.87     74.89  100.00  100.00  100.00    1.00
127    1800         65.23     20.16  100.00  100.00

In [42]:
nlp_ner = spacy.load("/content/model-best")

In [43]:
doc = nlp_ner('''A 22-year-old man was arrested on Saturday for allegedly raping a woman here and threatening to upload her private video online, police said. Rahul Yadav, a resident of Handia in Prayagraj district, was booked on Friday under charges of rape and criminal intimidation for allegedly raping a 20-year-old woman.
He was arrested from near Gyanpur-Gopiganj tri-section here, police said. Gyanpur Police Station SHO Vinod Kumar Tiwari Yadav used to talk to the woman while visiting his elder's brother's in-laws who live in her village.

On March 3, he took her on a motorcycle to a place and raped her and also made a video of it, he said. According to police, after the incident, he blackmailed her into having sex with him several times till March 5 using the video.

Tiwari said the victim herself approached police on Friday evening and lodged a complaint against Yadav. The medical examination of the woman is yet to be conducted, the SHO said.''')

In [44]:
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [45]:
doc = nlp_ner('''A 17-year-old girl was kidnapped from the Ubhaon area of Uttar Pradesh's Ballia district and was taken to Karnataka, where she was raped for almost two months. She was rescued by police, officials said on Wednesday.

The accused man was also arrested on Tuesday and sent to jail, they said.

The girl was allegedly abducted by a 20-year-old man from her village on the night of August 14. On the complaint of the girl's mother, a case was registered under sections 363 (kidnapping) and 366 (kidnapping or inducing a woman to compel for marriage) of the Indian Penal Code (IPC) on September 11, Ubhaon SHO Rajiv Mishra said.

On Tuesday, the police rescued the girl from near Bilthra Roadways and arrested the accused, he said.

In her statement to the police, the girl said the accused kidnapped and took her to Karnataka where he raped her for almost two months, the officer said.

Based on the statement, the police have added Section 376 (rape) of the IPC and provisions of the Protection of Children from Sexual Offenses (POCSO) Act in the case, he added.''')

In [46]:
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter