In [79]:
import pandas as pd
import spacy
from collections import Counter 
import numpy as np
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [80]:
NER = spacy.load('en_core_web_sm')

In [81]:
def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

d_types = load_data("../data/disaster_types.json")
print(d_types)

{'disasters': ['Earthquake', 'Seismic Event', 'Tremor', 'Tsunami', 'Tidal Wave', 'Volcanic Eruption', 'Volcanic Explosion', 'Landslide', 'Rockslide', 'Mudslide', 'Avalanche', 'Snow Avalanche', 'Sinkhole', 'Ground Collapse', 'Hurricane', 'Cyclone', 'Typhoon', 'Superstorm', 'Tornado', 'Twister', 'Blizzard', 'Snowstorm', 'Ice Storm', 'Freezing Rain', 'Hailstorm', 'Hailstorm Event', 'Dust Storm', 'Sandstorm', 'Derecho', 'Windstorm', 'Flood', 'Severe Flooding', 'Flash Flood', 'Sudden Flood', 'Storm Surge', 'Coastal Flooding', 'Seiche', 'Lake Surge', 'Drought', 'Severe Drought', 'Heatwave', 'Extreme Heatwave', 'Wildfire', 'Forest Fire', 'Brush Fire', 'Firestorm', 'Cold Snap', 'Extreme Cold', 'Polar Vortex', 'Pandemic', 'Global Pandemic', 'Epidemic', 'Disease Outbreak', 'Insect Plague', 'Locust Swarm', 'Animal Stampede', 'Asteroid Impact', 'Meteor Strike', 'Solar Flare', 'Geomagnetic Storm', 'Gamma-Ray Burst', 'Nuclear Disaster', 'Radiation Leak', 'Nuclear Meltdown', 'Chemical Spill', 'Toxic 

In [91]:
def create_training_data(file, type):
    data = load_data(file)
    disasters = data["disasters"]
    patterns = []
    for item in disasters:
        pattern = {
            "label": type,
            "pattern": item
        } 
        patterns.append(pattern)
    return patterns

def generate_rules(patterns):
    nlp = English()
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
    ruler.add_patterns(patterns)

    print(f"Total patterns added: {len(ruler.patterns)}")
    print(f"Sample patterns: {ruler.patterns[:5]}")  # Print first 5 to verify
    
    nlp.to_disk("disaster_ner")

patterns = create_training_data("../data/disaster_types.json", "DISASTER")
generate_rules(patterns)
#Rules already generated (check app folder)

Total patterns added: 102
Sample patterns: [{'label': 'DISASTER', 'pattern': 'Earthquake'}, {'label': 'DISASTER', 'pattern': 'Seismic Event'}, {'label': 'DISASTER', 'pattern': 'Tremor'}, {'label': 'DISASTER', 'pattern': 'Tsunami'}, {'label': 'DISASTER', 'pattern': 'Tidal Wave'}]


In [83]:
df = pd.read_json("../data/california_wildfires_final_data.json")
df = df[["tweet_id", "tweet_text"]]
df1 = df.head()

In [93]:
def test_model(text):
    nlp = spacy.load("disaster_ner")
    print(nlp.pipe_names)
    doc = nlp(text)

    if not doc.ents:
        print("No entities detected")
    for ent in doc.ents:
        print("Entities found:")
        print(ent.text, ent.label_)

test_model("earthquake")
    

['entity_ruler']
No entities detected


In [52]:
def extract_locations(text):
    doc = NER(text)
    return [ent.text for ent in doc.ents if ent.label_ == "GPE" or ent.label_ == "LOC"]

location_dict = {}
for i in range(len(df1)):
    locations = extract_locations(df1.loc[i, "tweet_text"])
    location_dict[i] = locations
    print(df1.loc[i, "tweet_text"])
    print(location_dict[i])

all_locations = [loc for locs in location_dict.values() for loc in locs]

location_counts = Counter(all_locations)

print("Extracted locations: ", location_dict)
print("Location counts: ", location_counts)

RT @Gizmodo: Wildfires raging through Northern California are terrifying  
['Northern California']
PHOTOS: Deadly wildfires rage in California  
['California']
RT @Cal_OES: PLS SHARE: Were capturing wildfire response, recovery info here:  
[]
RT @Cal_OES: PLS SHARE: Were capturing wildfire response, recovery info here:  
[]
RT @TIME: California's raging wildfires as you've never seen them before  
['California']
Extracted locations:  {0: ['Northern California'], 1: ['California'], 2: [], 3: [], 4: ['California']}
Location counts:  Counter({'California': 2, 'Northern California': 1})


In [13]:
#checking all entities that were found
def extract_all_entities(text):
    doc = NER(text)
    for ent in doc.ents:
        print(f"{i} -> {ent.text}, {ent.label_} \n")

for i in range(len(df1)):
    extract_all_entities(df1.loc[i, "tweet_text"])

0 -> RT @Gizmodo, ORG 

0 -> Northern California, LOC 

1 -> California, GPE 

2 -> PLS SHARE, ORG 

3 -> PLS SHARE, ORG 

4 -> RT @TIME, PERSON 

4 -> California, GPE 

