In [84]:
import pandas as pd
import spacy
from collections import Counter 
import numpy as np
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.pipeline import EntityRuler
import json

In [94]:
def load_data(file):
    """Read the UTF-8 encoded JSON file at ``file`` and return the parsed object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Load the disaster-types JSON file and print it to inspect its contents.
d_types = load_data("../data/disaster_types.json")
print(d_types)

{'disasters': ['Earthquake', 'Seismic Event', 'Tremor', 'Tsunami', 'Tidal Wave', 'Volcanic Eruption', 'Volcanic Explosion', 'Landslide', 'Rockslide', 'Mudslide', 'Avalanche', 'Snow Avalanche', 'Sinkhole', 'Ground Collapse', 'Hurricane', 'Cyclone', 'Typhoon', 'Superstorm', 'Tornado', 'Twister', 'Blizzard', 'Snowstorm', 'Ice Storm', 'Freezing Rain', 'Hailstorm', 'Hailstorm Event', 'Dust Storm', 'Sandstorm', 'Derecho', 'Windstorm', 'Flood', 'Severe Flooding', 'Flash Flood', 'Sudden Flood', 'Storm Surge', 'Coastal Flooding', 'Seiche', 'Lake Surge', 'Drought', 'Severe Drought', 'Heatwave', 'Extreme Heatwave', 'Wildfire', 'Forest Fire', 'Brush Fire', 'Firestorm', 'Cold Snap', 'Extreme Cold', 'Polar Vortex', 'Pandemic', 'Global Pandemic', 'Epidemic', 'Disease Outbreak', 'Insect Plague', 'Locust Swarm', 'Animal Stampede', 'Asteroid Impact', 'Meteor Strike', 'Solar Flare', 'Geomagnetic Storm', 'Gamma-Ray Burst', 'Nuclear Disaster', 'Radiation Leak', 'Nuclear Meltdown', 'Chemical Spill', 'Toxic 

In [129]:
# Load spaCy's small English pipeline with the dependency parser disabled.
# This module-level `nlp` is the pipeline the functions below read and modify.
nlp = spacy.load("en_core_web_sm", disable=["parser"])

def create_training_data(file, type):
    """Build entity-ruler patterns from the "disasters" list of a JSON file.

    Each entry is run through the module-level ``nlp`` pipeline and converted
    into a token pattern that matches on the lower-cased lemma of every token.

    Args:
        file: Path of the JSON file; its "disasters" key is iterated.
        type: Label attached to every generated pattern (e.g. "DISASTER").

    Returns:
        A list of ``{"label": type, "pattern": [...]}`` dicts, one per entry.
    """
    def lemma_tokens(phrase):
        # One {"LEMMA": ...} matcher per token of the phrase.
        return [{"LEMMA": tok.lemma_.lower()} for tok in nlp(phrase)]

    return [
        {"label": type, "pattern": lemma_tokens(phrase)}
        for phrase in load_data(file)["disasters"]
    ]

# Example of a generated pattern: {"label": "DISASTER", "pattern": [{"LEMMA": "wildfire"}]}

def generate_rules_n_lemma(patterns, output_dir="disaster_ner"):
    """Attach an entity ruler holding ``patterns`` to ``nlp`` and save the pipeline.

    The ruler is inserted before the "ner" component. If the pipeline already
    contains an "entity_ruler" (for example because this function already ran
    in the same kernel, or ``nlp`` was loaded from a previously saved pipeline),
    that component is removed first. Calling the function again therefore
    replaces the rules instead of failing on the duplicate component name.

    Args:
        patterns: Entity-ruler pattern dicts, each with "label" and "pattern".
        output_dir: Directory the pipeline is written to via ``nlp.to_disk``.
            Defaults to "disaster_ner".
    """
    # add_pipe refuses to register a second component under the same name,
    # so drop any earlier ruler to keep the function re-runnable.
    if "entity_ruler" in nlp.pipe_names:
        nlp.remove_pipe("entity_ruler")

    ruler = nlp.add_pipe("entity_ruler", before="ner")
    ruler.add_patterns(patterns)

    print(f"Total patterns added: {len(ruler.patterns)}")
    print(f"Sample patterns: {ruler.patterns[:5]}")  # Print first 5 to verify

    nlp.to_disk(output_dir)

# Build the DISASTER patterns from the JSON list, add them to the pipeline,
# and save the pipeline to disk.
patterns = create_training_data("../data/disaster_types.json", "DISASTER")
generate_rules_n_lemma(patterns)
# Rules already generated (check the app folder)

Total patterns added: 102
Sample patterns: [{'label': 'DISASTER', 'pattern': [{'LEMMA': 'earthquake'}]}, {'label': 'DISASTER', 'pattern': [{'LEMMA': 'seismic'}, {'LEMMA': 'event'}]}, {'label': 'DISASTER', 'pattern': [{'LEMMA': 'tremor'}]}, {'label': 'DISASTER', 'pattern': [{'LEMMA': 'tsunami'}]}, {'label': 'DISASTER', 'pattern': [{'LEMMA': 'tidal'}, {'LEMMA': 'wave'}]}]


In [160]:
# Load the tweets and keep only the id and text columns.
df = pd.read_json("../data/california_wildfires_final_data.json")
df = df[["tweet_id", "tweet_text"]]
# Take an independent copy of the first rows: columns are assigned to df1
# later on, and writing into a slice of df can raise SettingWithCopyWarning.
df1 = df.head().copy()

In [141]:
# Rebind `nlp` to the pipeline stored in the "disaster_ner" directory
# (the directory generate_rules_n_lemma writes with nlp.to_disk).
nlp = spacy.load("disaster_ner")
def test_model(text):
    """Print each entity the ``nlp`` pipeline finds in the lower-cased ``text``."""
    lowered = text.lower()
    for entity in nlp(lowered).ents:
        print(f"Entity: {entity.text} | Label: {entity.label_}")
# Show the first sample tweet, then print the entities found in a few
# hand-written sentences and in that tweet.
print(df1.loc[0, "tweet_text"])

test_model("RT @Cal_OES: PLS SHARE: Were capturing Wildfire response, recovery info here:")
test_model("PHOTOS: Deadly wildfires rage in California")
test_model(df1.loc[0, "tweet_text"])
test_model("Huge tidal waves hit Tokyo")


      

RT @Gizmodo: Wildfires raging through Northern California are terrifying
Entity: wildfire | Label: DISASTER
Entity: wildfires | Label: DISASTER
Entity: california | Label: GPE
Entity: wildfires | Label: DISASTER
Entity: northern california | Label: LOC
Entity: tidal waves | Label: DISASTER
Entity: tokyo | Label: GPE


In [161]:
def extract_entities(text):
    """Collect disaster and location entity texts from one piece of text.

    The original text is printed, then lower-cased and run through the
    module-level ``nlp`` pipeline. Entities labelled "DISASTER" go into the
    "disasters" list; entities labelled "GPE", "LOC" or "FAC" go into the
    "locations" list. All other entities are ignored.

    Returns:
        A dict with the keys "disasters" and "locations", each a list of str.
    """
    print(text)
    found = {"disasters": [], "locations": []}
    for entity in nlp(text.lower()).ents:
        label = entity.label_
        if label == "DISASTER":
            found["disasters"].append(entity.text)
        elif label in ("GPE", "LOC", "FAC"):
            found["locations"].append(entity.text)
    return found

# Run extract_entities on every tweet. Wrapping each returned dict in a
# pd.Series makes apply() produce one column per dict key, which is then
# assigned to the "disasters" and "locations" columns of df1.
df1.loc[:, ["disasters", "locations"]] = df1["tweet_text"].apply(
    lambda x: pd.Series(extract_entities(x))
)

print(df1)

             tweet_id                                         tweet_text
0  917791044158185472  RT @Gizmodo: Wildfires raging through Northern...
1  917791130590183424      PHOTOS: Deadly wildfires rage in California  
2  917791291823591424  RT @Cal_OES: PLS SHARE: Were capturing wildfir...
3  917791291823591424  RT @Cal_OES: PLS SHARE: Were capturing wildfir...
4  917792092100988928  RT @TIME: California's raging wildfires as you...
RT @Gizmodo: Wildfires raging through Northern California are terrifying
PHOTOS: Deadly wildfires rage in California  
RT @Cal_OES: PLS SHARE: Were capturing wildfire response, recovery info here:  
RT @Cal_OES: PLS SHARE: Were capturing wildfire response, recovery info here:  
RT @TIME: California's raging wildfires as you've never seen them before  
             tweet_id                                         tweet_text  \
0  917791044158185472  RT @Gizmodo: Wildfires raging through Northern...   
1  917791130590183424      PHOTOS: Deadly wildfires rage

In [144]:
# Checking all entities that were found
def extract_all_entities(text, index=None):
    """Print every entity (text and label) the ``nlp`` pipeline finds in ``text``.

    The text is passed to the pipeline as-is, without lower-casing.

    Args:
        text: The text to analyse.
        index: Optional identifier printed in front of each entity as
            "<index> -> ". It is passed in explicitly so the function does not
            depend on a global loop variable; when omitted, no prefix is printed.
    """
    prefix = "" if index is None else f"{index} -> "
    doc = nlp(text)
    for ent in doc.ents:
        print(f"{prefix}{ent.text}, {ent.label_} \n")

# enumerate yields the row position directly, so the loop does not rely on
# df1's index labels being 0..n-1.
for i, tweet_text in enumerate(df1["tweet_text"]):
    extract_all_entities(tweet_text, i)

0 -> RT @Gizmodo, ORG 

0 -> Northern California, LOC 

1 -> wildfires, DISASTER 

1 -> California, GPE 

2 -> PLS SHARE, ORG 

2 -> wildfire, DISASTER 

3 -> PLS SHARE, ORG 

3 -> wildfire, DISASTER 

4 -> RT @TIME, PERSON 

4 -> California, GPE 

4 -> wildfires, DISASTER 

