In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/viirs-archive.csv')

In [3]:
df = df[df['confidence'].isin(['h', 'n'])]

In [4]:
df.shape

(3453287, 15)

In [5]:
df = df.drop_duplicates(subset=['latitude', 'longitude', 'acq_date', 'acq_time'])

In [6]:
df.columns

Index(['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date',
       'acq_time', 'satellite', 'instrument', 'confidence', 'version',
       'bright_t31', 'frp', 'daynight', 'type'],
      dtype='object')

In [7]:
df['type'].value_counts()

type
0    3075759
2     362885
3      14457
1        185
Name: count, dtype: int64

In [8]:
model_data = df[['latitude', 'longitude', 'acq_date', 'daynight']]

In [9]:
model_data['acq_date'] = pd.to_datetime(df['acq_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['acq_date'] = pd.to_datetime(df['acq_date'])


In [10]:
model_data['daynight'] = model_data['daynight'].map({'D': 0, 'N': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['daynight'] = model_data['daynight'].map({'D': 0, 'N': 1})


In [11]:
model_data = model_data.rename(columns={
    'latitude': 'lat',
    'longitude': 'lon',
    'acq_date': 'date',
    'daynight': 'night'
})

In [12]:
model_data = model_data.sample(n=100000, random_state=42)

In [14]:
model_data['label'] = 1

In [16]:
model_data.head()

Unnamed: 0,lat,lon,date,night,label
2650995,18.19149,95.5533,2024-03-25,0,1
1571830,15.54539,102.95925,2023-04-21,0,1
2574488,9.78367,105.51168,2024-03-17,0,1
437807,22.77363,96.01568,2023-02-07,0,1
417287,20.69721,94.09389,2023-02-05,1,1


In [17]:
# for fast lookup
fire_keys = set(model_data.apply(lambda row: f"{round(row.lat, 3)}_{round(row.lon, 3)}_{row.date}", axis=1))

In [None]:
def generate_negative_sample(lat, lon, date, night, radius_km=50):
    attempts = 0
    while attempts < 10:
        # random bearing + distance
        angle = random.uniform(0, 360)
        distance_km = random.uniform(10, radius_km)

        # Rough earth radius projection
        delta_lat = distance_km / 111  # approx 1 deg = 111km
        delta_lon = delta_lat / np.cos(np.radians(lat))

        new_lat = lat + delta_lat * np.cos(np.radians(angle))
        new_lon = lon + delta_lon * np.sin(np.radians(angle))

        key = f"{round(new_lat, 3)}_{round(new_lon, 3)}_{date}"
        if key not in fire_keys:
            return {'lat': new_lat, 'lon': new_lon, 'date': date, 'night': night, 'label': 0}
        attempts += 1
    return None

# How many negative samples per fire point
NEG_PER_POS = 2

# Generate negatives
negatives = []
for _, row in model_data.iterrows():
    for _ in range(NEG_PER_POS):
        neg = generate_negative_sample(row.lat, row.lon, row.date, row.night)
        if neg:
            negatives.append(neg)

neg_df = pd.DataFrame(negatives)


In [26]:
neg_df

Unnamed: 0,lat,lon,date,night,label
0,18.080140,95.549437,2024-03-25,0,0
1,18.084024,95.455099,2024-03-25,0,0
2,15.268696,103.311455,2023-04-21,0,0
3,15.688964,103.266818,2023-04-21,0,0
4,10.056063,105.642383,2024-03-17,0,0
...,...,...,...,...,...
199995,23.927267,80.016759,2023-11-23,0,0
199996,15.545627,100.924812,2025-01-12,0,0
199997,15.673956,101.383327,2025-01-12,0,0
199998,29.319594,82.439699,2025-01-09,1,0


In [27]:
combined = pd.concat([model_data, neg_df], ignore_index=True)

In [30]:
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
combined = combined.rename(columns={
    'lat': 'latitude',
    'lon': 'longitude',
})

In [31]:
combined.to_csv('data/viirs-model-data.csv', index=False)