### 0. Import necessary libraries and methods

In [28]:
from project.preprocess_data import clean_data, parse_categories
from project.linear_regression import hashed_reg
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [None]:
import pandas as pd
import numpy as np

def datetime_to_hours_int(x):
    """Convert an iterable of durations into a list of hours.

    Each element of ``x`` must provide ``total_seconds()``. Despite the
    function name, the returned hours are floats (seconds divided by 3600),
    not integers.
    """
    seconds_per_hour = 3600
    hours = []
    for duration in x:
        hours.append(duration.total_seconds() / seconds_per_hour)
    return hours

def clean_data(df, remove_false_alarms=True) -> pd.DataFrame:
    """Filter and type-convert a raw occurrences table and add an ``hh`` column.

    Steps, in order:
      1. Drop rows whose 'Natureza', 'EstadoOcorrencia', 'Distrito',
         'Concelho' or 'Numero' hold specific sentinel strings.
      2. Keep only rows whose 'EstadoOcorrencia' is 'Encerrada' or
         'Falso Alarme' ('Encerrada' rows come first in the result).
      3. Parse 'Latitude'/'Longitude' (decimal comma) into numbers and
         'DataOcorrencia'/'DataFechoOperacional' into datetimes.
      4. Optionally drop the 'Falso Alarme' rows, then drop rows with any
         missing value.
      5. Add 'hh': (air + ground operatives) multiplied by the elapsed hours
         between 'DataOcorrencia' and 'DataFechoOperacional'.

    Args:
        df: Raw table; it is not modified. 'Latitude' and 'Longitude' must be
            string columns and the date columns must follow
            '%d/%m/%Y %H:%M:%S'.
        remove_false_alarms: When truthy, rows with 'EstadoOcorrencia' equal
            to 'Falso Alarme' are removed.

    Returns:
        A new DataFrame with the converted columns plus 'hh'. Progress
        information is printed to stdout along the way.
    """
    print('df initial length: ', len(df))

    # NOTE(review): these sentinels presumably mark malformed rows in the
    # source CSV -- confirm against the dataset.
    data_clean = df[df['Natureza'] != '1']
    data_clean = data_clean[data_clean['EstadoOcorrencia'] != '2']
    data_clean = data_clean[data_clean['Distrito'] != '0']
    data_clean = data_clean[data_clean['Concelho'] != '0']
    data_clean = data_clean[data_clean['Numero'] != """\t\t\t\t\t\t\t\t"""]

    # Concatenating (rather than a single mask) keeps the historical row
    # order: closed occurrences first, false alarms after.
    state = data_clean['EstadoOcorrencia']
    data_clean = pd.concat(
        [data_clean[state == 'Encerrada'], data_clean[state == 'Falso Alarme']]
    )

    # Coordinates use "," as decimal separator; swap it for "." before parsing.
    for column in ('Latitude', 'Longitude'):
        data_clean[column] = pd.to_numeric(
            data_clean[column].str.replace(',', '.', regex=False)
        )

    # Preview by position: the row labelled 0 may have been filtered out, so
    # a label lookup (series[0]) could raise KeyError.
    if not data_clean.empty:
        for column in ('Longitude', 'Latitude'):
            first_value = data_clean[column].iloc[0]
            print(first_value, ' :', type(first_value))

    date_format = '%d/%m/%Y %H:%M:%S'
    for column in ('DataOcorrencia', 'DataFechoOperacional'):
        data_clean[column] = pd.to_datetime(data_clean[column], format=date_format)

    if not data_clean.empty:
        for column in ('DataOcorrencia', 'DataFechoOperacional'):
            first_value = data_clean[column].iloc[0]
            print(first_value, ' :', type(first_value))

    if remove_false_alarms:
        data_clean = data_clean[data_clean['EstadoOcorrencia'] != 'Falso Alarme']

    data_clean = data_clean.dropna()
    print('df length: ', len(data_clean))

    staff = (
        data_clean['NumeroOperacionaisAereosEnvolvidos']
        + data_clean['NumeroOperacionaisTerrestresEnvolvidos']
    )
    elapsed = data_clean['DataFechoOperacional'] - data_clean['DataOcorrencia']
    elapsed_hours = elapsed.dt.total_seconds() / 3600

    # assign() returns a new frame, so no column is written into a filtered
    # slice of another frame.
    return data_clean.assign(hh=staff * elapsed_hours)

# Extract a single category level from a "/"-separated 'Natureza' string
def _extract_category(string, cat_index):
    """Return the ``cat_index``-th (1-based) "/"-separated part of ``string``.

    Returns ``None`` when ``string`` is not a str, equals "nan", contains any
    digit, or has fewer than ``cat_index`` parts (previously the last case
    raised IndexError). The selected part is stripped of surrounding
    whitespace.
    """
    if not isinstance(string, str) or string == "nan":
        return None
    if any(char.isdigit() for char in string):
        return None

    # At most 4 splits, i.e. up to 5 parts, as in the original implementation.
    parts = string.split("/", 4)
    if not 1 <= cat_index <= len(parts):
        return None
    return parts[cat_index - 1].strip()


# Derive the category columns from 'Natureza' using the helper above
def parse_categories(data_clean: pd.DataFrame, relevance_threshold: int = 100) -> pd.DataFrame:
    """Add 'category1', 'category2' and 'category3' columns derived from 'Natureza'.

    'Natureza' is cast to str and split on "/"; level N goes to 'categoryN'
    (``None`` where it cannot be extracted). Values of 'category3' that occur
    ``relevance_threshold`` times or fewer are replaced by
    'Outras ocorrências'.

    Args:
        data_clean: Table with a 'Natureza' column. It is modified in place.
        relevance_threshold: Maximum count at which a 'category3' value is
            still folded into 'Outras ocorrências'. Defaults to 100.

    Returns:
        The same ``data_clean`` object, for call chaining.
    """
    natureza = data_clean['Natureza'].astype(str)
    for level in (1, 2, 3):
        data_clean[f'category{level}'] = natureza.apply(_extract_category, cat_index=level)

    # Consolidate categories with little statistical relevance
    category_counts = data_clean['category3'].value_counts()
    rare_categories = category_counts[category_counts <= relevance_threshold].index

    # mask() leaves the column untouched when there are no rare categories
    # and never touches missing values.
    data_clean['category3'] = data_clean['category3'].mask(
        data_clean['category3'].isin(rare_categories), 'Outras ocorrências'
    )

    return data_clean

import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import FeatureHasher

# Read the database .CSV
# on_bad_lines="skip" makes pandas drop rows it cannot parse instead of
# raising; low_memory=False parses each column in one pass so column dtypes
# are inferred from the whole file.
occurrences = pd.read_csv(
    "Data/anpc-2016.csv", sep=",", on_bad_lines="skip", low_memory=False
)

# Get relevant and error-free data
# NOTE(review): these statements execute at module level, so the CSV is read
# and cleaned whenever this code is run or imported -- confirm that is intended.
occurrences = parse_categories(clean_data(occurrences))

def hashed_reg(occurrences, hash_size_increase=2.5):
    """Fit a linear regression of 'hh' on hashed categorical features and time.

    Features, concatenated column-wise in this order:
      * hashed 'Distrito' text,
      * hashed 'category2' text,
      * hours elapsed between each 'DataOcorrencia' and the earliest one.

    Each hash table has ``int(hash_size_increase * n_unique)`` columns. The
    previous code assigned ``hash_size_increase`` but never used it, so the
    tables had exactly ``n_unique`` columns and collisions were more likely
    than the comments intended.

    Args:
        occurrences: DataFrame with 'Distrito', 'category2', 'DataOcorrencia'
            (datetime) and 'hh' columns.
        hash_size_increase: Multiplier applied to the number of distinct
            values to size each hash table.

    Returns:
        The fitted model's predictions for the rows it was trained on.
    """
    def _hashed_frame(column):
        # One hashed feature matrix per categorical column.
        # NOTE(review): HashingVectorizer hashes text tokens, so a multi-word
        # value may set several columns -- confirm this is the intended encoding.
        n_features = int(hash_size_increase * occurrences[column].nunique())
        vectorizer = HashingVectorizer(
            n_features=n_features, norm=None, alternate_sign=False
        )
        return pd.DataFrame(vectorizer.fit_transform(occurrences[column]).toarray())

    elapsed = occurrences['DataOcorrencia'] - occurrences['DataOcorrencia'].min()
    # to_numpy() drops the original index so all three frames share a fresh
    # 0..n-1 index and concat aligns them row by row.
    hours_since_first = pd.DataFrame((elapsed.dt.total_seconds() / 3600).to_numpy())

    # Combine the encoded categorical variables and the numerical variable
    X = pd.concat(
        [_hashed_frame("Distrito"), _hashed_frame("category2"), hours_since_first],
        axis=1,
    )

    # Fit a linear regression model to the data
    model = LinearRegression()
    model.fit(X, occurrences['hh'])

    return model.predict(X)

def hashed_reg_2(occurrences):
    """Hash the 'Distrito' column into a sparse feature matrix.

    The table has ``int(2.5 * n_unique_districts)`` columns; the 2.5 factor
    adds head-room to reduce hash collisions.

    With ``input_type="string"`` FeatureHasher expects every sample to be an
    iterable of strings, so each district name is wrapped in a one-element
    list. Passing the bare names would treat each name as a sequence of
    characters (or be rejected outright by scikit-learn versions that
    validate this).

    Returns:
        The sparse matrix produced by ``FeatureHasher.transform``, one row
        per occurrence.
    """
    num_districts = occurrences["Distrito"].nunique()
    hash_size_increase = 2.5  # Increase number of features to avoid hash collision
    hasher = FeatureHasher(
        n_features=int(hash_size_increase * num_districts), input_type="string"
    )
    return hasher.transform([[district] for district in occurrences["Distrito"]])

# Fit the regression on the module-level data; the returned predictions are
# not bound to any name here.
hashed_reg(occurrences)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# shp -> shapefile (geometry)
# dbf -> database (extra info)
# prj - projection
# shx - index

# Load the municipality polygons from the shapefile
portugal_municipalities = gpd.read_file("data_concelhos/concelhos.shp")

# Remove islands: drop every row whose NAME_1 is one of the island regions
portugal_municipalities = portugal_municipalities[
    ~portugal_municipalities["NAME_1"].isin(["Azores", "Madeira"])
]

# Point defined by the user, built as Point(longitude, latitude)
lat, lon = 40.9800516, -8.202641
user_point = Point(lon, lat)


def get_municipality(point: Point):
    """Find the first municipality polygon that contains ``point``.

    Uses the module-level ``portugal_municipalities`` GeoDataFrame and a
    vectorized containment test instead of iterating row by row.

    Args:
        point: Point expressed in the same coordinates as the shapefile.

    Returns:
        A dict with keys "district" (NAME_1), "municipality" (NAME_2) and
        "municipality_gdf" (a one-row GeoDataFrame with the matching row),
        or ``None`` when no polygon contains the point.
    """
    contains_point = portugal_municipalities.geometry.contains(point)
    candidates = portugal_municipalities[contains_point]
    if candidates.empty:
        return None

    # Keep the first hit, matching the order of the original row-wise scan.
    row = candidates.iloc[0]
    return {
        "district": row.NAME_1,
        "municipality": row.NAME_2,
        "municipality_gdf": gpd.GeoDataFrame([row], crs=portugal_municipalities.crs)
    }


def plot_point_in_map(point, municipality_gdf):
    """Draw the municipalities map with one municipality and one point highlighted.

    The module-level ``portugal_municipalities`` frame is the base layer,
    ``municipality_gdf`` is drawn over it in yellow, and ``point`` is drawn
    on top as a red marker. Axes are hidden and the figure is shown.
    """
    base_axes = portugal_municipalities.plot()
    municipality_gdf.plot(ax=base_axes, color='yellow')

    marker_layer = gpd.GeoDataFrame(geometry=[point])
    marker_layer.plot(ax=base_axes, color='red', markersize=10)

    plt.axis('off')
    plt.show()


# Look up the municipality containing the user's point and highlight it on the map.
# NOTE(review): get_municipality yields None when no polygon contains the
# point, in which case the subscript below raises TypeError -- confirm whether
# that case needs handling.
municipality_data = get_municipality(point=user_point)
plot_point_in_map(user_point, municipality_data["municipality_gdf"])

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from preprocess_data import clean_data
import os

# Counts occurrences per category for every distinct value of a geographic column
def ocurrence_frequency(df:pd.DataFrame, categorylevel:str, geoscope:str, datasetisclean = False):
    """Count and plot occurrences per category for each value of ``geoscope``.

    For every distinct value of ``df[geoscope]`` (in order of first
    appearance) the rows with that value are grouped by ``df[categorylevel]``,
    a bar chart of the counts is saved to
    ``./images/<value>_ocurrences_<geoscope>_<categorylevel>.png``, and the
    counts are collected.

    Fixes relative to the previous version:
      * the uncleaned path referenced an undefined local and always raised;
        it now cleans ``df`` itself;
      * the last category of every group was dropped when the flat
        [name, count, ...] list was paired up;
      * counting no longer rescans the whole column once per row;
      * the ``images`` directory is created when missing.

    Args:
        df: Occurrences table.
        categorylevel: Name of the column holding the category to count.
        geoscope: Name of the column holding the geographic unit.
        datasetisclean: When falsy, ``df`` is first passed through
            ``clean_data``. NOTE(review): the ``categorylevel`` column must
            still exist after that step -- confirm for category columns.

    Returns:
        A list with one entry per distinct ``geoscope`` value; each entry is
        a list of ``(category, count)`` tuples sorted by category.
    """
    if not datasetisclean:
        df = clean_data(df)

    # Shorter labels for long category names so the x-axis stays readable
    abbreviations = {
        'Assistência e Prevenção a actividades humanas': "Assist e Prev a actv humanas",
        'Comprometimento total ou parcial de segurança, serviços ou estruturas': "Comp total ou parcial de seg",
        'Incêndios Urbanos ou em Área Urbanizável': "Incêndios urbanos",
        'Incêndios em Equipamento e Produtos': "Incêndio em equipm",
    }

    # savefig does not create missing directories
    os.makedirs(os.path.join('.', 'images'), exist_ok=True)

    list_of_tuples = []
    for geo_value in df[geoscope].unique():
        categories = df.loc[df[geoscope] == geo_value, categorylevel]

        counts = {}
        for category in categories:
            label = abbreviations.get(category, category)
            counts[label] = counts.get(label, 0) + 1

        pairs = sorted(counts.items(), key=lambda pair: pair[0])
        list_of_tuples.append(pairs)

        # Separate the x and y values into two lists
        x_values = [name for name, _ in pairs]
        y_values = [count for _, count in pairs]

        # Plot the data
        plt.bar(x_values, y_values)
        plt.title("Breakdown of ocurrences for" + " " + str(geo_value))
        plt.xlabel("Ocurrence type")
        plt.xticks(fontsize=8, rotation=45, ha='right')
        plt.ylabel("Number of ocurrences")
        plt.tight_layout()
        plt.savefig(os.path.join('.', 'images', f'{geo_value}_ocurrences_{geoscope}_{categorylevel}.png'))
        # Reset the shared pyplot figure before drawing the next group
        plt.clf()
        plt.cla()

    return list_of_tuples

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib import colormaps
import matplotlib.patches as mpatches
from utils import normalize_str


# Show the most prevalent issue category in each municipality
def plot_most_prevalent_issue_by_municipality(
        clean_df: pd.DataFrame,
        category_level: int = 2
):
    """Plot a map colouring each municipality by its most frequent category.

    For every value of 'Concelho' the mode of ``category{category_level}`` is
    computed; ties resolve to the first value returned by ``Series.mode()``.
    Municipalities are matched to the shapefile by normalized name and drawn
    over a gray base map, with a legend mapping colours to categories.

    Changes relative to the previous version:
      * the mode is computed on the category column only, not on every column;
      * categories are sorted before colours are assigned, so the colour of
        each category no longer depends on set iteration order;
      * shapefile rows are selected with a boolean mask instead of a
        string-built ``query``, so names containing quotes cannot break it;
      * municipalities whose category column has no non-missing value are
        skipped and stay gray.

    Args:
        clean_df: Table with 'Concelho' and ``category{category_level}`` columns.
        category_level: 1, 2 or 3; also selects the colour map
            (viridis, plasma, cividis).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    category_column = f"category{category_level}"

    issue_by_municipality = {}
    for municipality, issues in clean_df.groupby("Concelho", sort=False)[category_column]:
        modes = issues.mode()
        if modes.empty:
            continue
        issue_by_municipality[municipality] = modes.iloc[0]

    # Fixed, sorted category order so colours and legend entries stay in sync
    issue_names = sorted(set(issue_by_municipality.values()))

    # Declare the color map that will be used
    color_map_names = ["viridis", "plasma", "cividis"]
    base_color_map = colormaps[color_map_names[category_level - 1]].resampled(
        len(issue_names)
    )
    color_by_issue = {
        issue_name: base_color_map.colors[idx]
        for idx, issue_name in enumerate(issue_names)
    }

    # Get shape of municipalities and parishes
    portugal_municipalities = gpd.read_file(
        "../georeferencing/data_concelhos/concelhos.shp"
    )
    # Remove islands
    portugal_municipalities = portugal_municipalities[
        portugal_municipalities.NAME_1 != "Azores"
    ]
    portugal_municipalities = portugal_municipalities[
        portugal_municipalities.NAME_1 != "Madeira"
    ]
    # Normalize municipality name
    portugal_municipalities["NAME_2"] = (
        portugal_municipalities["NAME_2"].astype(str).apply(normalize_str)
    )

    pt_map = portugal_municipalities.plot(color="gray")

    for municipality_name, issue_name in issue_by_municipality.items():
        normalized_name = normalize_str(municipality_name)
        matching_municipalities = portugal_municipalities[
            portugal_municipalities["NAME_2"] == normalized_name
        ]
        # Draw one row at a time so the RGBA colour is always applied as a
        # single uniform colour.
        for _, row in matching_municipalities.iterrows():
            municipality_gdf = gpd.GeoDataFrame([row], crs=portugal_municipalities.crs)
            municipality_gdf.plot(ax=pt_map, color=color_by_issue[issue_name])

    pt_map.legend(
        handles=[
            mpatches.Patch(color=color_by_issue[issue_name], label=issue_name)
            for issue_name in issue_names
        ],
        loc="lower left",
        bbox_to_anchor=(0.1, 1),
    )
    plt.axis("off")
    plt.show()

### 1. Import data

In [29]:
# Load the raw occurrence records using pandas' default parsing options.
dataset = pd.read_csv("Data/anpc-2016.csv")

  dataset = pd.read_csv("Data/anpc-2016.csv")


### 2. Data preparation

In [30]:
# Clean the raw records, then derive the category1..category3 columns from 'Natureza'.
clean_dataset = parse_categories(clean_data(dataset))

df initial length:  121187
-9.002235449  : <class 'numpy.float64'>
38.68091202  : <class 'numpy.float64'>
2016-01-09 14:02:00  : <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2016-01-09 17:30:00  : <class 'pandas._libs.tslibs.timestamps.Timestamp'>
df length:  118168


### 3. Data modelling

In [31]:
# Fit the hashed-feature linear regression and keep its predictions for the
# same rows it was fitted on.
y_predicted = hashed_reg(occurrences=clean_dataset)

### 4. Evaluation

In [32]:
# R^2 of the predictions against the observed 'hh' values. The predictions
# come from the rows the model was fitted on, so this is an in-sample score.
r2_score(clean_dataset['hh'], y_predicted)

0.0002740760194571701