In [None]:
import folium
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from folium.plugins import HeatMap
from sklearn.cluster import KMeans
from scipy.stats import skew, kurtosis
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

#### Filtering the dataset by Borough Name and storing it as CSV

In [2]:
# Load the raw LFB 2019-22 incident workbook.
read_data = pd.read_excel(r'datasets/LFB_2019-22.xlsx')

# Keep only the rows whose borough name mentions Hammersmith and Fulham.
# na=False: a missing borough name counts as "no match"; without it the mask
#           contains NaN and the boolean row selection raises.
# regex=False: the borough name is a literal string, not a pattern.
borough_mask = read_data.IncGeo_BoroughName.str.contains(
    'HAMMERSMITH AND FULHAM', na=False, regex=False
)
dataset = read_data[borough_mask]

# index=False: do not write the workbook's row numbers; otherwise they come
# back as a meaningless "Unnamed: 0" column when the CSV is re-read.
dataset.to_csv('datasets/dataset.csv', index=False)

#### Removing Zero-Variance Columns
#### Those columns are IncGeo_BoroughCode, IncGeo_BoroughName, ProperCase, FRS

In [None]:
# Reload the borough extract from disk.
df = pd.read_csv(r'datasets/dataset.csv')

# A column carries no information when it holds exactly one distinct value
# (NaN is counted as a value here, so an all-NaN column is also removed).
constant_columns = [name for name in df.columns if df[name].nunique(dropna=False) == 1]
df = df.drop(columns=constant_columns)

# Report which columns were removed, in their original order.
for name in constant_columns:
    print(name)

# Persist the reduced frame, then read it back so the index, the call date and
# the integer columns get their intended types.
df.to_csv('datasets/filter_data.csv', index_label='Index')
df = pd.read_csv(
    'datasets/filter_data.csv',
    index_col='Index',
    parse_dates=["DateOfCall"],
    dtype={"CalYear": int, "HourOfCall": int},
)
df.info()

#### Code for script (re-evaluate)

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Load the borough extract.
raw_incidents = pd.read_csv("datasets/dataset.csv")

# Predictor columns and the raw label column.
features = ["TimeOfCall", "PropertyCategory", "PropertyType", "Postcode_district"]
label_column = "IncidentGroup"

# One-hot encode the predictors ONLY. Encoding the label together with the
# predictors leaves the other IncidentGroup_* indicator columns in the feature
# matrix, and those determine the target exactly (target leakage).
# NOTE(review): TimeOfCall is encoded as a category here, which yields one
# column per distinct time value - consider deriving an hour-of-day feature.
data = pd.get_dummies(raw_incidents[features])

# Binary target: True when the incident was a false alarm.
target = "IncidentGroup_False Alarm"
data[target] = raw_incidents[label_column] == "False Alarm"

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1), data[target], test_size=0.2, random_state=42
)

# Train a shallow decision tree; random_state makes tie-breaking between
# equally good splits reproducible across runs.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out set.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Export the fitted tree. class_names must follow the ascending order of the
# target classes: False ("Other") comes before True ("False Alarm").
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=["Other", "False Alarm"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render("false_alarm_tree")


### Percentage of missing values

In [None]:
# Share of missing entries per column, in percent.
missing_values = df.isna().mean().mul(100)

# Only columns that actually have gaps, ordered from smallest to largest share.
missing_values = missing_values.loc[missing_values.gt(0)].sort_values()

# Horizontal bars: one bar per column with missing data.
fig, ax = plt.subplots(figsize=(8, 10))
missing_values.plot.barh(ax=ax, color='c')
ax.set(
    xlabel='Percentage of missing values',
    title='Percentage of missing values by column',
)

plt.show()

### Graph of missing values

In [None]:
# Draw missingno's matrix plot for the whole frame. The call is the cell's
# last expression, so its return value is what the notebook displays.
msno.matrix(df)

### Heatmap

In [None]:
# Incidents classed as false alarms.
df_false_alarms = df.loc[df['IncidentGroup'].eq('False Alarm')]

# Only rows that have both coordinates can be placed on the map.
df_heatmap = df_false_alarms.dropna(subset=['Latitude', 'Longitude'])

# [lat, lon] pairs in the nested-list form HeatMap is given.
coordinates = df_heatmap[['Latitude', 'Longitude']].to_numpy().tolist()

# Base map centred on a fixed, hard-coded coordinate.
m = folium.Map(location=[51.498611, -0.210884], zoom_start=13.5)

# Build the heat layer and attach it to the map in one step.
heatmap = HeatMap(data=coordinates).add_to(m)

# Last expression of the cell: renders the map.
m

### Skewness and Kurtosis

In [None]:
# Print skewness and kurtosis for every numeric column.
for column in df.select_dtypes(include='number'):
    # scipy's skew/kurtosis propagate NaN by default, so a single missing
    # entry would turn the statistic into nan. Use the observed values only.
    observed = df[column].dropna()
    print(f"Column: {column}")
    print(f"Skewness: {skew(observed)}")
    print(f"Kurtosis: {kurtosis(observed)}")


### Density plot of missing values

In [None]:
# Leave StopCodeDescription out of the per-row missingness calculation.
df_1 = df.drop(columns='StopCodeDescription')

# Percentage of empty fields in each row.
row_missing = df_1.isna().mean(axis='columns').mul(100)

# Kernel density estimate of the row-wise percentages.
density_ax = sns.kdeplot(row_missing)
density_ax.set_xlabel('Percentage of missing values')
density_ax.set_title('Density plot of missing values per row')
plt.show()


### Count plot for each column

In [None]:
# Non-null entries per column, with the column order reversed so the first
# column of the frame ends up at the top of the horizontal bar chart.
non_null_counts = df.iloc[:, ::-1].count()

# One horizontal bar per column.
plt.figure(figsize=(12, 10))
plt.barh(y=non_null_counts.index, width=non_null_counts)

# Axis labels.
plt.ylabel('Columns')
plt.xlabel('Number of Non-Null Values')

plt.show()

### Notional cost per pump hour

In [None]:
# Notional cost divided by the rounded-up pump hours, row by row.
x = df['Notional Cost (£)'].div(df['PumpHoursRoundUp'])
avg_cost_per_hour = x.mean()

# Per-incident values as a line, with the overall mean as a dashed red reference.
plt.plot(x)
plt.axhline(avg_cost_per_hour, color='r', linestyle='--')

plt.xlabel('Index')
plt.ylabel('Cost per Hour')

plt.show()


### Elbow curve and Silhouette score

In [None]:
# Cluster only the incidents that are not false alarms. Work on a copy so the
# shared `df` keeps its original column types for every other cell (the
# filtered frame was previously built but never used, and `df` was overwritten
# in place instead).
dataset = df[df['IncidentGroup'] != 'False Alarm'].copy()

# Preprocess: integer-code the stop description and express the call time as
# an integer. 'int64' is explicit because plain `int` is platform-dependent.
le = LabelEncoder()
dataset['StopCodeDescription'] = le.fit_transform(dataset['StopCodeDescription'])
dataset['DatetimeOfCall'] = pd.to_datetime(dataset['DatetimeOfCall']).astype('int64')

# Feature matrix, standardised column by column. KMeans is distance based, so
# without scaling the timestamp (huge integers) would drown out the small
# integer codes entirely.
# NOTE(review): StopCodeDescription is a nominal category; integer codes impose
# an arbitrary ordering on it - confirm this is acceptable for the analysis.
X = dataset[['DatetimeOfCall', 'StopCodeDescription']].astype('float64')
column_spread = X.std(ddof=0).replace(0, 1.0)  # a constant column stays at 0 instead of becoming NaN
X = (X - X.mean()) / column_spread

# Fit each k once and collect both diagnostics from the same model.
inertia_values = []      # k = 1..10
silhouette_scores = []   # k = 2..10 (the silhouette needs at least two clusters)
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    labels = kmeans.labels_
    inertia_values.append(kmeans.inertia_)
    if k >= 2:
        silhouette_scores.append(silhouette_score(X, labels))

# Plot the elbow curve
plt.figure(figsize=(10, 8))
plt.plot(range(1, 11), inertia_values)
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Plot the silhouette scores
plt.figure(figsize=(10, 8))
plt.plot(range(2, 11), silhouette_scores)
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.show()