In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df=pd.read_excel('processed_data.xlsx')

In [3]:
# Convert DateOfCall to datetime
df['DateOfCall'] = pd.to_datetime(df['DateOfCall'], format='%d %b %Y')

# Convert TimeOfCall to datetime.time
df['TimeOfCall'] = pd.to_datetime(df['TimeOfCall']).dt.time

# Convert columns to category data type
df['IncidentGroup'] = df['IncidentGroup'].astype('category')
df['StopCodeDescription'] = df['StopCodeDescription'].astype('category')
df['PropertyCategory'] = df['PropertyCategory'].astype('category')
df['PropertyType'] = df['PropertyType'].astype('category')
df['Postcode_district'] = df['Postcode_district'].astype('category')
df['IncGeo_WardName'] = df['IncGeo_WardName'].astype('category')
df['IncidentStationGround'] = df['IncidentStationGround'].astype('category')
df['FirstPumpArriving_DeployedFromStation'] = df['FirstPumpArriving_DeployedFromStation'].astype('category')

# Print data types
print(df.dtypes)

IncidentNumber                                   object
DateOfCall                               datetime64[ns]
CalYear                                           int64
TimeOfCall                                       object
HourOfCall                                        int64
IncidentGroup                                  category
StopCodeDescription                            category
PropertyCategory                               category
PropertyType                                   category
Postcode_district                              category
IncGeo_WardName                                category
Easting_rounded                                   int64
Northing_rounded                                  int64
IncidentStationGround                          category
FirstPumpArriving_AttendanceTime                float64
FirstPumpArriving_DeployedFromStation          category
NumStationsWithPumpsAttending                   float64
NumPumpsAttending                               

# Main code starts from here (Association Analysis)


In [13]:
subset = df[['IncidentGroup', 'StopCodeDescription', 'Postcode_district', 'HourOfCall']]
# Convert 'HourOfCall' to categorical variable
subset['HourCategory'] = pd.cut(subset['HourOfCall'], bins=[-1, 6, 12, 18, 23], labels=['Night', 'Morning', 'Afternoon', 'Evening'])
# Drop the 'HourOfCall' column
subset = subset.drop('HourOfCall', axis=1)
fire_secondary = subset[(subset['IncidentGroup'] == 'Fire') & (subset['StopCodeDescription'] == 'Secondary Fire')]
fire_secondary = fire_secondary.drop('IncidentGroup', axis=1)
fire_secondary = fire_secondary.drop('StopCodeDescription', axis=1)
new_list = []
for index, row in subset.iterrows():
    new_list.append([row['Postcode_district'], row['HourCategory']])
#Let's transform the list, with one-hot encoding
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(new_list).transform(new_list)
dff = pd.DataFrame(a_data,columns=a.columns_)
dff = dff.replace(False,0)
# Find frequent itemsets using Apriori algorithm
frequent_itemsets = apriori(dff, min_support=0.05, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Print top 5 rules by lift
#print(rules.nlargest(5, "confidence"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Define a function to format each rule as a string
def format_rule(row):
    antecedents = ", ".join([str(x) for x in row["antecedents"]])
    consequents = ", ".join([str(x) for x in row["consequents"]])
    sup = round(row["antecedent support"], 2)
    conf = round(row["confidence"], 2)
    lift = round(row["lift"], 2)
    supp = round(row["support"], 2)
    zhang = round(row["zhangs_metric"], 2)
    return f"{antecedents}->{consequents}: sup={sup}, conf={conf}, lift={lift}, supp={supp}, zhang={zhang}"

# Format each rule as a string and join them together with a separator
result = "; \n".join(rules.nlargest(5, "confidence").apply(format_rule, axis=1))

# Print the formatted rules
print(result)

KT3->Afternoon: sup=0.17, conf=0.36, lift=1.07, supp=0.06, zhang=0.08; 
KT2->Afternoon: sup=0.21, conf=0.34, lift=1.0, supp=0.07, zhang=0.0; 
KT1->Afternoon: sup=0.22, conf=0.34, lift=1.0, supp=0.08, zhang=0.0; 
KT2->Morning: sup=0.21, conf=0.31, lift=1.09, supp=0.07, zhang=0.11; 
KT6->Morning: sup=0.17, conf=0.3, lift=1.05, supp=0.05, zhang=0.06
