In [1]:
# Set up for running the code
import pandas
import math
import numpy
import sklearn
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth


# Read in the data from the CSV file
dataframe = pandas.read_csv("Crime_Data_from_2020_to_Present.csv", dtype={'TIME OCC': str})


In [2]:
# Get only the date in the DATE OCC column
dataframe['DATE OCC'] = dataframe['DATE OCC'].str.split().str.get(0)

In [3]:
# Change the type of the column to date
dataframe['day'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.day
dataframe['month'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.month
dataframe['year'] = pandas.to_datetime(dataframe['DATE OCC'], format='%m/%d/%Y').dt.year

In [4]:
# Change the TIME OCC column to time
dataframe['hour'] = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M').dt.hour
dataframe['minute'] = pandas.to_datetime(dataframe['TIME OCC'], format='%H%M').dt.minute

In [5]:
# Keep only violent crimes
dataframe.dropna(subset=['Crm Cd'])

agregousCrimeCDs = [
    231,
    230,
    624,
    622,
    623,
    860,
    110,
    753,
    822,
    921,
    882,
    910,
    920,
    113,
    625,
    122,
    121,
    251,
    250,
]

col = dataframe["Crm Cd"]


for cd in pandas.unique(col):
    if not cd in agregousCrimeCDs:
        dataframe = dataframe.loc[dataframe["Crm Cd"] != cd ]


dataframe

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Crm Cd 4,LOCATION,Cross Street,LAT,LON,day,month,year,hour,minute
0,10304468,01/08/2020 12:00:00 AM,01/08/2020,2230,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,...,,1100 W 39TH PL,,34.0141,-118.2978,8,1,2020,22,30
1,190101086,01/02/2020 12:00:00 AM,01/01/2020,0330,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,...,,700 S HILL ST,,34.0459,-118.2545,1,1,2020,3,30
5,200100501,01/02/2020 12:00:00 AM,01/01/2020,0030,1,Central,163,1,121,"RAPE, FORCIBLE",...,,700 S BROADWAY,,34.0452,-118.2534,1,1,2020,0,30
24,200100546,01/15/2020 12:00:00 AM,01/15/2020,0700,1,Central,166,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,600 SAN JULIAN ST,,34.0428,-118.2461,15,1,2020,7,0
27,200100552,01/19/2020 12:00:00 AM,01/19/2020,2000,1,Central,111,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,ALAMEDA,LOS ANGELES,34.0578,-118.2371,19,1,2020,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802932,231300825,06/07/2023 12:00:00 AM,06/07/2023,2203,13,Newton,1322,2,624,BATTERY - SIMPLE ASSAULT,...,,2300 WALL ST,,34.0254,-118.2629,7,6,2023,22,3
802934,231608412,05/21/2023 12:00:00 AM,05/20/2023,2130,16,Foothill,1663,2,624,BATTERY - SIMPLE ASSAULT,...,,12100 SHELDON ST,,34.2374,-118.3964,20,5,2023,21,30
802935,230512110,08/09/2023 12:00:00 AM,08/09/2023,1310,5,Harbor,516,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,1200 N AVALON BL,,33.7868,-118.2658,9,8,2023,13,10
802952,231606525,03/22/2023 12:00:00 AM,03/22/2023,1000,16,Foothill,1602,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,,12800 FILMORE ST,,34.2790,-118.4116,22,3,2023,10,0


In [6]:
# Drop all unneeded columns
dataframe = dataframe.drop(columns=['Crm Cd', 'DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'Rpt Dist No', 'Part 1-2', 'Crm Cd Desc', 'AREA NAME', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', "Weapon Desc", "Status", "Status Desc", "Crm Cd 1", "Crm Cd 2", "Crm Cd 3", "Crm Cd 4", "LOCATION", "Cross Street", "LAT", "LON", "Mocodes"])

In [7]:
dataframe

Unnamed: 0,AREA,Vict Age,Vict Sex,Vict Descent,day,month,year,hour,minute
0,3,36,F,B,8,1,2020,22,30
1,1,25,M,H,1,1,2020,3,30
5,1,25,F,H,1,1,2020,0,30
24,1,62,M,A,15,1,2020,7,0
27,1,71,M,W,19,1,2020,20,0
...,...,...,...,...,...,...,...,...,...
802932,13,45,F,B,7,6,2023,22,3
802934,16,8,F,H,20,5,2023,21,30
802935,5,59,M,H,9,8,2023,13,10
802952,16,25,F,H,22,3,2023,10,0


In [8]:
# Change the types of all of the data to strings
dataframe.astype(str).dtypes

AREA            object
Vict Age        object
Vict Sex        object
Vict Descent    object
day             object
month           object
year            object
hour            object
minute          object
dtype: object

In [9]:
# Modify the values so all values are unique
cols = ["AREA", "Vict Age", "Vict Sex", "Vict Descent", "day", "month", "year", "hour", "minute"]
pref = ['a', 'A', 's', 'r', 'D','M','Y','h','m']

for i in range(len(cols)):

    print(cols[i])

    # get column to modify
    col = dataframe[cols[i]]

    # print(col)

    # Get all the unique values
    uniques = pandas.unique(col)

    # Add the prefix to all of the unique data and replace all values
    for u in uniques:
        if i == 5:
            print(u)
        new = str(pref[i]) + str(u)
        col = col.replace(u, new)

    # Set the new column to its values
    dataframe[cols[i]] = col

    print(cols[i], " is done.\n")

AREA
AREA  is done.

Vict Age
Vict Age  is done.

Vict Sex
Vict Sex  is done.

Vict Descent
Vict Descent  is done.

day
day  is done.

month
1
10
8
6
2
7
3
12
5
9
11
4
month  is done.

year
year  is done.

hour
hour  is done.

minute
minute  is done.



In [10]:
dataframe

Unnamed: 0,AREA,Vict Age,Vict Sex,Vict Descent,day,month,year,hour,minute
0,a3,A36,sF,rB,D8,M1,Y2020,h22,m30
1,a1,A25,sM,rH,D1,M1,Y2020,h3,m30
5,a1,A25,sF,rH,D1,M1,Y2020,h0,m30
24,a1,A62,sM,rA,D15,M1,Y2020,h7,m0
27,a1,A71,sM,rW,D19,M1,Y2020,h20,m0
...,...,...,...,...,...,...,...,...,...
802932,a13,A45,sF,rB,D7,M6,Y2023,h22,m3
802934,a16,A8,sF,rH,D20,M5,Y2023,h21,m30
802935,a5,A59,sM,rH,D9,M8,Y2023,h13,m10
802952,a16,A25,sF,rH,D22,M3,Y2023,h10,m0


In [11]:
# Create the new dataframe

# Create lists of the values we want in the new dataframe
cds = []
lists = []
cols = list(dataframe.columns)

for i, row in dataframe.iterrows():
    allOtherVals = []
    for c in cols:
        allOtherVals.append(row[c])

    lists.append(allOtherVals)

# Convert lists into pandas series
cdSeries = pandas.Series(cds)
listsSeries = pandas.Series(lists)

# Make the dataframe
frame = {'Crime Data': listsSeries}
newDataFrame = pandas.DataFrame(frame)

In [12]:
newDataFrame

Unnamed: 0,Crime Data
0,"[a3, A36, sF, rB, D8, M1, Y2020, h22, m30]"
1,"[a1, A25, sM, rH, D1, M1, Y2020, h3, m30]"
2,"[a1, A25, sF, rH, D1, M1, Y2020, h0, m30]"
3,"[a1, A62, sM, rA, D15, M1, Y2020, h7, m0]"
4,"[a1, A71, sM, rW, D19, M1, Y2020, h20, m0]"
...,...
131101,"[a13, A45, sF, rB, D7, M6, Y2023, h22, m3]"
131102,"[a16, A8, sF, rH, D20, M5, Y2023, h21, m30]"
131103,"[a5, A59, sM, rH, D9, M8, Y2023, h13, m10]"
131104,"[a16, A25, sF, rH, D22, M3, Y2023, h10, m0]"


In [13]:
# Save new dataframe to a csv
newDataFrame.to_csv("./preAssociation.csv", index=False)

In [19]:
encoder = TransactionEncoder()
encoder.fit(lists)
encodedData = encoder.transform(lists)
tfDataFrame = pandas.DataFrame(encodedData, columns=encoder.columns_)

In [20]:
tfDataFrame

Unnamed: 0,A0,A10,A11,A12,A120,A13,A14,A15,A16,A17,...,rV,rW,rX,rZ,rnan,sF,sH,sM,sX,snan
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131101,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
131102,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
131103,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
131104,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [21]:
freq = apriori(tfDataFrame, min_support=0.1, use_colnames=True)

freq

Unnamed: 0,support,itemsets
0,0.100758,(M7)
1,0.256579,(Y2020)
2,0.26519,(Y2021)
3,0.279507,(Y2022)
4,0.198725,(Y2023)
5,0.285448,(m0)
6,0.193889,(m30)
7,0.220348,(rB)
8,0.478887,(rH)
9,0.168093,(rW)


In [23]:
results = freq.sort_values(['support'], ascending=[False])
results

Unnamed: 0,support,itemsets
11,0.560707,(sM)
8,0.478887,(rH)
10,0.405893,(sF)
5,0.285448,(m0)
29,0.282703,"(rH, sM)"
3,0.279507,(Y2022)
2,0.26519,(Y2021)
1,0.256579,(Y2020)
7,0.220348,(rB)
4,0.198725,(Y2023)


In [24]:
# print key
for i in range(len(cols)):
    print(pref[i] + ": " + cols[i])

a: AREA
A: Vict Age
s: Vict Sex
r: Vict Descent
D: day
M: month
Y: year
h: hour
m: minute


In [44]:
resultsPer = [[] for i in range(len(pref))]


for i, row in results.iterrows():

    accuracy = row.support

    for item in list(row.itemsets):
        key = item[0]
        index = pref.index(key)
        resultsPer[index].append({item[1:] : accuracy})


In [None]:
for i in range(len(resultsPer)):
    print(cols[i])
    for pair in resultsPer[i]:
        print('\t', pair)