## Import libraries

In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm
import numpy as np
from datetime import timedelta

## Preprocess data functions

In [2]:
def getAllFiles(path, file_name_type, start_year, start_month, end_year, end_month, included_columns, msoa_dictionary):
    li = []

    for x in tqdm(list(os.walk(path))):

        all_files = glob.glob(x[0] + "/*" + file_name_type + ".csv")

        for file_path in all_files:
            if (inTimePeriod(file_path, start_year, start_month, end_year, end_month)):
                df = pd.read_csv(file_path, index_col=None, header=0, usecols=included_columns)
                df = df[df['LSOA name'].notna()]
                df['MSOA'] = df['LSOA name'].apply(LSOAtoMSOA)
                df['MSOA'] = df['MSOA'].map(msoa_dictionary)
                li.append(df)
                
    complete_df = pd.concat(li, axis=0, ignore_index=True)
    complete_df.rename_axis('index')

    return complete_df

def inTimePeriod(file_path, start_year, start_month, end_year, end_month):
    file_path_parts = file_path.split("\\")
    file_name = file_path_parts[len(file_path_parts) - 1]
    year = int(file_name[:4])
    month = int(file_name[5:7])
    start_bool = (year >= start_year) or (year == start_year and month >= start_month)
    end_bool = (year <= end_year) or (year == end_year and month <= end_month)
    if (start_bool and end_bool):
        return True
    return False

def LSOAtoMSOA(LSOA):
    return LSOA[:-1]

def countMSOA(df):
    df = df.groupby(['Month', 'MSOA', 'Crime type'])
    df = df.agg(count=('LSOA name', 'count')).unstack(fill_value=0).stack()
    df = df.reset_index()
    return df

## MSOA Names table

In [3]:
msoa_names = pd.read_csv("MSOA-Names.csv", header=0, usecols=["msoa11nm", "msoa11hclnm"])
msoa_dict = dict(zip(msoa_names.msoa11nm, msoa_names.msoa11hclnm))
msoa_dict

{'Adur 001': 'Hillside',
 'Adur 002': 'Buckingham',
 'Adur 003': 'North Lancing',
 'Adur 004': 'Kingston & Southwick',
 'Adur 005': 'Shoreham Central & Beach',
 'Adur 006': 'Cokeham & Sompting',
 'Adur 007': 'Marsh Barn & Widewater',
 'Adur 008': 'South Lancing',
 'Allerdale 001': 'Wigton & Silloth',
 'Allerdale 002': 'Boltons, Warnell & Solway Coast',
 'Allerdale 003': 'Aspatria & Abbeytown',
 'Allerdale 004': 'Maryport, Dearham & Crosby',
 'Allerdale 005': 'Flimby, Ellenborough & Broughton Moor',
 'Allerdale 006': 'West Cockermouth & Great Broughton',
 'Allerdale 007': 'East Cockermouth & Buttermere',
 'Allerdale 008': 'Workington North & Seaton',
 'Allerdale 009': 'Workington West',
 'Allerdale 010': 'Workington East',
 'Allerdale 011': 'Harrington, Stainburn & Great Clifton',
 'Allerdale 012': 'Keswick & Derwent Valley',
 'Amber Valley 001': 'Alfreton',
 'Amber Valley 002': 'Crich, Holloway & Wingfield',
 'Amber Valley 003': 'Somercotes & Pye Bridge',
 'Amber Valley 004': 'Swanwick

## Dataset location

In [None]:
dataset_path = input("Enter the path to the dataset folder: ")

## Import data

Train set (2012-2018)

In [5]:
trainSet = getAllFiles(dataset_path, "street", 2012, 1, 2018, 12,
                       included_columns=['Month', 'LSOA name', 'Crime type'], msoa_dictionary=msoa_dict)
trainSet

100%|██████████| 132/132 [01:55<00:00,  1.14it/s]


Unnamed: 0,Month,LSOA name,Crime type,MSOA
0,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
1,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
2,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
3,2012-01,Bath and North East Somerset 001A,Other theft,Keynsham North
4,2012-01,Bath and North East Somerset 001A,Other theft,Keynsham North
...,...,...,...,...
40834957,2018-12,Wiltshire 062E,Public order,Downton & Morgan's Vale
40834958,2018-12,Wiltshire 062E,Vehicle crime,Downton & Morgan's Vale
40834959,2018-12,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale
40834960,2018-12,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale


Test set no covid (split on March 2020)

In [6]:
# test set
testSetNoCovid = getAllFiles(dataset_path, "street", 2019, 1, 2020, 2,
                             included_columns=['Month', 'LSOA name', 'Crime type'], msoa_dictionary=msoa_dict)
testSetNoCovid

100%|██████████| 132/132 [00:32<00:00,  4.03it/s]


Unnamed: 0,Month,LSOA name,Crime type,MSOA
0,2019-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
1,2019-01,Bath and North East Somerset 001A,Criminal damage and arson,Keynsham North
2,2019-01,Bath and North East Somerset 001A,Criminal damage and arson,Keynsham North
3,2019-01,Bath and North East Somerset 001A,Other theft,Keynsham North
4,2019-01,Bath and North East Somerset 001A,Other theft,Keynsham North
...,...,...,...,...
12279031,2020-12,Wiltshire 062E,Criminal damage and arson,Downton & Morgan's Vale
12279032,2020-12,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale
12279033,2020-12,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale
12279034,2020-12,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale


Test set covid (split on March 2020)

In [7]:
# start of march covid measures in uk
testSetCovid = getAllFiles(dataset_path, "street", 2020, 3, 2021, 10,
                           included_columns=['Month', 'LSOA name', 'Crime type'], msoa_dictionary=msoa_dict)
testSetCovid

100%|██████████| 132/132 [00:28<00:00,  4.56it/s]


Unnamed: 0,Month,LSOA name,Crime type,MSOA
0,2020-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
1,2020-01,Bath and North East Somerset 001A,Burglary,Keynsham North
2,2020-01,Bath and North East Somerset 001A,Burglary,Keynsham North
3,2020-01,Bath and North East Somerset 001A,Other theft,Keynsham North
4,2020-01,Bath and North East Somerset 001A,Other theft,Keynsham North
...,...,...,...,...
10837197,2021-10,Wiltshire 062E,Criminal damage and arson,Downton & Morgan's Vale
10837198,2021-10,Wiltshire 062E,Public order,Downton & Morgan's Vale
10837199,2021-10,Wiltshire 062E,Vehicle crime,Downton & Morgan's Vale
10837200,2021-10,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale


Complete dataset

In [8]:
completeSet = pd.concat([trainSet, testSetNoCovid, testSetCovid], axis=0, ignore_index=True)
completeSet

Unnamed: 0,Month,LSOA name,Crime type,MSOA
0,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
1,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
2,2012-01,Bath and North East Somerset 001A,Anti-social behaviour,Keynsham North
3,2012-01,Bath and North East Somerset 001A,Other theft,Keynsham North
4,2012-01,Bath and North East Somerset 001A,Other theft,Keynsham North
...,...,...,...,...
63951195,2021-10,Wiltshire 062E,Criminal damage and arson,Downton & Morgan's Vale
63951196,2021-10,Wiltshire 062E,Public order,Downton & Morgan's Vale
63951197,2021-10,Wiltshire 062E,Vehicle crime,Downton & Morgan's Vale
63951198,2021-10,Wiltshire 062E,Violence and sexual offences,Downton & Morgan's Vale


## Replace values that changed throughout years

In [9]:
trainSet['Crime type'] = trainSet['Crime type'].replace('Violent crime', 'Violence and sexual offences')
trainSet['Crime type'] = trainSet['Crime type'].replace('Public disorder and weapons','Other crime')

In [10]:
testSetNoCovid['Crime type'] = testSetNoCovid['Crime type'].replace('Violent crime', 'Violence and sexual offences')
testSetNoCovid['Crime type'] = testSetNoCovid['Crime type'].replace('Public disorder and weapons','Other crime')

In [11]:
completeSet['Crime type'] = completeSet['Crime type'].replace('Violent crime', 'Violence and sexual offences')
completeSet['Crime type'] = completeSet['Crime type'].replace('Public disorder and weapons','Other crime')

## Count crime type per MSOA per month

In [12]:
trainSetCount = countMSOA(trainSet)
trainSetCount

Unnamed: 0,Month,MSOA,Crime type,count
0,2012-01,"Abberley, Holt Heath & Hallow",Anti-social behaviour,6
1,2012-01,"Abberley, Holt Heath & Hallow",Bicycle theft,0
2,2012-01,"Abberley, Holt Heath & Hallow",Burglary,1
3,2012-01,"Abberley, Holt Heath & Hallow",Criminal damage and arson,0
4,2012-01,"Abberley, Holt Heath & Hallow",Drugs,0
...,...,...,...,...
8385281,2018-12,Ystradgynlais & Tawe Uchaf,Robbery,0
8385282,2018-12,Ystradgynlais & Tawe Uchaf,Shoplifting,1
8385283,2018-12,Ystradgynlais & Tawe Uchaf,Theft from the person,0
8385284,2018-12,Ystradgynlais & Tawe Uchaf,Vehicle crime,0


In [13]:
testSetNoCovidCount = countMSOA(testSetNoCovid)
testSetNoCovidCount

Unnamed: 0,Month,MSOA,Crime type,count
0,2019-01,"Abberley, Holt Heath & Hallow",Anti-social behaviour,10
1,2019-01,"Abberley, Holt Heath & Hallow",Bicycle theft,0
2,2019-01,"Abberley, Holt Heath & Hallow",Burglary,1
3,2019-01,"Abberley, Holt Heath & Hallow",Criminal damage and arson,3
4,2019-01,"Abberley, Holt Heath & Hallow",Drugs,0
...,...,...,...,...
2325199,2020-12,Ystradgynlais & Tawe Uchaf,Robbery,0
2325200,2020-12,Ystradgynlais & Tawe Uchaf,Shoplifting,1
2325201,2020-12,Ystradgynlais & Tawe Uchaf,Theft from the person,0
2325202,2020-12,Ystradgynlais & Tawe Uchaf,Vehicle crime,0


In [14]:
completeSetCount = countMSOA(completeSet)
completeSetCount

Unnamed: 0,Month,MSOA,Crime type,count
0,2012-01,"Abberley, Holt Heath & Hallow",Anti-social behaviour,6
1,2012-01,"Abberley, Holt Heath & Hallow",Bicycle theft,0
2,2012-01,"Abberley, Holt Heath & Hallow",Burglary,1
3,2012-01,"Abberley, Holt Heath & Hallow",Criminal damage and arson,0
4,2012-01,"Abberley, Holt Heath & Hallow",Drugs,0
...,...,...,...,...
11645083,2021-10,Ystradgynlais & Tawe Uchaf,Robbery,0
11645084,2021-10,Ystradgynlais & Tawe Uchaf,Shoplifting,2
11645085,2021-10,Ystradgynlais & Tawe Uchaf,Theft from the person,0
11645086,2021-10,Ystradgynlais & Tawe Uchaf,Vehicle crime,1


## Change column names

In [15]:
# no covid
trainSetCount = trainSetCount.rename(columns={"Month":"Date"})
testSetNoCovidCount = testSetNoCovidCount.rename(columns={"Month":"Date"})
# complete (with covid)
completeSetCount = completeSetCount.rename(columns={"Month":"Date"})

## Save as csv

In [16]:
trainSetCount.to_csv("datasets/train.csv")

In [17]:
testSetNoCovidCount.to_csv("datasets/test.csv")

In [18]:
completeSetCount.to_csv("datasets/complete.csv")