Import libraries

In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
from datetime import timedelta

import statsmodels.api as sm
from statsmodels.compat import lzip
from statsmodels.formula.api import ols
from sklearn.ensemble import GradientBoostingRegressor

Preprocess data

In [2]:
def getAllFiles(path, file_name_type, start_year, start_month, end_year, end_month, included_columns):
    li = []

    for x in tqdm(list(os.walk(data_path))):

        all_files = glob.glob(x[0] + "/*" + file_name_type + ".csv")

        for file_path in all_files:
            if (inTimePeriod(file_path, start_year, start_month, end_year, end_month)):
                df = pd.read_csv(file_path, index_col=None, header=0, usecols=included_columns)
                df = df[df['LSOA name'].notna()]
                df['Year'] = df['Month'].apply(monthYearToYear)
                df['Month'] = df['Month'].apply(monthYearToMonth)
                df['LSOA'] = df['LSOA name'].apply(LSOANameToCode)
                df['MSOA'] = df['LSOA'].apply(LSOAtoMSOA)
                df = df.drop(columns=['LSOA name'])
                li.append(df)
                
    complete_df = pd.concat(li, axis=0, ignore_index=True)
    complete_df.rename_axis('index')

    return complete_df

def inTimePeriod(file_path, start_year, start_month, end_year, end_month):
    file_path_parts = file_path.split("\\")
    file_name = file_path_parts[len(file_path_parts) - 1]
    year = int(file_name[:4])
    month = int(file_name[5:7])
    start_bool = (year >= start_year) or (year == start_year and month >= start_month)
    end_bool = (year <= end_year) or (year == end_year and month <= end_month)
    if (start_bool and end_bool):
        return True
    return False

def monthYearToYear(monthYear):
    return int(monthYear[:4])

def monthYearToMonth(monthYear):
    return int(monthYear[5:7])

def MonthYearRegionDF(dataframe, month, year, region):
    return dataframe[(dataframe['Month'] == month) & (dataframe['Year'] == year) &
                     (dataframe['Falls within'] == region)]

def LSOANameToCode(LSOAname):
    return str(LSOAname)[-4:]

def LSOAtoMSOA(LSOA):
    return LSOA[:-1]

def countMSOA(df):
    df = df.groupby(['Year', 'Month', 'MSOA', 'Crime type'])
    df = df.agg(count=('LSOA', 'count'))
    df = df.reset_index()
    return df

Data location

In [None]:
data_path = input("Enter the path to the data folder: ")

Train set (2012-2018)

In [4]:
trainSet = getAllFiles(data_path, "street", 2012, 1, 2018, 12,
                       included_columns=['Month', 'LSOA name', 'Crime type'])
trainSet

100%|██████████| 132/132 [08:07<00:00,  3.69s/it]


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2012,001A,001
1,1,Anti-social behaviour,2012,001A,001
2,1,Anti-social behaviour,2012,001A,001
3,1,Other theft,2012,001A,001
4,1,Other theft,2012,001A,001
...,...,...,...,...,...
40834957,12,Public order,2018,062E,062
40834958,12,Vehicle crime,2018,062E,062
40834959,12,Violence and sexual offences,2018,062E,062
40834960,12,Violence and sexual offences,2018,062E,062


Test set no covid (split on March 2020)

In [5]:
# test set
testSetNoCovid = getAllFiles(data_path, "street", 2019, 1, 2020, 2,
                             included_columns=['Month', 'LSOA name', 'Crime type'])
testSetNoCovid

100%|██████████| 132/132 [01:03<00:00,  2.07it/s]


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2019,001A,001
1,1,Criminal damage and arson,2019,001A,001
2,1,Criminal damage and arson,2019,001A,001
3,1,Other theft,2019,001A,001
4,1,Other theft,2019,001A,001
...,...,...,...,...,...
12279031,12,Criminal damage and arson,2020,062E,062
12279032,12,Violence and sexual offences,2020,062E,062
12279033,12,Violence and sexual offences,2020,062E,062
12279034,12,Violence and sexual offences,2020,062E,062


Test set covid (split on March 2020)

In [7]:
# start of march covid measures in uk
testSetCovid = getAllFiles(data_path, "street", 2020, 3, 2021, 10,
                           included_columns=['Month', 'LSOA name', 'Crime type'])
testSetCovid

100%|██████████| 132/132 [00:57<00:00,  2.31it/s] 


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2020,001A,001
1,1,Burglary,2020,001A,001
2,1,Burglary,2020,001A,001
3,1,Other theft,2020,001A,001
4,1,Other theft,2020,001A,001
...,...,...,...,...,...
10837197,10,Criminal damage and arson,2021,062E,062
10837198,10,Public order,2021,062E,062
10837199,10,Vehicle crime,2021,062E,062
10837200,10,Violence and sexual offences,2021,062E,062


Count crime type per MSOA per month

In [8]:
trainSetCount = countMSOA(trainSet)
trainSetCount

Unnamed: 0,Year,Month,MSOA,Crime type,count
0,2012,1,001,Anti-social behaviour,6290
1,2012,1,001,Burglary,1715
2,2012,1,001,Criminal damage and arson,1756
3,2012,1,001,Drugs,607
4,2012,1,001,Other crime,502
...,...,...,...,...,...
143257,2018,12,140,Robbery,4
143258,2018,12,140,Shoplifting,1
143259,2018,12,140,Theft from the person,2
143260,2018,12,140,Vehicle crime,12


In [9]:
testSetNoCovidCount = countMSOA(testSetNoCovid)
testSetNoCovidCount

Unnamed: 0,Year,Month,MSOA,Crime type,count
0,2019,1,001,Anti-social behaviour,3402
1,2019,1,001,Bicycle theft,147
2,2019,1,001,Burglary,1473
3,2019,1,001,Criminal damage and arson,1699
4,2019,1,001,Drugs,397
...,...,...,...,...,...
42934,2020,12,140,Robbery,3
42935,2020,12,140,Shoplifting,1
42936,2020,12,140,Theft from the person,2
42937,2020,12,140,Vehicle crime,6


In [10]:
testSetCovidCount = countMSOA(testSetCovid)
testSetCovidCount

Unnamed: 0,Year,Month,MSOA,Crime type,count
0,2020,1,001,Anti-social behaviour,3243
1,2020,1,001,Bicycle theft,167
2,2020,1,001,Burglary,1241
3,2020,1,001,Criminal damage and arson,1737
4,2020,1,001,Drugs,481
...,...,...,...,...,...
36776,2021,10,138,Theft from the person,3
36777,2021,10,138,Violence and sexual offences,16
36778,2021,10,140,Criminal damage and arson,1
36779,2021,10,140,Drugs,1


Save as csv

In [11]:
trainSetCount.to_csv("train_count_street_data.csv")

In [12]:
testSetNoCovidCount.to_csv("test_no_covid_count_street_data.csv")

In [13]:
testSetCovidCount.to_csv("test_covid_count_street_data.csv")

In [14]:
trainSetCount['Crime type'].unique()

array(['Anti-social behaviour', 'Burglary', 'Criminal damage and arson',
       'Drugs', 'Other crime', 'Other theft',
       'Public disorder and weapons', 'Robbery', 'Shoplifting',
       'Vehicle crime', 'Violent crime', 'Bicycle theft',
       'Possession of weapons', 'Public order', 'Theft from the person',
       'Violence and sexual offences'], dtype=object)

In [15]:
testSetNoCovidCount['Crime type'].unique()

array(['Anti-social behaviour', 'Bicycle theft', 'Burglary',
       'Criminal damage and arson', 'Drugs', 'Other crime', 'Other theft',
       'Possession of weapons', 'Public order', 'Robbery', 'Shoplifting',
       'Theft from the person', 'Vehicle crime',
       'Violence and sexual offences'], dtype=object)

In [16]:
testSetCovidCount['Crime type'].unique()

array(['Anti-social behaviour', 'Bicycle theft', 'Burglary',
       'Criminal damage and arson', 'Drugs', 'Other crime', 'Other theft',
       'Possession of weapons', 'Public order', 'Robbery', 'Shoplifting',
       'Theft from the person', 'Vehicle crime',
       'Violence and sexual offences'], dtype=object)