In [1]:
import pandas as pd
import glob
import os
from tqdm import tqdm
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
from datetime import timedelta

# import geopandas
# import geopy
# from geopy.geocoders import Nominatim
#import folium
#from geopy.extra.rate_limiter import RateLimiter
# from folium import plugins
# from folium.plugins import MarkerCluster
# from folium.plugins import HeatMap

#% matplotlib inline
import statsmodels.api as sm
from statsmodels.compat import lzip
from statsmodels.formula.api import ols
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# def getAllFiles(path, file_name_type, start_year, start_month, end_year, end_month, dropped_columns):
def getAllFiles(path, file_name_type, start_year, start_month, end_year, end_month, included_columns):
    li = []

    for x in tqdm(list(os.walk(data_path))):

        all_files = glob.glob(x[0] + "/*" + file_name_type + ".csv")

        for file_path in all_files:
            if (inTimePeriod(file_path, start_year, start_month, end_year, end_month)):
                df = pd.read_csv(file_path, index_col=None, header=0, usecols=included_columns)
                #df.drop(columns=dropped_columns, inplace=True)
                df['Year'] = df['Month'].apply(monthYearToYear)
                df['Month'] = df['Month'].apply(monthYearToMonth)
                df['LSOA'] = df['LSOA name'].apply(LSOANameToCode)
                df['MSOA'] = df['LSOA'].apply(LSOAtoMSOA)
                df = df.drop(columns=['LSOA name'])
                #df = df[df['Longitude'].notna()]
                #df = df[df['Latitude'].notna()]
                li.append(df)
                
    complete_df = pd.concat(li, axis=0, ignore_index=True)
    complete_df.rename_axis('index')

    return complete_df

def inTimePeriod(file_path, start_year, start_month, end_year, end_month):
    file_path_parts = file_path.split("\\")
    file_name = file_path_parts[len(file_path_parts) - 1]
    year = int(file_name[:4])
    month = int(file_name[5:7])
    start_bool = (year >= start_year) or (year == start_year and month >= start_month)
    end_bool = (year <= end_year) or (year == end_year and month <= end_month)
    if (start_bool and end_bool):
        return True
    return False

def monthYearToYear(monthYear):
    return int(monthYear[:4])

def monthYearToMonth(monthYear):
    return int(monthYear[5:7])

def MonthYearRegionDF(dataframe, month, year, region):
    return dataframe[(dataframe['Month'] == month) & (dataframe['Year'] == year) &
                     (dataframe['Falls within'] == region)]

def LSOANameToCode(LSOAname):
    return str(LSOAname)[-4:]

def LSOAtoMSOA(LSOA):
    return LSOA[:-1]

# test = MonthYearRegionDF(fullStreetFrame, 1, 2017, 'Avon and Somerset Constabulary')
# test2 = test[test['Crime type'] == 'Violence and sexual offences']
# test2.head(1)
# ## Heatmaps
# hm_markers = folium.Map([51.4549741, -2.5979205], zoom_start=14)
# for index, row in tqdm(test2.iterrows()):
#     folium.CircleMarker([row['Latitude'], row['Longitude']],
#                         radius=3,
#                         popup=row['Crime type'],
#                         fill_color="#3db7e4",  # divvy color
#                         ).add_to(hm_markers)

# heat_data = [[row['Latitude'], row['Longitude']] for index, row in test2.iterrows()]
# HeatMap(heat_data).add_to(hm_markers)
# hm_markers
# heat_data = [[row['Latitude'], row['Longitude']] for index, row in test2.iterrows()]

# hm = folium.Map([51.4549741, -2.5979205], zoom_start=14)

# # Plot it on the map
# HeatMap(heat_data).add_to(hm)

# hm



In [3]:
#data_path = r"C:\Users\20201935\Documents\Data Challenge 2\Jan_2010_Oct_2021"
data_path = r"C:\Users\20202810\OneDrive - TU Eindhoven\Documents\University\Year 2\JGB050 - Data Challenge 2\Jan_2010_Oct_2021"

#trainstreetframe = getAllFiles(data_path, "street", 2012, 1, 2018, 10)
#teststreetframe = getAllFiles(data_path, "street", 2018, 11, 2021, 10)
#fullStreetFrame = getAllFiles(data_path, "street", 2012, 1, 2021, 10, ['Context', 'Location', 'Reported by', 'Last outcome category', 'LSOA code', 'LSOA name'])

In [6]:
trainSet = getAllFiles(data_path, "street", 2012, 1, 2018, 12,
                       included_columns=['Month', 'LSOA name', 'Crime type'])
trainSet

100%|██████████| 132/132 [02:29<00:00,  1.13s/it]


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2012,001A,001
1,1,Anti-social behaviour,2012,001A,001
2,1,Anti-social behaviour,2012,001A,001
3,1,Other theft,2012,001A,001
4,1,Other theft,2012,001A,001
...,...,...,...,...,...
42573979,12,Public order,2018,062E,062
42573980,12,Vehicle crime,2018,062E,062
42573981,12,Violence and sexual offences,2018,062E,062
42573982,12,Violence and sexual offences,2018,062E,062


In [4]:
# test set
testSetNoCovid = getAllFiles(data_path, "street", 2019, 1, 2020, 2,
                             included_columns=['Month', 'LSOA name', 'Crime type'])
testSetNoCovid

100%|██████████| 132/132 [00:42<00:00,  3.12it/s]


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2019,001A,001
1,1,Criminal damage and arson,2019,001A,001
2,1,Criminal damage and arson,2019,001A,001
3,1,Other theft,2019,001A,001
4,1,Other theft,2019,001A,001
...,...,...,...,...,...
12872599,12,Criminal damage and arson,2020,062E,062
12872600,12,Violence and sexual offences,2020,062E,062
12872601,12,Violence and sexual offences,2020,062E,062
12872602,12,Violence and sexual offences,2020,062E,062


In [5]:
# start of march covid measures in uk
testSetCovid = getAllFiles(data_path, "street", 2020, 3, 2021, 10,
                           included_columns=['Month', 'LSOA name', 'Crime type'])
testSetCovid

100%|██████████| 132/132 [00:37<00:00,  3.53it/s] 


Unnamed: 0,Month,Crime type,Year,LSOA,MSOA
0,1,Anti-social behaviour,2020,001A,001
1,1,Burglary,2020,001A,001
2,1,Burglary,2020,001A,001
3,1,Other theft,2020,001A,001
4,1,Other theft,2020,001A,001
...,...,...,...,...,...
11339247,10,Criminal damage and arson,2021,062E,062
11339248,10,Public order,2021,062E,062
11339249,10,Vehicle crime,2021,062E,062
11339250,10,Violence and sexual offences,2021,062E,062


In [7]:
def countMSOA(df):
    df = df.groupby(['Year', 'Month', 'MSOA', 'Crime type'])
    df = df.agg(count=('LSOA', 'count'))
    df = df.reset_index()
    return df

In [8]:
trainSetCount = countMSOA(trainSet)
trainSetCount

Unnamed: 0,Year,Month,MSOA,Crime type,count
0,2012,1,001,Anti-social behaviour,6290
1,2012,1,001,Burglary,1715
2,2012,1,001,Criminal damage and arson,1756
3,2012,1,001,Drugs,607
4,2012,1,001,Other crime,502
...,...,...,...,...,...
144385,2018,12,na,Robbery,235
144386,2018,12,na,Shoplifting,1338
144387,2018,12,na,Theft from the person,317
144388,2018,12,na,Vehicle crime,1030


In [None]:
testSetNoCovidCount = countMSOA(testSetNoCovid)
testSetNoCovidCount

In [None]:
testSetCovidCount = countMSOA(testSetCovid)
testSetCovidCount

In [None]:
trainSetCount.to_csv("train_count_street_data.csv")

In [None]:
testSetNoCovidCount.to_csv("test_no_covid_count_street_data.csv")

In [None]:
testSetCovidCount.to_csv("test_covid_count_street_data.csv")

In [None]:
fullStreetFrame

In [None]:
fullStreetFrame['LSOA name'].isna().sum()

In [None]:
trainstreetframe.to_csv("train_street_data.csv")
trainstreetframe

In [None]:
teststreetframe.to_csv("test_street_data.csv")
teststreetframe

In [None]:
trainstreetframe.groupby(['Crime type']).size()

In [None]:
trainstreetframe.groupby(['Reported by', 'Falls within']).size()

In [None]:
trainstreetframe = trainstreetframe.reset_index() 

for index, row in trainstreetframe.iterrows():
    print(index)
    if (row['Reported by'] != row['Falls within']):
        print(row['Reported by'] + " " + row['Falls within'])
        print(trainstreetframe.loc)
        break

In [None]:
#testframe = getAllFiles(data_path, "street", 2020, 11, 2021, 10, ['Context', 'Location', 'Reported By', 'Last outcome category', 'LSOA code', 'LSOA name'])

In [None]:
testframe.groupby(['Reported by', 'Falls within']).size()

In [None]:
falls_within = testframe['Falls within'].unique()

In [None]:
falls_within[0]