In [None]:
import pandas as pd
import numpy as np
import pyproj

In [None]:
# Load the raw crime export and normalise the column names to lower case.
df = pd.read_csv('data/crimedata_csv_AllNeighbourhoods_AllYears.csv')
df.columns = [column.lower() for column in df.columns]

# Report how many records have no crime type recorded.
num_null_types = df['type'].isna().sum()
print(f"Number of rows with null 'type': {num_null_types}")

# UTM zone 10 projection (WGS84, metres); called with inverse=True it maps x/y to lon/lat.
p = pyproj.Proj(proj='utm', zone=10, ellps='WGS84', datum='WGS84', units='m', hemisphere="north")

In [None]:
# Discard rows whose x or y is exactly 0 before converting, so no coordinates are
# computed for them (presumably 0 marks a withheld/unknown location -- confirm).
df = df[(df['x'] != 0.0) & (df['y'] != 0.0)]
print(df.head())

# Inverse-project the UTM x/y values into geographic longitude/latitude.
lon, lat = p(df['x'].values, df['y'].values, inverse=True)

# Attach the coordinates positionally to the rows they were computed from. Using
# assign() keeps the existing index (no outer join that would re-create the filtered
# rows as NaN and upcast integer columns to float) and makes re-running this cell
# overwrite the two columns instead of appending duplicates.
df = df.assign(Latitude=lat, Longitude=lon)

# Drop every row that still has a missing value in any column.
df = df.dropna()

In [None]:
# Boolean mask over df: True for every row whose (Latitude, Longitude) pair already
# appeared in an earlier row; the first occurrence of each pair stays False
# (keep='first' is the default of DataFrame.duplicated).
duplicate_rows = df.duplicated(['Latitude', 'Longitude'])

In [None]:
# Number of rows flagged in duplicate_rows, i.e. rows that repeat the coordinates of
# an earlier row. Summing the boolean mask counts its True entries, so a location
# that appears three times contributes 2 here (not 1).
num_duplicate_pairs = duplicate_rows.sum()

In [None]:
# Report how many rows repeat the coordinates of an earlier row. The value is a count
# of flagged rows (every occurrence after the first), not a count of distinct
# locations, so the label says so explicitly.
print("Number of rows repeating an earlier Latitude/Longitude pair: ", num_duplicate_pairs)

In [None]:
# Cutoff year: five less than the most recent year present in the data.
# NOTE(review): the later filter uses `>=`, so the cutoff year itself is kept and the
# retained window can span six calendar years -- confirm that is intended.
most_recent_year = df['year'].max()
fiveYearsPrior = most_recent_year - 5

In [None]:
# Keep only records from the cutoff year onwards, then print the earliest
# remaining year as a sanity check.
is_recent = df['year'].ge(fiveYearsPrior)
df = df[is_recent]
print(df['year'].min())

In [None]:
# Severity score assigned to each crime type, from 10 (highest) down to 1 (lowest).
# The two theft categories "from Vehicle" and "of Bicycle" share the same score.
crime_severities = {
    "Homicide": 10,
    "Vehicle Collision or Pedestrian Struck (with Fatality)": 9,
    "Offence Against a Person": 8,
    "Break and Enter Commercial": 7,
    "Break and Enter Residential/Other": 6,
    "Vehicle Collision or Pedestrian Struck (with Injury)": 5,
    "Theft of Vehicle": 4,
    "Theft from Vehicle": 3,
    "Theft of Bicycle": 3,
    "Other Theft": 2,
    "Mischief": 1,
}

In [None]:
# Look up each record's severity score by its crime type and store it as 'weight'.
# Types that are not keys of crime_severities come out as NaN.
severity_per_record = df['type'].map(crime_severities)
df['weight'] = severity_per_record

In [None]:
# One row per distinct (Latitude, Longitude) location, carrying the mean severity
# weight of the records at that location.
# NOTE(review): an earlier comment described this step as a sum; the code averages.
# Confirm that the mean is the intended aggregation.
grouped_df = df.groupby(['Latitude', 'Longitude'], as_index=False)['weight'].mean()

In [None]:
# Preview the first five rows of the per-location frame.
print(grouped_df.head(n=5))

In [None]:
# Show the median and maximum weight before rescaling.
print(grouped_df['weight'].median())
print(grouped_df['weight'].max())

# Min-max rescale 'weight' onto the range 1 - 10: the smallest weight maps to 1 and
# the largest to 10.
# NOTE: if every weight is identical the span is 0 and the division produces NaN.
weight_min = grouped_df['weight'].min()
weight_span = grouped_df['weight'].max() - weight_min
grouped_df['weight'] = (grouped_df['weight'] - weight_min) / weight_span * (10 - 1) + 1

In [None]:
# Show the rescaled frame together with its median (0.5 quantile) and maximum weight.
print(grouped_df)
print(grouped_df['weight'].quantile(0.5))
print(grouped_df['weight'].max())

# Lower-case the column names, then write the result to CSV without the index column.
grouped_df.columns = [column.lower() for column in grouped_df.columns]
grouped_df.to_csv('temp_data3.csv', index=False)