In [2]:
# Depencancies and Setup
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import scipy.stats as st
import datetime
from scipy.stats import linregress
from pprint import pprint

from config import weather_api_key
from config import geoapify_key

from citipy import citipy

# Create file pathway and create new output directory and filepath
open_file_path = os.getcwd()

crimedata_file_path= os.path.join(open_file_path,"UN_crime_reports")

if not os.path.exists(os.path.join(crimedata_file_path,"Crime_figs")):
    os.mkdir(os.path.join(crimedata_file_path,"Crime_figs"))

output_file_path = os.path.join(crimedata_file_path,"Crime_figs")






ModuleNotFoundError: No module named 'config'

In [26]:
# Recall combined UN crime and population CSV
crime_pop_combined = pd.read_csv(os.path.join(crimedata_file_path,"UN_Combined_crime_pop.csv"))

# Remove the index from the previous merge
crime_pop_combined = crime_pop_combined.drop(columns = ['Unnamed: 0'])

# The VALUE column is an object due to the comma separator.  This will cancel any mathematical operations, so....
crime_pop_combined['VALUE'] = crime_pop_combined['VALUE'].str.replace(',','')
crime_pop_combined['VALUE'] = crime_pop_combined['VALUE'].astype(float)
crime_pop_combined = crime_pop_combined.rename(columns = {'VALUE': 'Crimes Committed'})


In [27]:
# Group by county and crime type to get overall number of offences

total_crime_year_county = crime_pop_combined.groupby(['Country','ISO2_code','Year']).sum()["Crimes Committed"]

total_crime_year_county
# country_crime_dict = total_crime_year_county.to_dict()

# total_crime_year_county = pd.DataFrame.from_dict(country_crime_dict,orient='index').reset_index()
# total_crime_year_county.columns=['Country','Year']
# total_crime_year_county

# total_crime_year_county.head(20)

Country   ISO2_code  Year
Albania   AL         2016    15828.0
                     2017    17261.0
                     2018    20792.0
                     2019    15418.0
                     2020    11019.0
                              ...   
Zambia    ZM         2017     5554.0
                     2018     5421.0
                     2019    10317.0
                     2020     4335.0
Zimbabwe  ZW         2020     3750.0
Name: Crimes Committed, Length: 693, dtype: float64

In [28]:
# country_pop_year_dups = crime_pop_combined[crime_pop_combined.duplicated(subset=['PopTotal'])]['PopTotal'].unique()
# total_pop_year_country = crime_pop_combined[crime_pop_combined['PopTotal'].isin(country_pop_year_dups)==False]
# total_pop_year_country = total_pop_year_country.groupby(['Country','Year']).value_counts()
# total_pop_year_country.head()

total_pop_year_country = crime_pop_combined.drop(columns = ['Category',
                                                            'Unit of measurement',
                                                            'Crimes Committed',
                                                            'Source',
                                                            'Crime Type'
                                                            ]
                                                            ).drop_duplicates()

total_pop_year_country

# total_pop_year_country = crime_pop_combined.[crime_pop_combined[PopTotal].isin()


Unnamed: 0,Country,Year,ISO3_code,ISO2_code,PopTotal
0,Albania,2016,ALB,AL,2881.063
15,Albania,2017,ALB,AL,2879.355
30,Albania,2018,ALB,AL,2877.013
45,Albania,2019,ALB,AL,2873.883
60,Albania,2020,ALB,AL,2866.849
...,...,...,...,...,...
28856,Iraq (Central Iraq),2018,,,
30556,Timor-Leste,2016,TLS,TL,1224.562
30559,Timor-Leste,2017,TLS,TL,1243.235
30707,South Africa,2016,ZAF,ZA,56422.274


In [29]:
# Considering the API pulls, all of the countries are listed by ISO2_code, so I these df will be merged based on this code
# while keeping the country name for reference.  Also, I choose how = 'inner' because I need these two df to be the same size after merge
# to avoid NaN, null, or blank values. 
# There could still be some special districts that slipped though the initial data collection

total_crime_pop = pd.merge(total_crime_year_county,total_pop_year_country, how = 'inner',on='ISO2_code')
total_crime_pop.head()

Unnamed: 0,ISO2_code,Crimes Committed,Country,Year,ISO3_code,PopTotal
0,AL,15828.0,Albania,2016,ALB,2881.063
1,AL,15828.0,Albania,2017,ALB,2879.355
2,AL,15828.0,Albania,2018,ALB,2877.013
3,AL,15828.0,Albania,2019,ALB,2873.883
4,AL,15828.0,Albania,2020,ALB,2866.849


In [30]:
# check for NaN, Null, or blank values... That is find all of the special regions/ UN districts that slipped through!!!
def nan_null_test(df):
    null_mask = df.isnull()
    nan_mask = df.isna()
    result = df.loc[null_mask.any(axis=1)]
    print(f"Number of NaN values : {len(result)}")
    
    nan_mask = df.isna()
    result = df.loc[nan_mask.any(axis=1)]
    print(f"Number of NaN values : {len(result)}")
    return(result)

nan_null_test(total_crime_pop)

Number of NaN values : 0
Number of NaN values : 0


Unnamed: 0,ISO2_code,Crimes Committed,Country,Year,ISO3_code,PopTotal


In [31]:
# Calculate crimes committed per 100,000 population.
# Keep in mind that values in PopTotal have already been divided by 1000, i.e. PopTotal = population/1000 citizens
total_crime_pop['Crimes/ 100,000 population'] = 100 * total_crime_pop['Crimes Committed'] / total_crime_pop['PopTotal']

total_crime_pop.head()

Unnamed: 0,ISO2_code,Crimes Committed,Country,Year,ISO3_code,PopTotal,"Crimes/ 100,000 population"
0,AL,15828.0,Albania,2016,ALB,2881.063,549.380558
1,AL,15828.0,Albania,2017,ALB,2879.355,549.706445
2,AL,15828.0,Albania,2018,ALB,2877.013,550.153927
3,AL,15828.0,Albania,2019,ALB,2873.883,550.75311
4,AL,15828.0,Albania,2020,ALB,2866.849,552.104418


In [32]:
# Change crime/population column names to match ideal cities csv from API requests.

total_crime_pop = total_crime_pop.rename(columns={'Country':'Country Name','ISO2_code':'Country'})
total_crime_pop.head()



Unnamed: 0,Country,Crimes Committed,Country Name,Year,ISO3_code,PopTotal,"Crimes/ 100,000 population"
0,AL,15828.0,Albania,2016,ALB,2881.063,549.380558
1,AL,15828.0,Albania,2017,ALB,2879.355,549.706445
2,AL,15828.0,Albania,2018,ALB,2877.013,550.153927
3,AL,15828.0,Albania,2019,ALB,2873.883,550.75311
4,AL,15828.0,Albania,2020,ALB,2866.849,552.104418


In [33]:
# Import ideal cities CSV cleaned from the API requests and client's requirements
ideal_cities_csv = "cities_airports_hotel_lim.csv"

ideal_cities_list = pd.read_csv(os.path.join(open_file_path, ideal_cities_csv))
ideal_cities_df = ideal_cities_list.drop(columns = [
    'City_ID', 
    'Max Temp', 
    'Humidity',
    'Cloudiness',
    'Wind Speed', 
    'Date',
    'Rank',
    'Crime Index',
    'Safety Index',
    'Number of Hotels',
    'Number of Resturants',
    'Bodies of Water',
    'Tourist Attractions',
    'Natural Places',
    'Hospital',
    'Entertainment',
    'Rental Car',
    'Airport'
])

ideal_cities_df.head(50)

Unnamed: 0,City,Lat,Lng,Country
0,khark,29.2614,50.3306,IR
1,newman,37.3138,-121.0208,US
2,dogonbadan,30.3586,50.7981,IR
3,al wajh,26.2455,36.4525,SA
4,al awjam,26.5632,49.9433,SA
5,santana do paraiso,-19.3636,-42.5686,BR
6,brownwood,31.7093,-98.9912,US
7,al kharijah,25.4514,30.5464,EG
8,goya,-29.14,-59.2626,AR
9,alegrete,-29.7831,-55.7919,BR


In [34]:
# Merge the ideal cities list with the global crime df and remove all rows that do not match...

crime_comparo_ideal_cities = pd.merge(ideal_cities_df, total_crime_pop, how = 'inner', on='Country')
crime_comparo_ideal_cities['City']

0        newman
1        newman
2        newman
3        newman
4        newman
         ...   
768    vallenar
769    vallenar
770    vallenar
771    vallenar
772    vallenar
Name: City, Length: 773, dtype: object

In [35]:
crime_comparo_ideal_cities.head()

Unnamed: 0,City,Lat,Lng,Country,Crimes Committed,Country Name,Year,ISO3_code,PopTotal,"Crimes/ 100,000 population"
0,newman,37.3138,-121.0208,US,9720010.0,United States of America,2016,USA,327210.198,2970.570618
1,newman,37.3138,-121.0208,US,9720010.0,United States of America,2017,USA,329791.231,2947.32215
2,newman,37.3138,-121.0208,US,9720010.0,United States of America,2018,USA,332140.037,2926.479472
3,newman,37.3138,-121.0208,US,9720010.0,United States of America,2019,USA,334319.671,2907.399966
4,newman,37.3138,-121.0208,US,9720010.0,United States of America,2020,USA,335942.003,2893.359542


In [36]:
# test for NaN null
nan_null_test(crime_comparo_ideal_cities)

Number of NaN values : 0
Number of NaN values : 0


Unnamed: 0,City,Lat,Lng,Country,Crimes Committed,Country Name,Year,ISO3_code,PopTotal,"Crimes/ 100,000 population"


In [37]:
# ISO2_code = IR is Iran, which has not reported any data to the UN on crime.  Changing all NaN values to 'Not Reported'
crime_comparo_ideal_cities.fillna('0', inplace=True)
nan_null_test(crime_comparo_ideal_cities)

Number of NaN values : 0
Number of NaN values : 0


Unnamed: 0,City,Lat,Lng,Country,Crimes Committed,Country Name,Year,ISO3_code,PopTotal,"Crimes/ 100,000 population"


---
### ***Locations from Iran were dropped due to difficultities obtaining travel visas, and other tourism restrictions***
---

In [38]:
ideal_city_crime_risk = pd.DataFrame(columns = ['Country', "Year",'Crimes/100,000 people'])
unique_countries = crime_comparo_ideal_cities['Country'].unique()
sorted_ideal_city_crime = crime_comparo_ideal_cities[crime_comparo_ideal_cities['Country'].isin(unique_countries)]

ideal_city_crime_risk=sorted_ideal_city_crime[['Country' , 'Year' , 'Crimes/ 100,000 population']]

ideal_city_crime_risk

Unnamed: 0,Country,Year,"Crimes/ 100,000 population"
0,US,2016,2970.570618
1,US,2017,2947.322150
2,US,2018,2926.479472
3,US,2019,2907.399966
4,US,2020,2893.359542
...,...,...,...
768,CL,2016,2082.539194
769,CL,2017,2050.261531
770,CL,2018,2013.768280
771,CL,2019,1978.014994


In [19]:
country_name = crime_comparo_ideal_cities['Country Name'].unique()

country_code = pd.Series(ideal_city_crime_risk['Country'].unique())
country_code = country_code.sort_values(ascending = True).reset_index(drop = True)
crime_reported_years = ideal_city_crime_risk.groupby('Country')['Year'].nunique()
latest_crime_report = ideal_city_crime_risk.groupby(['Country']).max()['Year']
mean_country_crime_over_year = ideal_city_crime_risk.groupby(['Country']).mean()['Crimes/ 100,000 population'].round(2)
num_cities_in_country = crime_comparo_ideal_cities.groupby(['Country']).count()['City']
oldest_crime_report = ideal_city_crime_risk.groupby(['Country']).min()['Year']
recent_normalized_crime = ideal_city_crime_risk.groupby(['Country'])['Crimes/ 100,000 population'].max().round(2)

country_name




array(['United States of America', 'Saudi Arabia', 'Brazil', 'Egypt',
       'Argentina', 'Uruguay', 'Madagascar', 'Libya', 'Bahrain', 'Chile'],
      dtype=object)

In [20]:

city_crime_summary = pd.DataFrame({
    "Country Name": country_name,
    "Country Code": country_code,
    "Number of Cities in Country": num_cities_in_country,
    "Most Recent Crime Report": latest_crime_report,
    "Oldest Crime Report Included": oldest_crime_report,
    "Crime Year Reports Included": crime_reported_years,
    "Offences / 100,000 people":recent_normalized_crime    
})
city_crime_summary = city_crime_summary.reset_index(drop=True)
city_crime_summary

ValueError: array length 10 does not match index length 20