<a href="https://colab.research.google.com/github/neerajthandayan/CourseProject/blob/main/Ethnic_Diversity_Calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Calculating Ethnic Diversity in Different Police Force Areas

In [112]:
# Importing Libraries

import numpy as np
import pandas as pd

In [113]:
# Loading Data
# Read the arrests CSV (its first row is the header) and preview two rows.

arrest_csv_path = '/content/drive/MyDrive/Project Data/Arrest_data.csv'
data = pd.read_csv(arrest_csv_path, header=0)
data.head(n=2)

Unnamed: 0,Measure,Time,Time_type,Ethnicity,Ethnicity_type,Gender,Gender_type,Age_Group,Age_Group_type,Geography,Geography_type,Number of arrests,"Population by ethnicity, gender, and PFA (based on 2011 Census)","Rate per 1,000 population by ethnicity, gender, and PFA","Proportion of arrests of this ethnicity, in this year, of this gender and in this police force area (excludes unreported)",Notes
0,Number of arrests,2018/19,Financial year,All,All,All,Male/Female,10 - 17 years,Arrest age group categories,All,Police Force Area,60208,N/A for individual age groups,N/A for individual age groups,N/A for individual age groups,Excludes Lancashire (both arrest and populatio...
1,Number of arrests,2018/19,Financial year,All,All,Female,Male/Female,10 - 17 years,Arrest age group categories,All,Police Force Area,9290,N/A for individual age groups,N/A for individual age groups,N/A for individual age groups,Excludes Lancashire (both arrest and populatio...


In [114]:
# Inspect the column labels of the loaded frame (shown as the cell output)
data.columns 

Index(['Measure', 'Time', 'Time_type', 'Ethnicity', 'Ethnicity_type', 'Gender',
       'Gender_type', 'Age_Group', 'Age_Group_type', 'Geography',
       'Geography_type', 'Number of arrests',
       'Population by ethnicity, gender, and PFA (based on 2011 Census)',
       'Rate per 1,000 population by ethnicity, gender, and PFA',
       'Proportion of arrests of this ethnicity, in this year, of this gender and in this police force area (excludes unreported)',
       'Notes'],
      dtype='object')

In [115]:
# Dropping unnecessary columns
# These columns are not read by any later cell; they are removed from
# `data` in place.

unused_columns = [
    'Measure',
    'Time_type',
    'Rate per 1,000 population by ethnicity, gender, and PFA',
    'Geography_type',
    'Number of arrests',
    'Gender_type',
    'Age_Group_type',
    'Proportion of arrests of this ethnicity, in this year, of this gender and in this police force area (excludes unreported)',
    'Notes',
]
data.drop(columns=unused_columns, inplace=True)

In [116]:
# Slicing data and renaming columns
#
# Keep only rows for: all age groups, all genders, the "ONS 2001 16+1"
# ethnicity classification, the 2018/19 year, and a specific police force
# area (the "All" aggregate geography is excluded).
# Every condition is evaluated on `data` and combined into a single mask, so
# the mask and the frame it indexes always share the same index (no reliance
# on pandas re-aligning a mask built from a different frame).

row_filter = (
    (data['Age_Group'] == "All")
    & (data['Gender'] == "All")
    & (data['Ethnicity_type'] == "ONS 2001 16+1")
    & (data['Time'] == "2018/19")
    & (data['Geography'] != "All")
)
ndata = data.loc[row_filter]

# The filter columns are constant after slicing, so they are dropped; the long
# population column name is shortened to 'Population'; rows with any missing
# value are dropped; and rows are renumbered from 0. drop=True discards the old
# row labels instead of keeping them as a stray 'index' column.
newdata = (
    ndata
    .drop(columns=['Time', 'Ethnicity_type', 'Gender', 'Age_Group'])
    .rename(columns={'Population by ethnicity, gender, and PFA (based on 2011 Census)': 'Population'})
    .dropna()
    .reset_index(drop=True)
)

In [117]:
# Cleaning Population Data
#
# Population entries may be strings containing thousands separators
# (e.g. "1,234"). Commas are stripped from string entries, non-string entries
# are passed through unchanged, and the whole column is then converted in one
# call so it ends up with a numeric dtype rather than an object column holding
# individually converted values.

population_text = newdata['Population'].map(
    lambda value: value.replace(',', '') if isinstance(value, str) else value
)

# Raises ValueError if any entry is still not parseable as a number.
newdata['Population'] = pd.to_numeric(population_text)

In [118]:
# Creating Geography Object
# Distinct police force area names, in order of first appearance in newdata.

PFA = newdata.loc[:, 'Geography'].unique()
PFA

array(['Avon and Somerset', 'Bedfordshire', 'Cambridgeshire', 'Cheshire',
       'Cleveland', 'Cumbria', 'Derbyshire', 'Devon and Cornwall',
       'Dorset', 'Durham', 'Dyfed-Powys', 'Essex', 'Gloucestershire',
       'Greater Manchester', 'Gwent', 'Hampshire', 'Hertfordshire',
       'Humberside', 'Kent', 'Lancashire', 'Leicestershire',
       'Lincolnshire', 'Merseyside', 'Metropolitan Police', 'Norfolk',
       'North Wales', 'North Yorkshire', 'Northamptonshire',
       'Northumbria', 'Nottinghamshire', 'South Wales', 'South Yorkshire',
       'Staffordshire', 'Suffolk', 'Surrey', 'Sussex', 'Thames Valley',
       'Warwickshire', 'West Mercia', 'West Midlands', 'West Yorkshire',
       'Wiltshire'], dtype=object)

In [119]:
# Creating columns for relative population (rel_pop) and squared relative
# population (rl_sd), initialised to missing values.
# np.nan is used because the capitalised alias np.NaN was removed in NumPy 2.0.

newdata['rel_pop'] = np.nan
newdata['rl_sd'] = np.nan

In [120]:
# Populating relative population data and squared relative population
#
# rel_pop: each row's Population as a percentage of the total Population of
#          its Geography (police force area).
# rl_sd:   rel_pop squared.
#
# The per-area total is broadcast back onto every row with
# groupby(...).transform('sum'), so results stay aligned with the row index
# even if an area's rows are not stored contiguously. (Building a flat list
# area by area and assigning it back positionally would misalign values in
# that case.)

# Coerce defensively: Population may still be an object column of numbers.
population = pd.to_numeric(newdata['Population'])
area_totals = population.groupby(newdata['Geography']).transform('sum')

newdata['rel_pop'] = (population / area_totals) * 100
newdata['rl_sd'] = np.square(newdata['rel_pop'])

In [121]:
# Diversity score per police force area, in the same order as PFA:
# 10000 minus the sum of squared percentage shares, i.e. 10000 minus the
# Herfindahl-Hirschman concentration index of that area.

Ethnic_Diversity = [
    10000 - newdata.loc[newdata['Geography'] == area, 'rl_sd'].sum()
    for area in PFA
]

In [122]:
# Creating DataFrame for Ethnic Diversity
# One row per police force area. The sort below only affects what the cell
# displays (highest score first); fdata itself keeps its original row order.

mydict = {
    'Geography': PFA,
    'Diversity_Score': Ethnic_Diversity,
}
fdata = pd.DataFrame(data=mydict)
fdata.sort_values('Diversity_Score', ascending=False)

Unnamed: 0,Geography,Diversity_Score
23,Metropolitan Police,7643.939281
39,West Midlands,5502.85049
1,Bedfordshire,4986.546898
20,Leicestershire,4226.977
36,Thames Valley,3875.831242
40,West Yorkshire,3754.38739
13,Greater Manchester,3579.964301
16,Hertfordshire,3421.414505
2,Cambridgeshire,3302.841513
34,Surrey,2981.203065


In [123]:
# Saving Ethnic Diversity CSV
# The path is relative, so the file lands in the current working directory.
# to_csv is left at its defaults, so the row index is written as the first
# (unnamed) column.

output_csv = 'ethnicdiversity.csv'
fdata.to_csv(output_csv)