In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

## Loading Preprocessed Data and Normalizing It

In [7]:
# Load the preprocessed metro-region table, derive per-population columns,
# filter it down to one unit / one unemployment breakdown, remove regions whose
# area changed over time, and write the result to a new CSV.

# Value of the 'unit' column to keep (purchasing power standard, relative to the EU27 average).
PPS_UNIT = 'PPS_HAB_EU27_2020:Purchasing power standard (PPS, EU27 from 2020), per inhabitant in percentage of the EU27 (from 2020) average'

df = pd.read_csv(r'../data/df_allmetro_all.csv')

# add country column: first two characters of the metroreg code
# (.str slicing is vectorized and propagates missing values instead of raising)
df['country'] = df['metroreg'].str[:2]

# calculate relative crime
df['crime/population'] = (df['Nr_Crimes'] / df['population']) * 1_000_000  # normalizing to per million
df['density'] = df['population'] / df['area']

# drop na -- dropna() returns a new frame, so the result must be assigned back;
# a bare `df.dropna()` leaves df untouched
df = df.dropna()

# drop country aggregates (metroreg codes shorter than three characters)
df = df[df['metroreg'].str.len() >= 3]

# keep only the normalized PPS unit and the 'T:Total' rows of dfunemp_sex
# NOTE(review): the earlier comment here said "youth unemployment"; the filter
# visible in this cell only selects 'T:Total' -- confirm the age selection
# happens upstream.
df = df[(df['unit'] == PPS_UNIT) & (df['dfunemp_sex'] == 'T:Total')]

# find metroregs whose area changed over the years:
# one mean area per (metroreg, TIME_PERIOD), then count distinct values per metroreg
df_grouped = df.groupby(['metroreg', 'TIME_PERIOD'])['area'].mean().reset_index()
distinct_areas = df_grouped.groupby('metroreg')['area'].nunique()
metroregs_with_area_change = distinct_areas[distinct_areas > 1].index

# only metroregs NOT ending in 'NM' are removed; codes ending in 'NM' are kept
# even if their area changed
# NOTE(review): 'NM' presumably marks non-metropolitan regions -- confirm
metroregs_with_area_change = [
    metroreg for metroreg in metroregs_with_area_change if not metroreg.endswith('NM')
]

# drop metroregs with area change
# (pass the list itself to isin; wrapping it in another list matches nothing)
df = df[~df['metroreg'].isin(metroregs_with_area_change)]

df.to_csv(r'../data/normalized_data_no_area_change.csv')