#### Libraries

In [None]:
import glob
import pandas as pd
import uuid
import os
import hashlib

### Loading data

# Load every CSV file from the data folder into a single DataFrame.
path = r'C:\Users\u1142974\Crime UK\2024\Data'
# glob returns matches in arbitrary order; sorting keeps the concatenated row
# order (and therefore the integer index, which a later cell hashes to build
# surrogate IDs) reproducible between runs.
csv_files = sorted(glob.glob(path + "/*.csv"))
# Generator: each file is read only when pd.concat consumes it.
df_list = (pd.read_csv(file) for file in csv_files)
# ignore_index=True replaces the per-file indexes with one fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)
# Shorten the column names used throughout the notebook.
df = df.rename(columns={'Crime ID': 'ID', 'Crime type': 'Type', 'LSOA name': 'LSOA'})
# Keep a positional subset of the columns (positions 0-5, 7 and 9).
# NOTE(review): positional selection depends on the CSV column order - confirm
# against the export schema.
df = df.iloc[:, [0, 1, 2, 3, 4, 5, 7, 9]]

In [None]:
# Show the combined DataFrame as the cell output.
df

### Cleaning data

#### Analysis of missing data

# Drop rows whose 'Month' value is 0 or the string '0.0'.
invalid_month = df['Month'].isin([0, '0.0'])
df = df[~invalid_month]

In [None]:
# Percentage of missing values per column within each 'Month' group.
# The null mask is computed once for the whole frame and then averaged per
# group (mean of a boolean mask == share of nulls). This avoids
# DataFrameGroupBy.apply with a Python lambda, which pandas 2.2+ deprecates
# when the applied function also operates on the grouping column.
null_mask = df.isnull()
missing_proportion = (
    null_mask.groupby(df['Month']).mean().mul(100).round(2).astype(str) + '%'
)

# Display the result
print("\nProportion of missing values for each group in 'Month':")
missing_proportion

#### Handling missing data

In [None]:
# Give every row without an ID a deterministic surrogate: the SHA-256 hex
# digest of the row's index label. Only the rows that actually lack an ID are
# visited, instead of walking the whole frame with iterrows().
missing_id = df['ID'].isnull().to_numpy()
if missing_id.any():
    df.loc[missing_id, 'ID'] = [
        hashlib.sha256(str(index).encode()).hexdigest()
        for index in df.index[missing_id]
    ]

In [None]:
# Remove every row that has an exact duplicate; keep=False marks all copies,
# so none of the duplicated rows survive (not even the first occurrence).
# .copy() makes the result an independent frame, so assigning columns on it
# later does not trigger SettingWithCopyWarning.
df_cleaned_all = df[~df.duplicated(keep=False)].copy()

## Analysis

In [None]:
# Number of non-null IDs per crime type, in ascending order of count.
df_cleaned_all.groupby('Type')[['ID']].count().sort_values('ID')

In [None]:
# Count non-null IDs per crime type, largest group first
type_counts = df_cleaned_all.groupby('Type')[['ID']].count()
grouped_data = type_counts.sort_values('ID', ascending=False)

# One "YlGn" palette colour per crime type
colors = sns.color_palette("YlGn", n_colors=len(grouped_data))

# Wide figure so that every category label fits along the x-axis
fig, ax = plt.subplots(figsize=(35, 15))
bars = ax.bar(x=grouped_data.index, height=grouped_data['ID'], color=colors)

# Tilt the x-axis labels by 45 degrees, right-aligned
plt.xticks(rotation=45, ha='right')

# Render the chart
plt.show()



In [None]:
# One-hot encode the crime type; all other columns are carried along as-is
data_encoded = pd.get_dummies(df_cleaned_all, columns=['Type'])

# Names of the indicator columns produced by the encoding step
type_columns = [name for name in data_encoded.columns if name.startswith('Type_')]

# Coerce every column to numeric; values that cannot be parsed become NaN
for column in data_encoded.columns:
    data_encoded[column] = pd.to_numeric(data_encoded[column], errors='coerce')

# Replace missing values with the mean of their column
column_means = data_encoded.mean()
data_encoded = data_encoded.fillna(column_means)

# Pearson correlation over all columns, then keep only the crime-type block
full_correlation = data_encoded.corr(method='pearson')
correlation_matrix_type_numeric = full_correlation.loc[type_columns, type_columns]

correlation_matrix_type_numeric

In [None]:
# One-hot encode the "Type" column
type_values = df_cleaned_all['Type']
df_encoded = pd.get_dummies(type_values, prefix='Type')

# Compute the Pearson correlation between the indicator columns
correlation_matrix = df_encoded.corr(method='pearson')

# Display the correlation matrix
correlation_matrix

In [None]:
# Convert 'Month' to datetime format.
# assign() builds a new frame instead of writing a column into
# df_cleaned_all in place: df_cleaned_all comes from a boolean filter, and an
# in-place column assignment on such a frame raises SettingWithCopyWarning.
df_cleaned_all = df_cleaned_all.assign(Month=pd.to_datetime(df_cleaned_all['Month']))

# Count non-null IDs per (Month, Type); only 'ID' is used below, so the other
# columns are not counted.
grouped_data = df_cleaned_all.groupby(['Month', 'Type'])['ID'].count().reset_index()

# Pivot to one row per month and one column per crime type; month/type
# combinations with no records become 0.
pivot_data = grouped_data.pivot(index='Month', columns='Type', values='ID').fillna(0)

# One "rocket" palette colour per crime type
colors = sns.color_palette("rocket", len(pivot_data.columns))

# Wide figure so that every month label fits along the x-axis
fig, ax = plt.subplots(figsize=(35, 10))

# Each crime type is drawn as a labelled line plus translucent bars at the
# same x positions.
# NOTE(review): ax.bar uses its default width here; on a datetime axis that
# width is presumably interpreted in days, which makes the bars very narrow -
# confirm this is the intended look.
for i, crime_type in enumerate(pivot_data.columns):
    ax.plot(pivot_data.index, pivot_data[crime_type], color=colors[i], label=crime_type)
    ax.bar(pivot_data.index, pivot_data[crime_type], color=colors[i], alpha=0.3)

# Title and axis labels
ax.set_title('Time Series of Crime Types')
ax.set_xlabel('Month')
ax.set_ylabel('Count of IDs')

# One tick per month, labelled YYYY-MM, rotated 45 degrees and right-aligned
ax.set_xticks(pivot_data.index)
ax.set_xticklabels(pivot_data.index.strftime('%Y-%m'), rotation=45, ha='right')

# Legend entries come from the labelled line plots
ax.legend()

# Show plot
plt.show()