In [17]:
import pandas as pd;
import numpy as np;
import re;

profile_df = pd.read_csv('../static/data/original/neighbourhood-profiles-2016-csv.csv')
profile_df.head()

Unnamed: 0,_id,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
0,1,Neighbourhood Information,Neighbourhood Information,City of Toronto,Neighbourhood Number,,129,128,20,95,...,37,7,137,64,60,94,100,97,27,31
1,2,Neighbourhood Information,Neighbourhood Information,City of Toronto,TSNS2020 Designation,,No Designation,No Designation,No Designation,No Designation,...,No Designation,No Designation,NIA,No Designation,No Designation,No Designation,No Designation,No Designation,NIA,Emerging Neighbourhood
2,3,Population,Population and dwellings,Census Profile 98-316-X2016001,"Population, 2016",2731571,29113,23757,12054,30526,...,16936,22156,53485,12541,7865,14349,11817,12528,27593,14804
3,4,Population,Population and dwellings,Census Profile 98-316-X2016001,"Population, 2011",2615060,30279,21988,11904,29177,...,15004,21343,53350,11703,7826,13986,10578,11652,27713,14687
4,5,Population,Population and dwellings,Census Profile 98-316-X2016001,Population Change 2011-2016,4.50%,-3.90%,8.00%,1.30%,4.60%,...,12.90%,3.80%,0.30%,7.20%,0.50%,2.60%,11.70%,7.50%,-0.40%,0.80%


In [18]:
crime_df = pd.read_csv('../static/data/original/neighbourhood-crime-rates.csv')
crime_df.head()

Unnamed: 0,_id,OBJECTID,Neighbourhood,Hood_ID,F2020_Population_Projection,Assault_2014,Assault_2015,Assault_2016,Assault_2017,Assault_2018,...,Shootings_2019,Shootings_2020,Shooting_Rate2014,Shootings_Rate2015,Shootings_Rate2016,Shootings_Rate2017,Shootings_Rate2018,Shootings_Rate2019,Shootings_Rate2020,geometry
0,1,1,Yonge-St.Clair,97,14083,16,25,34,25,28,...,0,0,0.0,0.0,7.722008,0.0,0.0,0.0,0.0,"{u'type': u'Polygon', u'coordinates': (((-79.3..."
1,2,2,York University Heights,27,30277,273,298,363,351,362,...,6,12,6.963789,0.0,13.98699,34.50417,27.18961,20.10252,39.63404,"{u'type': u'Polygon', u'coordinates': (((-79.5..."
2,3,3,Lansing-Westgate,38,18146,42,81,67,84,68,...,2,1,0.0,0.0,5.980146,0.0,5.737564,11.24543,5.510856,"{u'type': u'Polygon', u'coordinates': (((-79.4..."
3,4,4,Yorkdale-Glen Park,31,17560,106,137,175,163,178,...,17,14,19.74074,19.75244,19.70055,44.31221,61.04261,100.1709,79.72665,"{u'type': u'Polygon', u'coordinates': (((-79.4..."
4,5,5,Stonegate-Queensway,16,27410,91,74,78,98,86,...,1,0,11.70229,7.788465,0.0,3.821754,3.759257,3.702744,0.0,"{u'type': u'Polygon', u'coordinates': (((-79.4..."


## Fix neighbourhood names

In [19]:
# get the intersection of two data above
# Get neighbourhood names from profile_df (excluding non-neighbourhood columns)
profile_neighbourhoods = set(profile_df.columns[6:])  # Skip first 6 columns which are metadata

# Get neighbourhood names from crime_df
crime_neighbourhoods = set(crime_df["Neighbourhood"])

# Get the intersection
common_neighbourhoods = profile_neighbourhoods.intersection(crime_neighbourhoods)

# Print results
print(f"Profile neighbourhoods: {len(profile_neighbourhoods)}")
print(f"Crime neighbourhoods: {len(crime_neighbourhoods)}")
print(f"Common neighbourhoods: {len(common_neighbourhoods)}")
print("\nMissing in crime data:", profile_neighbourhoods - crime_neighbourhoods)
print("\nMissing in profile data:", crime_neighbourhoods - profile_neighbourhoods)

Profile neighbourhoods: 140
Crime neighbourhoods: 140
Common neighbourhoods: 137

Missing in crime data: {'Cabbagetown-South St. James Town', 'North St. James Town', 'Weston-Pelham Park'}

Missing in profile data: {'North St.James Town', 'Weston-Pellam Park', 'Cabbagetown-South St.James Town'}


In [20]:
# Create a mapping dictionary for the inconsistent names
name_mapping = {
    'Cabbagetown-South St.James Town': 'Cabbagetown-South St. James Town',
    'North St.James Town': 'North St. James Town',
    'Weston-Pellam Park': 'Weston-Pelham Park'
}

# Replace the names in crime_df
crime_df['Neighbourhood'] = crime_df['Neighbourhood'].replace(name_mapping)

# Verify the intersection again
crime_neighbourhoods = set(crime_df["Neighbourhood"])
common_neighbourhoods = profile_neighbourhoods.intersection(crime_neighbourhoods)
print(f"Common neighbourhoods after fix: {len(common_neighbourhoods)}")

Common neighbourhoods after fix: 140


## Clean Profile Data

In [21]:
# Count NaN values for each column
nan_counts = profile_df.isna().sum()

# Show only columns that have NaN values (if any)
columns_with_nans = nan_counts[nan_counts > 0]
print("Columns with NaN values:")
print(columns_with_nans)

# For columns with NaN values, show unique Category-Topic combinations
if len(columns_with_nans) > 0:
    for col in columns_with_nans.index:
        print(f"\nUnique Category-Topic combinations where {col} is NaN:")
        unique_combinations = profile_df[profile_df[col].isna()][['Category', 'Topic']].drop_duplicates()
        print(unique_combinations)

Columns with NaN values:
City of Toronto                  7
Agincourt North                 56
Agincourt South-Malvern West    56
Alderwood                       56
Annex                           56
                                ..
Wychwood                        56
Yonge-Eglinton                  56
Yonge-St.Clair                  56
York University Heights         56
Yorkdale-Glen Park              56
Length: 141, dtype: int64

Unique Category-Topic combinations where City of Toronto is NaN:
                       Category                      Topic
0     Neighbourhood Information  Neighbourhood Information
1960            Journey to work      Commuting destination

Unique Category-Topic combinations where Agincourt North is NaN:
             Category                                    Topic
946            Income            Income of individuals in 2015
1014           Income             Income of households in 2015
1076           Income      Income of economic families in 2015
168

In [22]:
selected_row = {
  "Income of households in 2015": ["Average after-tax income of households in 2015 ($)"],
  "Mother tongue": [],
  "Immigrants by selected place of birth": [],
  "Population and dwellings": ["Land area in square kilometres"]
}

In [23]:
# stripe profile_df Characteristic and Topic
profile_df['Characteristic'] = profile_df['Characteristic'].str.strip()
profile_df['Topic'] = profile_df['Topic'].str.strip()

In [24]:
# Create a mask for filtering based on selected_row criteria
masks = []

for topic, characteristics in selected_row.items():
    if topic == "Mother tongue":
        # profile_df['Characteristic'] should not start from space
        topic_mask = (profile_df['Topic'] == topic) & ~(profile_df['Characteristic'].str.endswith('n.i.e.') | profile_df['Characteristic'].str.endswith('n.o.s.'))
    elif characteristics:  # If the list is not empty
        # Filter rows where Topic matches and Characteristic is in the list
        topic_mask = (profile_df['Topic'] == topic) & (profile_df['Characteristic'].isin(characteristics))
    else:
        # If list is empty, just match the Topic
        topic_mask = (profile_df['Topic'] == topic)
    masks.append(topic_mask)

# Combine all masks with OR operation
final_mask = pd.concat(masks, axis=0).groupby(level=0).any()

# Apply the mask to filter the DataFrame
filtered_profile_df = profile_df[final_mask]

filtered_profile_df.head()

Unnamed: 0,_id,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
8,9,Population,Population and dwellings,Census Profile 98-316-X2016001,Land area in square kilometres,630.2,7.41,7.83,4.95,2.81,...,2.91,5.53,12.31,1.6,1.17,1.68,1.65,1.17,13.23,6.04
139,140,Language,Mother tongue,Census Profile 98-316-X2016001,Mother tongue for the total population excludi...,2704415.0,28845.0,23740.0,12035.0,29880.0,...,16890.0,22150.0,53310.0,12445.0,7845.0,13345.0,11810.0,12530.0,27590.0,14050.0
140,141,Language,Mother tongue,Census Profile 98-316-X2016001,Single responses,2598230.0,27740.0,22720.0,11675.0,29110.0,...,16320.0,21270.0,50360.0,12185.0,7650.0,12925.0,11495.0,12240.0,26190.0,13395.0
141,142,Language,Mother tongue,Census Profile 98-316-X2016001,Official languages,1411345.0,7185.0,7215.0,7465.0,21355.0,...,5925.0,11745.0,23050.0,9840.0,5530.0,8680.0,8720.0,9480.0,11590.0,6000.0
142,143,Language,Mother tongue,Census Profile 98-316-X2016001,English,1375905.0,7070.0,7080.0,7360.0,20645.0,...,5785.0,11470.0,22645.0,9515.0,5405.0,8445.0,8485.0,9230.0,11385.0,5900.0


In [25]:
# 获取需要处理的列（从第6列开始）
value_columns = filtered_profile_df.columns[5:]

# 定义函数来转换字符串为数值
def convert_to_numeric(x):
    if pd.isna(x):  # 处理空值
        return x
    # 移除千位分隔符
    if isinstance(x, str):
        x = x.replace(',', '')
    # 转换为float
    try:
        return float(x)
    except:
        return x

# 应用转换到所有数值列
for col in value_columns:
    filtered_profile_df[col] = filtered_profile_df[col].apply(convert_to_numeric)

filtered_profile_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_profile_df[col] = filtered_profile_df[col].apply(convert_to_numeric)


Unnamed: 0,_id,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,...,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
8,9,Population,Population and dwellings,Census Profile 98-316-X2016001,Land area in square kilometres,630.2,7.41,7.83,4.95,2.81,...,2.91,5.53,12.31,1.6,1.17,1.68,1.65,1.17,13.23,6.04
139,140,Language,Mother tongue,Census Profile 98-316-X2016001,Mother tongue for the total population excludi...,2704415.0,28845.0,23740.0,12035.0,29880.0,...,16890.0,22150.0,53310.0,12445.0,7845.0,13345.0,11810.0,12530.0,27590.0,14050.0
140,141,Language,Mother tongue,Census Profile 98-316-X2016001,Single responses,2598230.0,27740.0,22720.0,11675.0,29110.0,...,16320.0,21270.0,50360.0,12185.0,7650.0,12925.0,11495.0,12240.0,26190.0,13395.0
141,142,Language,Mother tongue,Census Profile 98-316-X2016001,Official languages,1411345.0,7185.0,7215.0,7465.0,21355.0,...,5925.0,11745.0,23050.0,9840.0,5530.0,8680.0,8720.0,9480.0,11590.0,6000.0
142,143,Language,Mother tongue,Census Profile 98-316-X2016001,English,1375905.0,7070.0,7080.0,7360.0,20645.0,...,5785.0,11470.0,22645.0,9515.0,5405.0,8445.0,8485.0,9230.0,11385.0,5900.0


In [26]:
# Count NaN values for each column
nan_counts = filtered_profile_df.isna().sum()

# Show only columns that have NaN values (if any)
columns_with_nans = nan_counts[nan_counts > 0]
print("Columns with NaN values:")
print(columns_with_nans)

# For columns with NaN values, show unique Category-Topic combinations
if len(columns_with_nans) > 0:
    for col in columns_with_nans.index:
        print(f"\nUnique Category-Topic combinations where {col} is NaN:")
        unique_combinations = filtered_profile_df[filtered_profile_df[col].isna()][['Category', 'Topic']].drop_duplicates()
        print(unique_combinations)

Columns with NaN values:
Series([], dtype: int64)


In [27]:
neighbourhoods = filtered_profile_df.columns[5:]
base_cols = ['_id', 'Category', 'Topic', 'Data Source', 'Characteristic']

def revert_table_direction(df):
    melted_df = pd.melt(
        df,
        id_vars=base_cols,
        value_vars=neighbourhoods,
        var_name='neighbourhood',
        value_name='value'
    )

    pivoted_df = melted_df.pivot(
        index=['neighbourhood'],
        columns='Characteristic',
        values='value'
    ).reset_index()

    pivoted_df.columns.name = None

    return pivoted_df

In [28]:
# language df and calculate shannon entropy

language_df = filtered_profile_df[filtered_profile_df['Topic'] == 'Mother tongue']
language_df = revert_table_direction(language_df)

def calculate_shannon_entropy(row):
    """
    Calculate Shannon Entropy for a row of language data.
    Higher values indicate more diversity.
    """
    # Get the total population from the specific column
    total_population = row['Mother tongue for the total population excluding institutional residents']
    
    # Remove non-language columns and convert to numpy array
    values = row.drop(['neighbourhood', 'Mother tongue for the total population excluding institutional residents']).values.astype(float)
    
    # Avoid division by zero
    if total_population == 0:
        return 0
    
    # Calculate proportions using the total population
    proportions = values / total_population
    
    # Remove zeros to avoid log(0)
    proportions = proportions[proportions > 0]
    
    # Calculate Shannon entropy
    entropy = -np.sum(proportions * np.log(proportions))
    
    return entropy

# Calculate Shannon Entropy for each neighborhood
diversity_scores = pd.DataFrame()
diversity_scores['neighbourhood'] = language_df['neighbourhood']
diversity_scores['total_language_population'] = language_df['Mother tongue for the total population excluding institutional residents'].astype(int)
diversity_scores['shannon_diversity'] = language_df.apply(calculate_shannon_entropy, axis=1)

# Sort by diversity score in descending order
diversity_scores = diversity_scores.sort_values('shannon_diversity', ascending=False)

# Display results
print("\nCultural Diversity Scores (Shannon Entropy):")
print("\nTop 10 Most Diverse Neighborhoods:")
print(diversity_scores[['neighbourhood', 'total_language_population', 'shannon_diversity']].head(10))

print("\nBottom 10 Least Diverse Neighborhoods:")
print(diversity_scores[['neighbourhood', 'total_language_population', 'shannon_diversity']].tail(10))

# Calculate summary statistics
print("\nSummary Statistics of Diversity Scores:")
print(diversity_scores['shannon_diversity'].describe())

# Save results to CSV
# diversity_scores.to_csv('../static/data/processed/language_diversity_scores.csv', index=False)


Cultural Diversity Scores (Shannon Entropy):

Top 10 Most Diverse Neighborhoods:
                         neighbourhood  total_language_population  \
43                     Flemingdon Park                      21930   
49                          Henry Farm                      15725   
31                  Don Valley Village                      26850   
87                    Newtonbrook West                      23640   
90                North St. James Town                      18405   
139            York University Heights                      27590   
64       Kingsview Village-The Westway                      21990   
32                         Dorset Park                      24395   
119                   Thorncliffe Park                      20850   
82   Mount Olive-Silverstone-Jamestown                      32825   

     shannon_diversity  
43            6.944351  
49            6.768657  
31            6.617282  
87            6.467129  
90            6.457026  
139     

In [29]:
# add land area to diversity_scores
land_area_df = filtered_profile_df[filtered_profile_df['Topic'] == 'Population and dwellings']
land_area_df = revert_table_direction(land_area_df)

#merge land_area_df and diversity_scores
diversity_scores = pd.merge(land_area_df, diversity_scores, on='neighbourhood', how = 'left')

#print land_area_df
print(diversity_scores.head())


                  neighbourhood  Land area in square kilometres  \
0               Agincourt North                            7.41   
1  Agincourt South-Malvern West                            7.83   
2                     Alderwood                            4.95   
3                         Annex                            2.81   
4             Banbury-Don Mills                            9.98   

   total_language_population  shannon_diversity  
0                      28845           4.799994  
1                      23740           5.494088  
2                      12035           4.893117  
3                      29880           4.326750  
4                      27480           5.816395  


## Calculate Overall Crime Rate for Each Neighbourhood

In [30]:
crime_processed_df = pd.read_csv('../static/data/processed/neighbourhood-crime-rates.csv')
crime_processed_df.head()

Unnamed: 0,neighbourhood,crime_type,year,population,crime_count,crime_rate
0,Agincourt North,Assault,2014,31618,67,219.9462
1,Agincourt North,Assault,2015,31618,77,255.2712
2,Agincourt North,Assault,2016,31618,78,260.3906
3,Agincourt North,Assault,2017,31618,73,240.5034
4,Agincourt North,Assault,2018,31618,80,259.7656


In [31]:
# Filter for 2020 data and calculate overall crime rate
crime_2020_df = crime_processed_df[crime_processed_df['year'] == 2020]

# Group by neighbourhood to sum crime counts and get population
overall_crime_2020 = crime_2020_df.groupby('neighbourhood').agg({
    'crime_count': 'sum',
    'population': 'first'  # Take first population value as it's same for each neighborhood
}).reset_index()

# Rename the population column
overall_crime_2020 = overall_crime_2020.rename(columns={'population': 'population_2020'})

# Calculate overall crime rate per 100,000 people
overall_crime_2020['overall_crime_rate'] = (overall_crime_2020['crime_count'] / overall_crime_2020['population_2020']) * 100000

# Sort by crime rate to see distribution
print("\nTop 10 Neighborhoods by 2020 Overall Crime Rate:")
print(overall_crime_2020.sort_values('overall_crime_rate', ascending=False).head(10))

print("\nBottom 10 Neighborhoods by 2020 Overall Crime Rate:")
print(overall_crime_2020.sort_values('overall_crime_rate').head(10))

# Basic statistics of crime rates
print("\nSummary Statistics of Overall Crime Rates:")
print(overall_crime_2020['overall_crime_rate'].describe())

# Save to CSV if needed
# overall_crime_2020.to_csv('../static/data/processed/overall_crime_rate_2020.csv', index=False)


Top 10 Neighborhoods by 2020 Overall Crime Rate:
               neighbourhood  crime_count  population_2020  overall_crime_rate
79                 Moss Park         1150            23905         4810.709057
62      Kensington-Chinatown          725            21196         3420.456690
23     Church-Yonge Corridor         1216            39279         3095.801828
6        Bay Street Corridor          899            32790         2741.689539
138  York University Heights          733            30277         2420.979621
124   West Humber-Clairville          892            37133         2402.175962
139       Yorkdale-Glen Park          397            17560         2260.820046
54             Humber Summit          288            13458         2139.991083
120               University          171             8433         2027.748132
13               Black Creek          450            23206         1939.153667

Bottom 10 Neighborhoods by 2020 Overall Crime Rate:
                neighbourhoo

In [32]:
#Merge crime rates with diversity scores
select_filter_df = pd.merge(overall_crime_2020, diversity_scores, on='neighbourhood', how = 'left')

#calculate population density
select_filter_df['population_density'] = select_filter_df['population_2020'] / select_filter_df['Land area in square kilometres']

#add a new column for householdincome in 2015
income_df = filtered_profile_df[filtered_profile_df['Topic'] == 'Income of households in 2015']
income_df = revert_table_direction(income_df)

#merge select_filter_df and income_df
select_filter_df = pd.merge(select_filter_df, income_df, on='neighbourhood', how = 'left')

print(select_filter_df.head())

#export to csv
select_filter_df.to_csv('../static/data/processed/select-filter.csv', index=False)



                  neighbourhood  crime_count  population_2020  \
0               Agincourt North          168            31618   
1  Agincourt South-Malvern West          306            27406   
2                     Alderwood          125            13242   
3                         Annex          592            34680   
4             Banbury-Don Mills          190            31186   

   overall_crime_rate  Land area in square kilometres  \
0          531.342906                            7.41   
1         1116.543823                            7.83   
2          943.966168                            4.95   
3         1707.035755                            2.81   
4          609.247739                            9.98   

   total_language_population  shannon_diversity  population_density  \
0                      28845           4.799994         4266.936572   
1                      23740           5.494088         3500.127714   
2                      12035           4.893117      