# Task
Generate a DataFrame showing the unique street names from 'combined_press_releases_final.csv' and whether each street name has an exact or similar match in the 'variant' column of 'malta_street_dimension.csv'. The output DataFrame should contain three columns: 'street', 'found identical', and 'found similar name'.

In [8]:
import pandas as pd

# Read both input tables and lower-case the columns that will be compared,
# so that the later street matching is case-insensitive.
press_releases_df = pd.read_csv('combined_press_releases_final.csv').assign(
    street=lambda frame: frame['street'].str.lower()
)
street_dimension_df = pd.read_csv('malta_street_dimension.csv').assign(
    variant=lambda frame: frame['variant'].str.lower()
)


def _show_preview(frame, banner, columns_caption):
    """Print a banner, the first five rows of `frame`, and its column names."""
    print(banner)
    print(frame.head())
    print(columns_caption, frame.columns.tolist())


# Quick look at each table: first 5 rows followed by the list of columns.
_show_preview(press_releases_df, "--- press_releases_df ---", "\nColumns in press_releases_df:")
_show_preview(street_dimension_df, "\n--- street_dimension_df ---", "\nColumns in street_dimension_df:")

--- press_releases_df ---
   release_id                                           og_title  \
0           1  Collision between a car and a motorbike in Żur...   
1           1  Collision between a car and a motorbike in Żur...   
2           2                    Car-motorcycle traffic accident   
3           2                    Car-motorcycle traffic accident   
4           3              Car-motorcycle collision in Ħal Qormi   

                                          og_content og_date_published  \
0  Today, at around 0930hrs, the Police were info...        2025-10-09   
1  Today, at around 0930hrs, the Police were info...        2025-10-09   
2  Yesterday, at around 1830hrs, the Police were ...        2025-06-20   
3  Yesterday, at around 1830hrs, the Police were ...        2025-06-20   
4  Today, at around 0800hrs, the Police were info...        2025-05-12   

  og_date_modified accident_datetime  is_accident                 street  \
0       2025-10-09  09/10/2025 09:30        

In [9]:
import sys
import os

try:
    from fuzzywuzzy import fuzz
    print("fuzzywuzzy is already installed.")
except ImportError:
    print("fuzzywuzzy is not installed. Installing...")
    %pip install fuzzywuzzy python-Levenshtein
    from fuzzywuzzy import fuzz
    print("fuzzywuzzy installed successfully.")

unique_press_release_streets = press_releases_df['street'].dropna().unique()

print(f"\nNumber of unique streets in press releases: {len(unique_press_release_streets)}")
print("Top 5 unique streets from press_releases_df:", unique_press_release_streets[:5].tolist())


fuzzywuzzy is already installed.

Number of unique streets in press releases: 99
Top 5 unique streets from press_releases_df: ['triq il-belt valletta', 'triq dawret il-gudja', 'valley road', 'triq dawret ħal għaxaq', 'triq il-buqana']


In [17]:
# Match every unique press-release street against the 'variant' column of the
# street dimension table.
#
# For each street the result records:
#   * 'found identical'    - the street equals some variant exactly;
#   * 'found similar name' - no exact match, but the closest variant scores at
#                            least `similarity_threshold` with fuzz.ratio;
#   * 'similar street'     - that closest variant, or 'not applicable'.
#
# The closest (highest-scoring) variant is reported rather than the first one
# that happens to reach the threshold, so the result does not depend on the
# row order of the dimension table.
#
# NOTE(review): names that differ only by diacritics, hyphens or apostrophe
# style are reported as "similar", not "identical"; normalising both sides
# before comparing would be needed to treat them as exact matches.
similarity_threshold = 75  # minimum fuzz.ratio score (0-100) to count as similar

result_columns = [
    'street_Police Press releases',
    'found identical',
    'found similar name',
    'similar street',
]

# Loop-invariant lookups, computed once instead of once per street.
variant_values = street_dimension_df['variant'].dropna().unique()
exact_variants = set(variant_values)
# (original value, lower-cased text) pairs used for the fuzzy comparison.
variant_candidates = [(value, str(value).lower()) for value in variant_values]

# Initialize a list to store results
results = []

for street in unique_press_release_streets:
    found_identical = street in exact_variants
    found_similar = False
    similar_street = 'not applicable'

    if not found_identical:
        # Scan all variants and keep the best-scoring one (first wins on ties).
        street_lower = street.lower()
        best_score = -1
        best_variant = None
        for variant_value, variant_lower in variant_candidates:
            score = fuzz.ratio(street_lower, variant_lower)
            if score > best_score:
                best_score = score
                best_variant = variant_value

        if best_variant is not None and best_score >= similarity_threshold:
            found_similar = True
            similar_street = best_variant

    results.append({
        'street_Police Press releases': street,
        'found identical': found_identical,
        'found similar name': found_similar,
        'similar street': similar_street
    })

# Create the final DataFrame; passing the columns explicitly keeps the frame
# usable (all expected columns present) even when there are no streets.
street_match_df = pd.DataFrame(results, columns=result_columns)

# Boolean masks reused by the summary counts below.
identical_mask = street_match_df['found identical'].astype(bool)
similar_mask = street_match_df['found similar name'].astype(bool)

# Display the resulting DataFrame
print("\n--- Street Matching Results ---")
print(street_match_df.head())
print(f"\nTotal unique streets checked: {len(street_match_df)}")
print(f"Number of streets with identical match: {identical_mask.sum()}")
print(f"Number of streets with similar match (excluding identical): {(similar_mask & ~identical_mask).sum()}")
print(f"Number of streets with NO matching (identical OR similar): {(~similar_mask & ~identical_mask).sum()}")



--- Street Matching Results ---
  street_Police Press releases  found identical  found similar name  \
0        triq il-belt valletta             True               False   
1         triq dawret il-gudja            False                True   
2                  valley road             True               False   
3       triq dawret ħal għaxaq            False                True   
4               triq il-buqana             True               False   

           similar street  
0          not applicable  
1    triq dawret il gudja  
2          not applicable  
3  triq dawret hal ghaxaq  
4          not applicable  

Total unique streets checked: 99
Number of streets with identical match: 63
Number of streets with similar match (excluding identical): 36
Number of streets with NO matching (identical OR similar): 0


In [18]:
print('List of UNDETECTED streets')
# Streets for which neither an exact nor a similar variant was recorded.
no_exact_match = street_match_df['found identical'].eq(False)
no_similar_match = street_match_df['found similar name'].eq(False)
print(street_match_df.loc[no_exact_match & no_similar_match, 'street_Police Press releases'])

List of UNDETECTED streets
Series([], Name: street_Police Press releases, dtype: object)


In [19]:
print('List of similar street detected')
# Streets without an exact match that were paired with a similar variant,
# shown next to the variant they were paired with.
fuzzy_only = (
    street_match_df['found identical'].eq(False)
    & street_match_df['found similar name'].eq(True)
)
print(street_match_df.loc[fuzzy_only, ['street_Police Press releases', 'similar street']])

List of similar street detected
   street_Police Press releases               similar street
1          triq dawret il-gudja         triq dawret il gudja
3        triq dawret ħal għaxaq       triq dawret hal ghaxaq
7              triq tal-barrani             triq tal barrani
12                  żabbar road                  zabbar road
13        triq sant ‘elizabetta        triq sant' elizabetta
16                triq il-baċir                triq il bacir
19    triq il-perit dom mintoff    triq il perit dom mintoff
23              triq għar dalam              triq ghar dalam
24        triq diċembru tlettax        triq dicembru tlettax
27   xatt l-għassara tal-għeneb   xatt l-ghassara tal-gheneb
28    triq il-mina ta’ hompesch     triq il-mina ta'hompesch
30                triq iż-żejfa                triq iz zejfa
36                triq ħompesch                 triq hompesh
37             triq sant’antnin               triq sant anna
38            triq sant’andrija            triq sant 