# Importing Libraries

In [1]:
# libraries for automating the data analysis process

import os 
from glob import glob
import re

# libraries for data analysis

import pandas as pd
from fuzzywuzzy import fuzz

# Defining Folder Variables

In [2]:
# Set relative paths for different folders
MAIN_FOLDER = os.path.abspath(os.path.join(os.getcwd(), ".."))

CODE_FOLDER = os.path.join(MAIN_FOLDER, "Code")

PROFILE_FOLDER = os.path.join(CODE_FOLDER, "ProfileReports")

DATA_FOLDER = os.path.join(MAIN_FOLDER, "Data")

RAW_SHEETS_FOLDER = os.path.join(DATA_FOLDER, "Raw_Sheets")

CLEANED_SHEETS_FOLDER = os.path.join(DATA_FOLDER, "Cleaned_Sheets")


# Defining Sheet Variables

In [3]:
SAVE_TO_CSV = os.path.join(CLEANED_SHEETS_FOLDER, "cleaned_sheet.csv")

FINAL_SHEET = os.path.join(CLEANED_SHEETS_FOLDER, "powerbi_analysis.csv")

# Loading Data

In [4]:
df = pd.read_csv(SAVE_TO_CSV)

df

Unnamed: 0,OrderNumber,CUSTID,JOBTYPE,Department,FROMLOCSTR,TOLOCSTR,STARTTM,ENDTM,TripDuration,CTCOMPLETEDT,DriverId,PrimeMoverId,TrailerId,JOINED DATE,RESIGNED DATE,RACE,Team,Status,AGE_Years,Seniority
0,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 08:00:00,2021-01-07 08:30:00,0.500000,2021-01-07,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72
1,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 10:30:00,2021-01-07 11:00:00,0.500000,2021-01-07,2,215,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72
2,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 08:00:00,2021-01-27 08:30:00,0.500000,2021-01-27,2,215,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72
3,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 10:30:00,2021-01-27 11:00:00,0.500000,2021-01-27,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72
4,TL9008550,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-11 08:00:00,2021-01-11 08:30:00,0.500000,2021-01-11,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,13.0,53.58,13.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081779,TE8037442,00-RA186,EXPORT,MAIN,PTC48(G),42PANDAN1,2023-12-06 09:50:00,2023-12-06 10:00:00,0.166667,2023-12-06,220,222,1772.0,2008-03-17,2023-12-31,CHINESE,FCL,34.0,51.33,15.79
1081780,TL9050450,00-RA186,LOCAL,CHIEN,COGENT TANK 15,48DRUMMING,2023-12-04 10:30:00,2023-12-04 12:00:00,1.500000,2023-12-04,37,145,119.0,2004-06-25,2023-12-31,CHINESE,CHIEN LI,8.0,68.25,19.52
1081781,TL9050450,00-RA186,LOCAL,CHIEN,48DRUMMING,STOLT NEL,2023-12-06 05:39:00,2023-12-06 05:59:00,0.333333,2023-12-06,302,394,119.0,2008-08-22,2023-05-10,CHINESE,CHIEN LI,21.0,39.08,14.71
1081782,TE8037521,00-RA186,EXPORT,MAIN,42PANDAN1,PSA/PPT,2023-12-13 09:30:00,2023-12-13 10:30:00,1.000000,2023-12-13,110,424,1937.0,2000-11-20,2022-10-02,CHINESE,FCL,8.0,58.92,21.86


# Fuzzy String Matching Algorithm

## Preprocessing Strings via Regex

In [5]:
# first use regex to replace all non-alphanumeric characters with a space

def preprocess_street_name(name):
    """
    Preprocess the street name by removing special characters.
    """

    pattern = r'[^A-Za-z0-9\s]'  # Matches any character that is not a letter, digit, or whitespace
    
    # replace non- letter, digit, or whitespace with a space
    cleaned_name = re.sub(pattern, ' ', name)
    
    return cleaned_name

df['FROMLOCSTR'] = df['FROMLOCSTR'].apply(lambda x: preprocess_street_name(x))
df['TOLOCSTR'] = df['TOLOCSTR'].apply(lambda x: preprocess_street_name(x))

df.head(-10)

Unnamed: 0,OrderNumber,CUSTID,JOBTYPE,Department,FROMLOCSTR,TOLOCSTR,STARTTM,ENDTM,TripDuration,CTCOMPLETEDT,DriverId,PrimeMoverId,TrailerId,JOINED DATE,RESIGNED DATE,RACE,Team,Status,AGE_Years,Seniority
0,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 08:00:00,2021-01-07 08:30:00,0.5,2021-01-07,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72
1,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 10:30:00,2021-01-07 11:00:00,0.5,2021-01-07,2,215,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72
2,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 08:00:00,2021-01-27 08:30:00,0.5,2021-01-27,2,215,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72
3,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 10:30:00,2021-01-27 11:00:00,0.5,2021-01-27,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72
4,TL9008550,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-11 08:00:00,2021-01-11 08:30:00,0.5,2021-01-11,2,215,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,13.0,53.58,13.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081769,XI7007593,00-RE114,IMPORT,XINHUA,31GUL CR,14 PENJ RD,2023-12-12 08:30:00,2023-12-12 09:30:00,1.0,2023-12-12,169,296,1771.0,2007-12-01,2023-12-31,CHINESE,Team 1,13.0,55.33,16.08
1081770,XI7007593,00-RE114,IMPORT,XINHUA,14 PENJ RD,EKY2,2023-12-12 09:30:00,2023-12-12 10:00:00,0.5,2023-12-12,169,296,1771.0,2007-12-01,2023-12-31,CHINESE,Team 1,13.0,55.33,16.08
1081771,XI7007610,00-RE114,IMPORT,XINHUA,PSA PPT,PTC ISL G,2023-12-12 09:00:00,2023-12-12 10:00:00,1.0,2023-12-12,379,338,1499.0,2014-11-01,2023-12-31,MALAY,Team 1,9.0,55.08,9.16
1081772,XI7007610,00-RE114,IMPORT,XINHUA,PTC ISL G,CWT47JLNBUROH,2023-12-13 11:00:00,2023-12-13 12:00:00,1.0,2023-12-13,356,336,1934.0,2012-08-24,2023-12-31,MALAY,Team 1,8.0,37.42,11.35


# Current # of FROMLOCSTR

In [6]:
df['FROMLOCSTR'].nunique()

483

In [7]:
df['FROMLOCSTR'].unique()

array(['JI DRUM', '1 SEARAYA', 'J ISLAND', '1 SERAYA AVE', 'ISLAND G ',
       '31 AYER', '31AYERMBRD', '2 SERAYA P', 'STOLT NEL', '2SERAYA PL',
       'MER GROUND', 'PTCISLAND', 'NX5', '48DRUMMING', '2SERAYA RI',
       'JOO LONG', '35SHIPYARD', '1 TUAS', '18 PIONEER', '39 SHIPYAR',
       'P A C', 'STOLTNEL', '2 BANYAN', '1SAKRAVE', '38 GUL', 'TPC',
       'DPONTSKR', '59PENJURU', '250 ISLAND', '30 GULCRES', '162 GUL',
       '14TUAS DR2', '800 SUPER', 'CONT CONN', 'EKY2', 'ITS 23PIOSEC',
       'GREENEARTH', '5 SGKADUT', 'PTC ISLAND', 'INF PAC', '41 TUASCRE',
       '10 TAMPINE', 'NTH COAST', '21 J BUROH', '60WOODLAND', 'EXXON PAC',
       '21 J PORT', '19B JGPIER', '46TUASCRES', '21GUL LANE', 'EXXONPAC',
       '28GULCRES', '5JLN BESUT', '5 TG PENJ', '20PENJURU', '15TEMBUSU',
       '20 BANYAN', 'CWT INT', '21SAKRA', '39SHIPYARD', 'PTCMRTGR',
       'JPEIRCP', 'ITS 162', '48 PTC', '210JLNBROH', '1 BUROH',
       'PACIFICTRA', 'OCW3', 'ALLIED 8', 'CWT 24 P C', 'PTCISL OFF',
       '

# Current # of TOLOCSTR

In [8]:
df['TOLOCSTR'].nunique()

493

In [9]:
df['TOLOCSTR'].unique()

array(['1 SERAYA AVE', '31 AYER', '5 SGKADUT', 'J ISLAND', 'JI DRUM',
       'ISLAND G ', 'MER GROUND', '1 SEARAYA', '31AYERMBRD', '48DRUMMING',
       '2SERAYA RI', 'JOO LONG', '2SERAYA RI 2', 'STSS', 'STOLT NEL',
       '39 SHIPYAR', 'P A C', 'PTC ISLAND', 'PTCISLAND', 'TPC',
       '250 ISLAND', '1 SERAYA P', '1 TUAS', '1SAKRAVE', '2 BANYAN',
       '38 GUL', '35SHIPYARD', '59PENJURU', '15TEMBUSU', '800 SUPER',
       'OCW3', 'NX5', '30 GULCRES', '14TUAS DR2', '162 GUL', 'CONT CONNE',
       'CONT CONN', 'ECO', 'GREENEARTH', 'EKY2', '39SHIPYARD',
       '210CALTEX', 'KT', '10TAMPINES', '19B JGPIER', '21 J BUROH',
       'NTH COAST', 'INF PAC', 'ANX5', 'EXXONPAC', 'EXXON PAC',
       '21 J PORT', '150TUASSTH', '5JLN BESUT', '20PENJURU', '21GUL LANE',
       '28GULCRES', 'PSA TPT', '20 BANYAN', '5 TG PENJ', 'DPONTSKR',
       'CWT INT', '48 PTC', 'ITS 162', 'JPEIRCP', 'PTCMRTGR', '21SAKRA',
       '42 PANDAN', 'PTC48 PARK', 'PTCISL OFF', 'PPT', 'BT', '42PTC',
       '210JLNBROH', 'TOT

# Current Unique 1-1 Locations

In [10]:
unique_trip_count = df.groupby(['FROMLOCSTR', 'TOLOCSTR']).size().count()
print(unique_trip_count)

7921


## Fuzzy Matching using Token Set Ratio

In [11]:
# calculate Levenshtein distance between two strings, considering set intersections
def levenshtein_distance(s1, s2):
    return fuzz.token_set_ratio(s1, s2)

THRESHOLD = 75

def group_similar_addresses(df):
    grouped_addresses = {}
    for from_loc in df['FROMLOCSTR'].unique():
        similar_addresses = [from_loc]  # Include itself in the group
        for address in grouped_addresses.keys():
            similarity = levenshtein_distance(from_loc, address)
            if similarity >= THRESHOLD:
                similar_addresses.append(address)
        for address in similar_addresses:
            grouped_addresses[address] = similar_addresses
    return grouped_addresses

similar_address_groups = group_similar_addresses(df)

def assign_corrected_address(from_loc):
    for correct_address, similar_addresses in similar_address_groups.items():
        if from_loc in similar_addresses:
            return correct_address

# lambda function with .apply() speeds up the process
df['CORRECTFROM'] = df['FROMLOCSTR'].apply(lambda x: assign_corrected_address(x))

df['CORRECTTO'] = df['TOLOCSTR'].apply(lambda x: assign_corrected_address(x))

df 


Unnamed: 0,OrderNumber,CUSTID,JOBTYPE,Department,FROMLOCSTR,TOLOCSTR,STARTTM,ENDTM,TripDuration,CTCOMPLETEDT,...,TrailerId,JOINED DATE,RESIGNED DATE,RACE,Team,Status,AGE_Years,Seniority,CORRECTFROM,CORRECTTO
0,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 08:00:00,2021-01-07 08:30:00,0.500000,2021-01-07,...,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72,JI DRUM,1 SEARAYA
1,TL9008453,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-07 10:30:00,2021-01-07 11:00:00,0.500000,2021-01-07,...,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,16.0,53.58,13.72,JI DRUM,1 SEARAYA
2,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 08:00:00,2021-01-27 08:30:00,0.500000,2021-01-27,...,972.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72,JI DRUM,1 SEARAYA
3,TL9009105,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-27 10:30:00,2021-01-27 11:00:00,0.500000,2021-01-27,...,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,6.0,53.58,13.72,JI DRUM,1 SEARAYA
4,TL9008550,00-RE001,LOCAL,CHIEN,JI DRUM,1 SERAYA AVE,2021-01-11 08:00:00,2021-01-11 08:30:00,0.500000,2021-01-11,...,1538.0,2007-06-07,2021-02-23,CHINESE,CHIEN LI,13.0,53.58,13.72,JI DRUM,1 SEARAYA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081779,TE8037442,00-RA186,EXPORT,MAIN,PTC48 G,42PANDAN1,2023-12-06 09:50:00,2023-12-06 10:00:00,0.166667,2023-12-06,...,1772.0,2008-03-17,2023-12-31,CHINESE,FCL,34.0,51.33,15.79,PTC48 G,48PANDAN
1081780,TL9050450,00-RA186,LOCAL,CHIEN,COGENT TANK 15,48DRUMMING,2023-12-04 10:30:00,2023-12-04 12:00:00,1.500000,2023-12-04,...,119.0,2004-06-25,2023-12-31,CHINESE,CHIEN LI,8.0,68.25,19.52,COGENT TANK 15,48DRUMMING
1081781,TL9050450,00-RA186,LOCAL,CHIEN,48DRUMMING,STOLT NEL,2023-12-06 05:39:00,2023-12-06 05:59:00,0.333333,2023-12-06,...,119.0,2008-08-22,2023-05-10,CHINESE,CHIEN LI,21.0,39.08,14.71,48DRUMMING,STOLT NEL
1081782,TE8037521,00-RA186,EXPORT,MAIN,42PANDAN1,PSA PPT,2023-12-13 09:30:00,2023-12-13 10:30:00,1.000000,2023-12-13,...,1937.0,2000-11-20,2022-10-02,CHINESE,FCL,8.0,58.92,21.86,48PANDAN,PPT


# New number of Unique 1-1 Combinations

In [12]:
unique_trip_count = df.groupby(['CORRECTFROM', 'CORRECTTO']).size().count()
print(unique_trip_count)

4741


# Saving Final DataFrame

In [13]:
df.to_csv(FINAL_SHEET, index=False)