In [1]:
import pandas as pd
from src.firefall import FirefallService
from src.imss import IMSService

# Target Adobe environment. The same value is passed to both services so the IMS
# token and the Firefall client can never end up pointing at different environments.
adobe_env = 'prod'
ims_service = IMSService(adobe_env)
firefall_service = FirefallService(ims_service.ACCESS_TOKEN, adobe_env)

def longest_common_subsequence(a, b):
    '''
    Longest Common Subsequence (LCS) similarity between two strings.

    LCS measures the longest sequence of characters that appear in both strings
    in the same order (not necessarily contiguously).
    Example:
    url1: 'abcde'
    url2: 'ace'
    LCS: 'ace'
    similarity = 3 / max(5, 3) = 0.6 (over length of the longest string)

    a, b: the two strings (or sequences) to compare
    returns: a float in [0, 1]; 1.0 means identical. Two empty inputs are treated
             as identical and score 1.0.
    '''
    m, n = len(a), len(b)
    longest = max(m, n)
    if longest == 0:
        # Both inputs are empty: identical by definition, and avoids 0 / 0.
        return 1.0

    # Only the previous row of the DP table is ever read, so keep two rows
    # instead of the full (m + 1) x (n + 1) table.
    prev_row = [0] * (n + 1)
    for i in range(1, m + 1):
        curr_row = [0] * (n + 1)
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                curr_row[j] = prev_row[j - 1] + 1
            else:
                curr_row[j] = max(prev_row[j], curr_row[j - 1])
        prev_row = curr_row
    return prev_row[n] / longest


def levenshtein_distance(a, b):
    '''
    Levenshtein similarity between two strings.

    Levenshtein Distance measures the minimum number of single-character edits
    (insertions, deletions, or substitutions) needed to change one string into the other.
    Note that this function returns the normalised *similarity*, not the raw distance.
    Example:
    url1: 'kitten'
    url2: 'sitting'
    edits: 'k' -> 's', 'e' -> 'i', add 'g' : 3 edits
    similarity = 1 - 3 / max(6, 7) = 0.5714 (over length of the longest string)

    a, b: the two strings (or sequences) to compare
    returns: a float in [0, 1]; 1.0 means identical. Two empty inputs are treated
             as identical and score 1.0.
    '''
    m, n = len(a), len(b)
    longest = max(m, n)
    if longest == 0:
        # Both inputs are empty: zero edits needed, and avoids 0 / 0.
        return 1.0

    # Row 0 of the DP table: turning '' into b[:j] takes j insertions.
    prev_row = list(range(n + 1))
    for i in range(1, m + 1):
        # Column 0: turning a[:i] into '' takes i deletions.
        curr_row = [i] + [0] * n
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                curr_row[j] = prev_row[j - 1]
            else:
                # deletion, insertion, substitution
                curr_row[j] = 1 + min(prev_row[j], curr_row[j - 1], prev_row[j - 1])
        prev_row = curr_row
    return 1 - (prev_row[n] / longest)


def jaccard_similarity(a, b):
    '''
    Jaccard Similarity: the size of the intersection divided by the size of the union of
    the two sets of characters.
    Example:
    url1: 'night'
    url2: 'nacht'
    intersection: {'n', 'h', 't'} = 3
    union: {'n', 'i', 'g', 'h', 't', 'a', 'c'} = 7
    similarity = 3 / 7 = 0.4286

    a, b: the two strings (or iterables of hashable items) to compare
    returns: a float in [0, 1]. Character order and repetition are ignored, so
             anagrams score 1.0. Two empty inputs are treated as identical (1.0).
    '''
    set_a, set_b = set(a), set(b)
    union = len(set_a | set_b)
    if union == 0:
        # Both inputs are empty: identical by definition, and avoids 0 / 0.
        return 1.0
    intersection = len(set_a & set_b)
    return intersection / union


def sorensen_dice_coefficient(a, b):
    '''
    Sørensen–Dice Coefficient: similar to Jaccard but gives more weight to common elements.
    It measures the similarity between two sets of characters as twice the size of the
    intersection divided by the sum of the sizes of both sets.
    Example:
    url1: 'night'
    url2: 'nacht'
    intersection: {'n', 'h', 't'} = 3
    total size of both sets: 5 + 5 = 10
    similarity = 2 * 3 / 10 = 0.6

    a, b: the two strings (or iterables of hashable items) to compare
    returns: a float in [0, 1]. Character order and repetition are ignored.
             Two empty inputs are treated as identical and score 1.0.
    '''
    set_a, set_b = set(a), set(b)
    total_size = len(set_a) + len(set_b)
    if total_size == 0:
        # Both inputs are empty: identical by definition, and avoids 0 / 0.
        return 1.0
    intersection = len(set_a & set_b)
    return 2 * intersection / total_size

def find_best_match(broken_url, sitemap_urls, metric_function):
    '''
    Pick the candidate URL that scores highest against a broken URL.

    broken_url: the broken URL to find the best match for
    sitemap_urls: a list of URLs to compare against
    metric_function: the similarity metric to use, called as metric_function(broken_url, url),
                     e.g. longest_common_subsequence, levenshtein_distance, jaccard_similarity,
                     sorensen_dice_coefficient
    returns: a (best_match, best_score) tuple; (None, -1) when no candidate scores above -1
             (for instance when sitemap_urls is empty)
    '''
    leader = (None, -1)

    for candidate in sitemap_urls:
        candidate_score = metric_function(broken_url, candidate)
        # Strictly greater: on a tie the earliest candidate keeps the lead.
        leader = (candidate, candidate_score) if candidate_score > leader[1] else leader

    return leader

def get_best_matches_df(broken_backlinks, sitemap_urls):
    '''
    Build a table with the best sitemap match for every broken URL, one column per metric.

    broken_backlinks: iterable of broken URLs (one output row each)
    sitemap_urls: list of candidate URLs each broken URL is compared against
    returns: a pandas DataFrame with columns 'Broken URL', 'LCS Best Match',
             'Levenshtein Best Match', 'Jaccard Best Match' and 'Sørensen-Dice Best Match'.
             Only the matched URL is kept; the similarity scores are discarded.
    '''
    # Output column -> metric used to fill it (insertion order fixes the column order).
    metric_for_column = {
        'LCS Best Match': longest_common_subsequence,
        'Levenshtein Best Match': levenshtein_distance,
        'Jaccard Best Match': jaccard_similarity,
        'Sørensen-Dice Best Match': sorensen_dice_coefficient,
    }

    table = {'Broken URL': []}
    for column_name in metric_for_column:
        table[column_name] = []

    for broken_url in broken_backlinks:
        table['Broken URL'].append(broken_url)
        for column_name, metric in metric_for_column.items():
            matched_url, _ = find_best_match(broken_url, sitemap_urls, metric)
            table[column_name].append(matched_url)

    return pd.DataFrame(table)



<Response [200]>


In [2]:
# Read the broken backlinks and sitemap URLs from the files (one URL per line).
# Surrounding whitespace is stripped and blank lines are skipped so that a stray
# empty line never reaches the similarity metrics as an empty "URL".
# The encoding is explicit so the result does not depend on the platform default.
with open('data/lovesac/broken_urls.txt', encoding='utf-8') as f:
    broken_backlinks = [line.strip() for line in f.read().splitlines() if line.strip()]

with open('data/lovesac/sitemap_urls.txt', encoding='utf-8') as f:
    sitemap_urls = [line.strip() for line in f.read().splitlines() if line.strip()]

df_best_matches = get_best_matches_df(broken_backlinks, sitemap_urls)
df_best_matches


Unnamed: 0,Broken URL,LCS Best Match,Levenshtein Best Match,Jaccard Best Match,Sørensen-Dice Best Match
0,https://www.lovesac.com/sacs/bigone.html,https://www.lovesac.com/sacs/inserts,https://www.lovesac.com/sacs/inserts,https://www.lovesac.com/financing,https://www.lovesac.com/financing
1,https://www.lovesac.com/footsac-blanket-mousse...,https://www.lovesac.com/products/footsac-blank...,https://www.lovesac.com/products/footsac-blank...,https://www.lovesac.com/products/footsac-blank...,https://www.lovesac.com/products/footsac-blank...
2,https://www.lovesac.com/25th-anniversary-sweep...,https://www.lovesac.com/throw-pillows/covers/s...,https://www.lovesac.com/to-the-trade-details,https://www.lovesac.com/products/sactionals-dr...,https://www.lovesac.com/products/sactionals-dr...
3,https://www.lovesac.com/sacs/lovesac-x-jeremy-...,https://www.lovesac.com/stealthtech-sound-char...,https://www.lovesac.com/sacs/covers/select,https://www.lovesac.com/privacy-policy,https://www.lovesac.com/privacy-policy
4,https://www.lovesac.com/sactionals/learn.html,https://www.lovesac.com/sactionals/inserts,https://www.lovesac.com/sactionals/inserts,https://www.lovesac.com/inspiration,https://www.lovesac.com/inspiration
5,https://www.lovesac.com/warranty,https://www.lovesac.com/clearance,https://www.lovesac.com/cart,https://www.lovesac.com/clearance,https://www.lovesac.com/clearance
6,https://www.lovesac.com/history/,https://www.lovesac.com/how-to,https://www.lovesac.com/heroes,https://www.lovesac.com/privacy-policy,https://www.lovesac.com/privacy-policy
7,https://www.lovesac.com/sactionals/2-seats-4-s...,https://www.lovesac.com/products/sactionals-an...,https://www.lovesac.com/sactionals-with-stealt...,https://www.lovesac.com/products/24x24-throw-p...,https://www.lovesac.com/products/24x24-throw-p...
8,https://www.lovesac.com/sacs/pillowsac.html,https://www.lovesac.com/sacs/sac-bundles,https://www.lovesac.com/sacs/inserts,https://www.lovesac.com/accessories,https://www.lovesac.com/accessories
9,https://www.lovesac.com/buy-furniture/sac/supe...,https://www.lovesac.com/sactionals/inserts/select,https://www.lovesac.com/squattomans/covers,https://www.lovesac.com/know-your-fabrics,https://www.lovesac.com/know-your-fabrics


In [40]:
# Prompt for the LLM: asks it to validate a similarity-based redirect suggestion for a
# broken backlink and to fall back to a category page or the homepage when the suggestion
# is not a good match. It contains one worked example followed by the pair to evaluate.
# NOTE: the broken link and the suggested link near the end are hard-coded for a single
# case; they are not taken from df_best_matches.
prompt = '''

As an SEO expert tasked with fixing broken backlinks, you will receive an alternative URL from the same website, selected based on similarity indices. 
Please be aware that these suggestions might not be completely accurate.

Carefully review the provided URLs. 
If the suggested URL does not seem to be an exact match, opt to redirect to the homepage or, if it pertains to a product, the relevant category page.

Matching Process:
- Identify the keywords after the last slash in the URLs.
- Analyze the keywords semantically to understand their meaning.
- Evaluate similarity: If the keywords are semantically similar, the URL is considered a match.
Remember, the goal is to redirect users to a page that closely matches their original intent. 
It is preferable to redirect to a related category page or the main page rather than a completely unrelated one.

For example, for the following example, as the URLs were not a good match, and broken link is a product page, then it is better to redirect to the category page.
Broken link: https://www.lovesac.com/sacs/bigone.html
Alternative URL link: https://www.lovesac.com/financing
Suggested URL: https://www.lovesac.com/sacs/

Now, give suggested URL for the following:
Here is the broken link: https://www.lovesac.com/footsac-eskimo-phur.html
Here is the suggested link: https://www.lovesac.com/sacs-inspiration-page	

The output should be a JSON containing the suggested URL and AI rationale.

'''
# Send the prompt to the Firefall service.
# NOTE(review): the meaning of the first argument (11) is not visible in this notebook;
# presumably a model/configuration identifier. Confirm against FirefallService.completions.
response = firefall_service.completions(11, prompt)
# Bare last expression: the notebook displays the text of the first generation's message.
response['generations'][0][0]['message']['content']

200 {"conversation_identifier":null,"query_id":null,"llm_type":"azure_chat_openai","generations":[[{"text":"{\n  \"suggested_url\": \"https://www.lovesac.com/sacs/\",\n  \"AI_rationale\": \"The broken link contains the keyword 'footsac-eskimo-phur' which is related to a product. The suggested URL is the main category page for 'sacs' which is a more relevant page for users looking for different types of sacs.\"\n}","generation_info":{"finish_reason":"stop","content_filter_results":{"hate":{"filtered":false,"severity":"safe"},"self_harm":{"filtered":false,"severity":"safe"},"sexual":{"filtered":false,"severity":"safe"},"violence":{"filtered":false,"severity":"safe"}}},"type":"ChatGeneration","message":{"content":"{\n  \"suggested_url\": \"https://www.lovesac.com/sacs/\",\n  \"AI_rationale\": \"The broken link contains the keyword 'footsac-eskimo-phur' which is related to a product. The suggested URL is the main category page for 'sacs' which is a more relevant page for users looking for 

'{\n  "suggested_url": "https://www.lovesac.com/sacs/",\n  "AI_rationale": "The broken link contains the keyword \'footsac-eskimo-phur\' which is related to a product. The suggested URL is the main category page for \'sacs\' which is a more relevant page for users looking for different types of sacs."\n}'

In [41]:
# print() (rather than a bare expression) so the newlines in the message are rendered
# and the returned JSON is shown as readable, multi-line text.
print(response['generations'][0][0]['message']['content'])

{
  "suggested_url": "https://www.lovesac.com/sacs/",
  "AI_rationale": "The broken link contains the keyword 'footsac-eskimo-phur' which is related to a product. The suggested URL is the main category page for 'sacs' which is a more relevant page for users looking for different types of sacs."
}
