In [1]:
import pandas as pd
import numpy as np
import random
import csv
import re

# Disable pandas' row truncation so displayed DataFrames/Series show every row.
pd.set_option("display.max_rows", None)

In [2]:
# Load the CPU and GPU benchmark tables; the first CSV column is used as the index.
cpu_df = pd.read_csv('cpu_mark.csv', index_col=0)
gpu_df = pd.read_csv('gpu_mark.csv', index_col=0)

In [3]:
# Lower-cased benchmark names in row order; these are the candidate lists
# that the matching helpers below search by position.
cpu_name_list = [name.lower() for name in cpu_df['CPU Name']]
gpu_name_list = [name.lower() for name in gpu_df['GPU Name']]

In [4]:
# Dump both candidate lists, separated by a blank-line-padded "Next:" marker.
print(cpu_name_list, '', 'Next:', '', gpu_name_list, sep='\n')

['aarch64 rev 2 (aarch64)', 'aarch64 rev 4 (aarch64)', 'ac8257v/wab', 'allwinner a133', 'allwinner a523', 'allwinner a527', 'allwinner h618', 'amd 3015ce', 'amd 3015e', 'amd 3020e', 'amd 4700s', 'amd a4 micro 6400t apu', 'amd a4 pro 3340b', 'amd a4 pro 7300b apu', 'amd a4 pro 7350b', 'amd a4 1200 apu', 'amd a4 1250 apu', 'amd a4 3300 apu', 'amd a4 3300m apu', 'amd a4 3305m apu', 'amd a4 3310mx apu', 'amd a4 3320m apu', 'amd a4 3330mx apu', 'amd a4 3400 apu', 'amd a4 3420 apu', 'amd a4 4000 apu', 'amd a4 4020 apu', 'amd a4 4300m apu', 'amd a4 4355m apu', 'amd a4 5000 apu', 'amd a4 5050 apu', 'amd a4 5100 apu', 'amd a4 5150m apu', 'amd a4 5300 apu', 'amd a4 5300b apu', 'amd a4 6210 apu', 'amd a4 6250j apu', 'amd a4 6300 apu', 'amd a4 6300b apu', 'amd a4 6320 apu', 'amd a4 7210 apu', 'amd a4 7300 apu', 'amd a4 9120', 'amd a4 9120c', 'amd a4 9120e', 'amd a4 9125', 'amd a6 micro 6500t apu', 'amd a6 pro 7050b apu', 'amd a6 pro 7400b', 'amd a6 1450 apu', 'amd a6 3400m apu', 'amd a6 3410mx apu

# Levenshtein

In [5]:
from Levenshtein import ratio
# NOTE: the triple-quoted string below holds a disabled LCS-based similarity
# experiment. As a bare string literal it is never executed and defines nothing.
# NOTE(review): the disabled code prepends a space to both strings but still
# indexes with i - 1 / j - 1, so the comparison looks shifted by one character;
# verify before re-enabling it.
"""
def LCS(s, t):
    n = len(s)
    m = len(t)
    s=' '+s
    t=' '+t

    f = [[0 for i in range(m + 1)] for j in range(n+1)]
     
    for i in range(1,n + 1):
        for j in range(1,m + 1):
            if(s[i - 1] == t[j - 1]):
                f[i][j] = f[i - 1][j - 1] + 1
            else:
                f[i][j] = max(f[i-1][j], f[i][j-1])
    return (f[n][m]+n+m)/(n+m)
"""
def find_relative_string(s, slist):
    """Locate the entry of ``slist`` that is most similar to ``s``.

    Similarity is measured with ``ratio`` (imported from ``Levenshtein``).
    When several entries tie for the best score, the earliest one wins.

    Args:
        s: Query string.
        slist: Sequence of candidate strings.

    Returns:
        A two-item list ``[best_score, best_index]``. It is ``[-1, -1]``
        when ``slist`` is empty.
    """
    best_score, best_index = -1, -1
    for index, candidate in enumerate(slist):
        score = ratio(s, candidate)
        # Strict comparison keeps the first of equally good candidates.
        if score > best_score:
            best_score, best_index = score, index
    return [best_score, best_index]

def transform_cpu(row):
    """Attach the best-matching benchmark CPU name and rank to ``row``.

    The row's ``'CPU'`` value is lower-cased and matched against
    ``cpu_name_list`` with ``find_relative_string``. A match is accepted only
    when its similarity is above 0.5.

    Args:
        row: Record with a ``'CPU'`` entry (for example a ``pandas.Series``
            passed by ``DataFrame.apply(..., axis=1)``). It is modified in place.

    Returns:
        The same ``row`` with ``'CPU Name'`` set from ``cpu_df['CPU Name']`` and
        ``'CPU Mark'`` set from ``cpu_df['CPU Rank']``. Both are NaN when the
        CPU value is missing or no candidate is similar enough.
    """
    # Start from "no match" so every row ends up with both keys, including
    # rows whose 'CPU' value is missing.
    matched_name, matched_mark = np.nan, np.nan

    cpu = row['CPU']
    if pd.notna(cpu):
        acc, pos = find_relative_string(cpu.lower(), cpu_name_list)
        if acc > 0.5:
            # `pos` is a position in cpu_name_list (built in row order), so it
            # must be resolved positionally, not as a label of the CSV index.
            matched_name = cpu_df['CPU Name'].iloc[pos]
            matched_mark = cpu_df['CPU Rank'].iloc[pos]

    row["CPU Name"] = matched_name
    row["CPU Mark"] = matched_mark
    return row

def transform_gpu(row):
    """Attach the best-matching benchmark GPU name and rank to ``row``.

    The row's ``'GPU'`` value is lower-cased and matched against
    ``gpu_name_list`` with ``find_relative_string``. A match is accepted only
    when its similarity is above 0.5.

    Args:
        row: Record with a ``'GPU'`` entry (for example a ``pandas.Series``
            passed by ``DataFrame.apply(..., axis=1)``). It is modified in place.

    Returns:
        The same ``row`` with ``'GPU Name'`` set from ``gpu_df['GPU Name']`` and
        ``'GPU Mark'`` set from ``gpu_df['GPU Rank']``. Both are NaN when the
        GPU value is missing or no candidate is similar enough.
    """
    # Start from "no match" so every row ends up with both keys, including
    # rows whose 'GPU' value is missing.
    matched_name, matched_mark = np.nan, np.nan

    gpu = row['GPU']
    if pd.notna(gpu):
        acc, pos = find_relative_string(gpu.lower(), gpu_name_list)
        if acc > 0.5:
            # `pos` is a position in gpu_name_list (built in row order), so it
            # must be resolved positionally, not as a label of the CSV index.
            matched_name = gpu_df['GPU Name'].iloc[pos]
            matched_mark = gpu_df['GPU Rank'].iloc[pos]

    row["GPU Name"] = matched_name
    row["GPU Mark"] = matched_mark
    return row


# CER

In [6]:
# CER

def character_error_rate(ref_string, hyp_string):
    """Return the character error rate (CER) of ``hyp_string`` against ``ref_string``.

    CER is the Levenshtein edit distance (insertions, deletions and
    substitutions, each costing 1) divided by the length of the reference.

    Args:
        ref_string: Reference sequence.
        hyp_string: Hypothesis sequence to score against the reference.

    Returns:
        ``edit_distance / len(ref_string)``, or ``0`` when the reference is
        empty (whatever the hypothesis contains).
    """
    ref_length = len(ref_string)
    if ref_length == 0:
        # An empty reference scores 0 by this function's contract.
        return 0

    # Rolling-row Levenshtein: `previous[j]` is the distance between the
    # reference prefix processed so far and the first j hypothesis characters.
    # The final cell already is the error count, so no traceback is needed.
    previous = list(range(len(hyp_string) + 1))
    for i, ref_char in enumerate(ref_string, start=1):
        current = [i]
        for j, hyp_char in enumerate(hyp_string, start=1):
            if ref_char == hyp_char:
                current.append(previous[j - 1])
            else:
                current.append(1 + min(previous[j - 1], previous[j], current[j - 1]))
        previous = current

    return previous[-1] / ref_length

In [16]:
def find_relative_string(s, slist):
    """Return ``[score, index]`` for the entry of ``slist`` closest to ``s``.

    This cell re-declares the helper that is already defined earlier in the
    notebook with the same logic; running it replaces that earlier definition.

    Args:
        s: Query string.
        slist: Sequence of candidate strings, scored with ``ratio``.

    Returns:
        ``[best_score, best_index]``; ``[-1, -1]`` if ``slist`` is empty.
        On ties the first candidate is kept.
    """
    found = [-1, -1]
    for position, candidate in enumerate(slist):
        similarity = ratio(s, candidate)
        if similarity > found[0]:
            found = [similarity, position]
    return found


def transform_cpu_test(name):
    """Look up the benchmark CPU name that best matches ``name``.

    The input is lower-cased and every ``'ghz'`` substring is removed before
    it is matched against ``cpu_name_list``. The score and position of the
    best candidate are printed for inspection.

    Args:
        name: Free-form CPU description.

    Returns:
        The matching ``cpu_df['CPU Name']`` value when the similarity is above
        0.5, otherwise an empty string.
    """
    name_low = name.lower().replace('ghz','')
    acc, pos = find_relative_string(name_low, cpu_name_list)
    print(acc, pos)
    if acc > 0.5:
        # `pos` indexes cpu_name_list by position, so resolve it with .iloc
        # instead of treating it as a label of the CSV index.
        return cpu_df['CPU Name'].iloc[pos]
    return ''

def transform_gpu_test(name):
    """Look up the benchmark GPU name that best matches ``name``.

    The input is lower-cased and matched against ``gpu_name_list``. The score
    and position of the best candidate are printed for inspection.

    Args:
        name: Free-form GPU description.

    Returns:
        The matching ``gpu_df['GPU Name']`` value when the similarity is above
        0.5, otherwise an empty string.
    """
    name_low = name.lower()
    acc, pos = find_relative_string(name_low, gpu_name_list)
    print(acc, pos)
    if acc > 0.5:
        # `pos` indexes gpu_name_list by position, so resolve it with .iloc
        # instead of treating it as a label of the CSV index.
        return gpu_df['GPU Name'].iloc[pos]
    return ''

Intel Core i5-1335U (13th Gen)      87

Intel Core i7-1355U (13th Gen)      77

Intel Core i7-1360P (13th Gen)   

In [12]:
# Sample CPU description for manual checks; a later cell also passes it to process.extract.
cpu_name = 'Intel Core i5 13th Gen 13420H (2.1)'
# print(transform_cpu_test(cpu_name))

In [13]:
# Sample GPU description for manual checks; a later cell also passes it to process.extract.
gpu_name = 'geforce rtx 4050'
# print(transform_gpu_test(gpu_name))
# print(ratio('intel iris xe graphics', 'intel iris xe'))

# Fuzzy String Matching

In [1]:
!pip install thefuzz
!pip install fuzzywuzzy

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.10.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.10.1-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.6 MB 653.6 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.6 MB 2.0 MB/s eta 0:00:01
   ------------ --------------------------- 0.5/1.6 MB 3.5 MB/s eta 0:00:01
   ---------------------- ----------------- 0.9/1.6 MB 4.8 MB/s eta 0:00:01
   ----------------------------------- ---- 1.4/1.6 MB 6.1 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 6.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.10.1 thefuzz-0.22.1



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from thefuzz import fuzz, process

# Show the best candidates for the sample CPU string (process.extract's default limit).
# The stray trailing `fuzz.` argument was a syntax error and has been removed.
print(process.extract(cpu_name.lower(), cpu_name_list))

[('aarch64 rev 2 (aarch64)', 86), ('amd epyc 3101 4 core', 86), ('amd fx 670k quad core', 86), ('amd fx 770k quad core', 86), ('amd fx 870k quad core', 86)]


In [None]:
from thefuzz import fuzz, process

# Show the three best candidates for the sample GPU string.
print(process.extract(gpu_name.lower(), gpu_name_list, limit=3))

In [28]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Candidates to search: the lower-cased CPU benchmark names
string_list = cpu_name_list

# Query string to look up
input_string = 'AMD Ryzen 5 5500U (1.5 GHz)'

# Pick the single best candidate, scored with fuzz.ratio
best_match = process.extractOne(input_string, string_list, scorer=fuzz.ratio)

# best_match[0] is the matched name, best_match[1] its score
print("Input String:", input_string)
print("Best Match:", best_match[0], "with a similarity score of", best_match[1])


Input String: AMD Ryzen 5 5500U (1.5 GHz)
Best Match: amd ryzen 5 5500h with a similarity score of 79


In [22]:
# Candidates to search: the lower-cased GPU benchmark names
# NOTE: this cell has no imports of its own; it relies on `process` and `fuzz`
# having been imported by a previously run cell.
string_list = gpu_name_list

# Query string to look up
input_string = 'geforce rtx 4050'

# Pick the single best candidate, scored with fuzz.ratio
best_match = process.extractOne(input_string, string_list, scorer=fuzz.ratio)

# best_match[0] is the matched name, best_match[1] its score
print("Input String:", input_string)
print("Best Match:", best_match[0], "with a similarity score of", best_match[1])

Input String: geforce rtx 4050
Best Match: geforce rtx 2050 with a similarity score of 94


print("FuzzyWuzzy Ratio: ", fuzz.ratio(s1, s2))

print("FuzzyWuzzy PartialRatio: ", fuzz.partial_ratio(s1, s2))

print("FuzzyWuzzy TokenSortRatio: ", fuzz.token_sort_ratio(s1, s2))

print("FuzzyWuzzy TokenSetRatio: ", fuzz.token_set_ratio(s1, s2))

print("FuzzyWuzzy WRatio: ", fuzz.WRatio(s1, s2), '\n\n')

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def mapping(s, slist):
    """Find the entry of ``slist`` that best matches ``s``.

    ``s`` is lower-cased, then each candidate is scored with the mean of four
    ``fuzz`` scorers: ``ratio``, ``partial_ratio``, ``token_sort_ratio`` and
    ``token_set_ratio``. Candidates are compared as given (not lower-cased here).

    Args:
        s: Query string.
        slist: Sequence of candidate strings.

    Returns:
        A two-item list ``[best_score, best_index]``. It stays ``[0, -1]`` when
        ``slist`` is empty or no candidate scores above 0. On ties the first
        candidate is kept.
    """
    query = s.lower()
    best_score, best_index = 0, -1
    for index, candidate in enumerate(slist):
        scores = (
            fuzz.ratio(candidate, query),
            fuzz.partial_ratio(candidate, query),
            fuzz.token_sort_ratio(candidate, query),
            fuzz.token_set_ratio(candidate, query),
        )
        # Average of the four scorers (taking their max was an earlier alternative).
        mean_score = sum(scores) / 4
        if mean_score > best_score:
            best_score, best_index = mean_score, index
    return [best_score, best_index]



# Query strings are defined before use so this cell runs on a fresh kernel
# (cpu_string used to be assigned only after its first use).
# NOTE(review): 'Â®' looks like a mis-decoded '®' — confirm against the data source.
cpu_string = 'Intel Core i7-1365U vProÂ® Processor'
gpu_string = 'Geforce RTX 4050'

# mapping() returns [score, position]; print it, then the matched benchmark name.
best_match_cpu = mapping(cpu_string, cpu_name_list)
print(best_match_cpu)
print(cpu_df['CPU Name'][best_match_cpu[1]])


print('--------------------------------------------')

best_match_gpu = mapping(gpu_string, gpu_name_list)
print(best_match_gpu)
print(gpu_df['GPU Name'][best_match_gpu[1]])

[83.0, 2495]
Intel Core i7 1365U
--------------------------------------------
[94.0, 618]
GeForce RTX 2050
