In [1]:
import pandas as pd
import numpy as np

In [2]:
## reading the cleaned data

data = pd.read_csv("ebay_cleaned.csv")

## lowering the Token column
data["Token_Lower"] = data["Token"].str.lower()

In [3]:
data.head(10)

Unnamed: 0,Record Number,Title,Token,Tag,Clean_Token,Simple_Tag,Token_Lower
0,1,Supreme Nike SB Dunk High By any Means Red US1...,Supreme,Modell,supreme,,supreme
1,1,Supreme Nike SB Dunk High By any Means Red US1...,Nike,Marke,nike,,nike
2,1,Supreme Nike SB Dunk High By any Means Red US1...,SB,Produktlinie,sb,,sb
3,1,Supreme Nike SB Dunk High By any Means Red US1...,Dunk,Produktlinie,dunk,,dunk
4,1,Supreme Nike SB Dunk High By any Means Red US1...,High,Schuhschaft-Typ,high,,high
5,1,Supreme Nike SB Dunk High By any Means Red US1...,By,Modell,by,,by
6,1,Supreme Nike SB Dunk High By any Means Red US1...,any,Modell,any,,any
7,1,Supreme Nike SB Dunk High By any Means Red US1...,Means,Modell,means,,means
8,1,Supreme Nike SB Dunk High By any Means Red US1...,Red,Farbe,color_token,Farbe,red
9,1,Supreme Nike SB Dunk High By any Means Red US1...,US10,US-Schuhgröße,size_token,us_size,us10


In [4]:
def tag_count(data):
    """
    Function to create a dictionary with tags as keys and the count as values
    """
    flatten_data = list(data["Tag"])
    tag_counts = {}

    for i in flatten_data:
        if i in tag_counts:
            tag_counts[i] += 1
        else:
            tag_counts[i] = 1

    tag_counts = {key: value for key, value in sorted(tag_counts.items())}
    return tag_counts

In [5]:
### we will be taking the first tag of every record number which is 4286 (loss of rows from data cleaning)


def inital_state(data):
    """
    Initial State Distribution matrix for prbability of a POS tag in the starting of a sentence
    """
    # data_subset = data.drop_duplicates(["Record Number"], ignore_index=True)
    initial_token = list(data["Tag"])
    initial_tag = {}
    for i in initial_token:
        if i in initial_tag:
            initial_tag[i] += 1
        else:
            initial_tag[i] = 1

    for key in initial_tag:
        initial_tag[key] = initial_tag[key] / len(initial_token)

    sorted_dict = {key: value for key, value in sorted(initial_tag.items())}

    pi = np.array(list(sorted_dict.values()))
    return pi

In [6]:
def transition_matrix(data):
    """
    This function creates transtion matrix (POS tag, POS tag)
    and has alpha value as 1 for smoothing
    """
    flatten_data = list(data["Tag"])
    transition_counts = {}

    for i in range(len(flatten_data) - 1):
        transition = (flatten_data[i], flatten_data[i + 1])

        # If the transition exists in the dictionary, increment its count
        if transition in transition_counts:
            transition_counts[transition] += 1
        else:
            # If the transition is not in the dictionary, add it with a count of 1
            transition_counts[transition] = 1

    tag_list_ordered = list(tag_count(data).keys())

    # 2D Matrix A with size as length of unique POS tags
    A = np.zeros((len(tag_list_ordered), len(tag_list_ordered)), dtype=float)

    for i in range(len(tag_list_ordered)):
        for j in range(len(tag_list_ordered)):
            count = 0

            key = (tag_list_ordered[i], tag_list_ordered[j])

            if key in transition_counts:
                count = transition_counts[key]

            A[i, j] = count

    A = A + 1

    A = A / A.sum(axis=1)[:, None]

    return A

In [7]:
def emission_matrix(data):
    """
    This function is for observation matrix (Vocab,POS tag)
    It creates a list with of vocabulary and adding "OOV"
    The alpha value is 1 for smoothing
    """
    flatten_data = list(zip(list(data["Token_Lower"]), list(data["Tag"])))
    vocab = []
    for i in flatten_data:
        vocab.append(i[0])

    unique_set = set(vocab)
    unique_set.add("OOV")

    unique_list = list(unique_set)

    vocab = {}
    for i, word in enumerate(unique_list):
        vocab[word] = i

    emission_counts = {}
    for i in range(len(flatten_data)):
        if flatten_data[i] in emission_counts:
            # If the item exists in the dictionary, increment its count
            emission_counts[flatten_data[i]] += 1
        else:
            # If the item is not in the dictionary, add it with a count of 1
            emission_counts[flatten_data[i]] = 1

    tag_list_ordered = list(tag_count(data).keys())

    B = np.zeros((len(tag_count(data)), len(unique_list)), dtype=float)

    for i in range(len(tag_count(data))):
        for j in range(len(unique_list)):
            count = 0

            key = (unique_list[j], tag_list_ordered[i])
            if key in emission_counts:
                count = emission_counts[key]

            B[i, j] = count

    B = B + 1

    B = B / B.sum(axis=1)[:, None]
    return B, unique_list

In [8]:
def viterbi(
    obs,
    pi,
    A,
    B,
):
    """Infer most likely state sequence using the Viterbi algorithm.

    Args:
        obs: An iterable of ints representing observations.
        pi: A 1D numpy array of floats representing initial state probabilities.
        A: A 2D numpy array of floats representing state transition probabilities.
        B: A 2D numpy array of floats representing emission probabilities.

    Returns:
        A tuple of:
        * A 1D numpy array of ints representing the most likely state sequence.
        * A float representing the probability of the most likely state sequence.
    """
    N = len(obs)
    Q, V = B.shape  # num_states, num_observations

    # d_{ti} = max prob of being in state i at step t
    #   AKA viterbi
    # \psi_{ti} = most likely state preceeding state i at step t
    #   AKA backpointer

    # initialization
    log_d = [np.log(pi) + np.log(B[:, obs[0]])]
    log_psi = [np.zeros((Q,))]

    # recursion
    for z in obs[1:]:
        log_da = np.expand_dims(log_d[-1], axis=1) + np.log(A)
        log_d.append(np.max(log_da, axis=0) + np.log(B[:, z]))
        log_psi.append(np.argmax(log_da, axis=0))

    # termination
    log_ps = np.max(log_d[-1])
    qs = [-1] * N
    qs[-1] = int(np.argmax(log_d[-1]))
    for i in range(N - 2, -1, -1):
        qs[i] = log_psi[i + 1][qs[i + 1]]

    return qs, np.exp(log_ps)

In [9]:
## using the training data to test the accuracy and sent_id is the record number
def test(sent_id, data):
    test_data = data.loc[data["Record Number"] == sent_id]

    # test = [tup for sent in test_data for tup in sent]
    test_words = list(test_data["Token"])
    test_tags = list(test_data["Tag"])

    obs = []
    vocab = emission_matrix(data)[1]
    for i in range(len(test_words)):
        if test_words[i] in vocab:
            obs.append(vocab.index(test_words[i]))
        else:
            obs.append(vocab.index("OOV"))

    return obs, test_tags

In [10]:
def accuracy(data, sent_id, seq):
    tags_test = test(sent_id, data)[1]
    obs_model = []
    tag_order = list(tag_count(data).keys())
    for i in seq:
        obs_model.append(tag_order[i])

    score = []
    print("\n")
    print("-------------------------------------------------------")
    print("Testing for Sentence Id : " + str(sent_id))
    print("\n")
    for i in range(len(obs_model)):
        print(f"True: {tags_test[i]} | Predicted: {obs_model[i]}")
        if tags_test[i] == obs_model[i]:
            score.append(1)
        else:
            score.append(0)

    accuracy_num = round(sum(score) / len(score) * 100, 3)
    print("The Accuracy of the model for this case is : " + str(accuracy_num) + "%")

    return accuracy_num

In [11]:
def store_values(value_list, value):
    value_list.append(value)


pi = inital_state(data)
print("The order of sequence in which POS tag is used :")
print(list(tag_count(data).keys()))

print("The Initial State distribution is ")
print(pi)

A = transition_matrix(data)
print("The Transition Matrix is")
print(transition_matrix(data))

B = emission_matrix(data)[0]
print("The Observation Matrix is")
print(emission_matrix(data)[0])

# Id for test sentences
test_sent_id = [*range(3000, 3003, 1)]

The order of sequence in which POS tag is used :
['Abteilung', 'Aktivität', 'Akzente', 'Anlass', 'Besonderheiten', 'Charakter', 'Charakter Familie', 'Dämpfungsgrad', 'EU-Schuhgröße', 'Erscheinungsjahr', 'Farbe', 'Futtermaterial', 'Gewebeart', 'Herstellernummer', 'Herstellungsland und -region', 'Innensohlenmaterial', 'Jahreszeit', 'Laufsohlenmaterial', 'Marke', 'Maßeinheit', 'Modell', 'Muster', 'No Tag', 'Obermaterial', 'Obscure', 'Produktart', 'Produktlinie', 'Schuhschaft-Typ', 'Schuhweite', 'Stil', 'Stollentyp', 'Thema', 'UK-Schuhgröße', 'US-Schuhgröße', 'Verschluss', 'Zwischensohlen-Typ']
The Initial State distribution is 
[6.84580528e-02 1.22601957e-02 2.15982721e-03 1.24295939e-02
 7.72879346e-03 4.44670309e-04 2.11747766e-04 6.77592851e-04
 4.84902384e-02 1.50340914e-03 7.93842375e-02 2.11747766e-04
 3.68441113e-03 4.95489773e-02 1.16461271e-03 4.65845085e-04
 8.46991064e-04 3.17621649e-04 9.72980985e-02 9.95214500e-04
 1.30584847e-01 1.60928302e-03 1.81891331e-01 1.54152374e-02
 

[[9.33881210e-05 9.33881210e-05 9.33881210e-05 ... 9.33881210e-05
  9.33881210e-05 9.33881210e-05]
 [1.24161907e-04 1.24161907e-04 1.24161907e-04 ... 1.24161907e-04
  1.24161907e-04 1.24161907e-04]
 [1.31978356e-04 1.31978356e-04 1.31978356e-04 ... 1.31978356e-04
  1.31978356e-04 1.31978356e-04]
 ...
 [1.25250501e-04 1.25250501e-04 1.25250501e-04 ... 1.25250501e-04
  1.25250501e-04 1.25250501e-04]
 [1.27534753e-04 1.27534753e-04 1.27534753e-04 ... 1.27534753e-04
  1.27534753e-04 1.27534753e-04]
 [1.32275132e-04 1.32275132e-04 1.32275132e-04 ... 1.32275132e-04
  1.32275132e-04 1.32275132e-04]]


In [12]:
# Testing for 3 sentences
stored_values = []
stored_loss = []
for i in test_sent_id:
    obs = test(i, data)[0]
    seq, model_prob = viterbi(obs, pi, A, B)
    model_accuracy = accuracy(data, i, seq)
    store_values(stored_values, model_accuracy)
    print("The Model probability for this case is : " + str(model_prob))

print("----------------------------------------------------------------")
print(
    "The Average Model Accuracy is "
    + str(round(sum(stored_values) / len(stored_values), 3))
    + "%"
)



-------------------------------------------------------
Testing for Sentence Id : 3000


True: No Tag | Predicted: No Tag
True: No Tag | Predicted: No Tag
True: Abteilung | Predicted: No Tag
True: Marke | Predicted: No Tag
True: No Tag | Predicted: No Tag
True: No Tag | Predicted: No Tag
The Accuracy of the model for this case is : 66.667%
The Model probability for this case is : 1.9400980390753678e-24


-------------------------------------------------------
Testing for Sentence Id : 3001


True: Marke | Predicted: Modell
True: Produktlinie | Predicted: Modell
True: Produktlinie | Predicted: Modell
True: Modell | Predicted: Modell
True: Modell | Predicted: Modell
True: Schuhschaft-Typ | Predicted: Modell
True: Abteilung | Predicted: Modell
True: Obermaterial | Predicted: Modell
True: Farbe | Predicted: Farbe
True: Farbe | Predicted: Farbe
True: US-Schuhgröße | Predicted: Farbe
True: EU-Schuhgröße | Predicted: Farbe
The Accuracy of the model for this case is : 33.333%
The Model proba