In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from queue import PriorityQueue
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from queue import LifoQueue

In [3]:
df = pd.read_excel('commentary.xlsx')
commentary = df['commentary'].tolist()
print(commentary[0])


['Dusan Tadic  - Southampton -  won a free kick on the left wing.', 'Missed chance. Dusan Tadic  - Southampton -  shot with left foot from the centre of the box missed to the left.', 'Dusan Tadic  - Southampton -  won a free kick in defence.', 'Fouled by Dusan Tadic  - Southampton', 'Offside - Southampton. Dusan Tadic with a pass, however Shane Long is in offside.', 'Missed chance. Dusan Tadic  - Southampton -  shot with left foot from the centre of the box missed. Assist -  Shane Long.', 'Missed chance. Dusan Tadic  - Southampton -  shot with left foot from outside the box is high and wide to the left after corner.', 'Fouled by Dusan Tadic  - Southampton', 'Dusan Tadic  - Southampton -  won a free kick on the left wing.', 'New attacking attempt. Charlie Austin  - Southampton -  shot with right foot from outside the box is saved by goalkeeper in the centre of the goal. Assist -  Dusan Tadic.']


In [20]:
tokens = [word_tokenize(comment) for comment in commentary]
# tokens = [word for word in word_tokenize(" ".join(commentary))]
tokens = [[word.replace("'", "") for word in comment] for comment in tokens]
punc = [',', '-', '.', "'", '[', ']', '', ' ', '(', ')', '!']
tokens = [[word for word in sentence if word not in punc and not word.isdigit()] for sentence in tokens]
print(tokens[0])


['Dusan', 'Tadic', 'Southampton', 'won', 'a', 'free', 'kick', 'on', 'the', 'left', 'wing', 'Missed', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'the', 'centre', 'of', 'the', 'box', 'missed', 'to', 'the', 'left', 'Dusan', 'Tadic', 'Southampton', 'won', 'a', 'free', 'kick', 'in', 'defence', 'Fouled', 'by', 'Dusan', 'Tadic', 'Southampton', 'Offside', 'Southampton', 'Dusan', 'Tadic', 'with', 'a', 'pass', 'however', 'Shane', 'Long', 'is', 'in', 'offside', 'Missed', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'the', 'centre', 'of', 'the', 'box', 'missed', 'Assist', 'Shane', 'Long', 'Missed', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'outside', 'the', 'box', 'is', 'high', 'and', 'wide', 'to', 'the', 'left', 'after', 'corner', 'Fouled', 'by', 'Dusan', 'Tadic', 'Southampton', 'Dusan', 'Tadic', 'Southampton', 'won', 'a', 'free', 'kick', 'on', 'the', 'left', 'wing', 'New', 'atta

In [21]:
# POS tagging
pos_tags = [pos_tag(word) for word in tokens]
count = 0 
for token, pos in pos_tags[2]:
    print(f"Token: {token}\tPOS Tag: {pos}")
    count += 1
    if count == 10:
        break


Token: Corner	POS Tag: NNP
Token: Leicester	POS Tag: NNP
Token: City	POS Tag: NNP
Token: Conceded	POS Tag: NNP
Token: by	POS Tag: IN
Token: Jan	POS Tag: NNP
Token: Bednarek	POS Tag: NNP
Token: Fouled	POS Tag: NNP
Token: by	POS Tag: IN
Token: Jan	POS Tag: NNP


In [22]:
# Named Entity Recognition
ner_tags = [ne_chunk(word) for word in pos_tags]
print(ner_tags[0])


(S
  (PERSON Dusan/NNP)
  (PERSON Tadic/NNP Southampton/NNP)
  won/VBD
  a/DT
  free/JJ
  kick/NN
  on/IN
  the/DT
  left/NN
  wing/NN
  Missed/VBD
  chance/NN
  (PERSON Dusan/NNP Tadic/NNP Southampton/NNP)
  shot/NN
  with/IN
  left/JJ
  foot/NN
  from/IN
  the/DT
  centre/NN
  of/IN
  the/DT
  box/NN
  missed/VBD
  to/TO
  the/DT
  left/VBN
  (PERSON Dusan/NNP Tadic/NNP Southampton/NNP)
  won/VBD
  a/DT
  free/JJ
  kick/NN
  in/IN
  defence/NN
  Fouled/VBN
  by/IN
  (PERSON
    Dusan/NNP
    Tadic/NNP
    Southampton/NNP
    Offside/NNP
    Southampton/NNP
    Dusan/NNP
    Tadic/NNP)
  with/IN
  a/DT
  pass/NN
  however/RB
  (PERSON Shane/NNP Long/NNP)
  is/VBZ
  in/IN
  offside/NN
  Missed/VBD
  chance/NN
  (PERSON Dusan/NNP Tadic/NNP Southampton/NNP)
  shot/NN
  with/IN
  left/JJ
  foot/NN
  from/IN
  the/DT
  centre/NN
  of/IN
  the/DT
  box/NN
  missed/VBD
  (PERSON Assist/NNP Shane/NNP Long/NNP)
  Missed/VBD
  chance/NN
  (PERSON Dusan/NNP Tadic/NNP Southampton/NNP)
  shot/NN
 

In [23]:
# Lemmatization


def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


wnl = WordNetLemmatizer()
lowercase_pos_tags = [[(token.lower(), pos) if pos != 'NNP' else (token, pos) for token, pos in pos_tag] for pos_tag in pos_tags]
union = []
for pos_tag in lowercase_pos_tags:
    lemmas_words = []
    for token, pos in pos_tag:
        if pos != 'NNP':
            wn_pos = get_wordnet_pos(pos)
            if wn_pos is not None:
                if token.lower() == 'pass':
                    lemmas_words.append(token)
                else: 
                    lemma = wnl.lemmatize(token, pos=wn_pos)
                    lemmas_words.append(lemma)
            else:
                lemmas_words.append(token)
        else:
            lemmas_words.append(token)
    union.append(lemmas_words)

print(union[0])


['Dusan', 'Tadic', 'Southampton', 'win', 'a', 'free', 'kick', 'on', 'the', 'left', 'wing', 'miss', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'the', 'centre', 'of', 'the', 'box', 'miss', 'to', 'the', 'leave', 'Dusan', 'Tadic', 'Southampton', 'win', 'a', 'free', 'kick', 'in', 'defence', 'foul', 'by', 'Dusan', 'Tadic', 'Southampton', 'Offside', 'Southampton', 'Dusan', 'Tadic', 'with', 'a', 'pass', 'however', 'Shane', 'Long', 'be', 'in', 'offside', 'miss', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'the', 'centre', 'of', 'the', 'box', 'miss', 'Assist', 'Shane', 'Long', 'miss', 'chance', 'Dusan', 'Tadic', 'Southampton', 'shot', 'with', 'left', 'foot', 'from', 'outside', 'the', 'box', 'be', 'high', 'and', 'wide', 'to', 'the', 'left', 'after', 'corner', 'foul', 'by', 'Dusan', 'Tadic', 'Southampton', 'Dusan', 'Tadic', 'Southampton', 'win', 'a', 'free', 'kick', 'on', 'the', 'left', 'wing', 'New', 'attack', 'attempt

In [24]:
# remove stop words
stop_word = stopwords.words('english')
stop_word.remove('won')
commentary_filtered = [[word for word in words if word not in stop_word] for words in union]
print(commentary_filtered[2])


['Corner', 'Leicester', 'City', 'Conceded', 'Jan', 'Bednarek', 'Fouled', 'Jan', 'Bednarek', 'Southampton']


In [25]:
# path = "D:\\google_model\\GoogleNews-vectors-negative300.bin.gz"
# model = KeyedVectors.load_word2vec_format(path, binary=True)


In [26]:
# create a Word2Vec model object and train 
model = Word2Vec(commentary_filtered, vector_size=100, window=5, min_count=1, workers=4)


In [27]:
unique_words = set()
for sublist in commentary_filtered:
    unique_words.update(sublist)
count = len(unique_words)
print("Unique words count:", count)
for word in unique_words:
    if word == 'win':
        print(word)


Unique words count: 982
win


In [34]:
word_dict = {i: word for i, word in enumerate(unique_words)}
reversed_word_dict = {value: key for key, value in word_dict.items()}
print(word_dict[0])


second


In [29]:
# class Edge:
#     def __init__(self, to, weight):
#         self.to = to
#         self.weight = weight
#         self.next = None


# class Graph:
#     def __init__(self, num_vertices):
#         self.num_vertices = num_vertices
#         self.adj_list = [None] * num_vertices

#     def add_edge(self, u, v, weight):
#         # 添加边 u -> v
#         edge = Edge(v, weight)
#         edge.next = self.adj_list[u]
#         self.adj_list[u] = edge

#     def print_graph(self):
#         for i in range(self.num_vertices):
#             print("Vertex", i)
#             edge = self.adj_list[i]
#             while edge:
#                 print(" ->", edge.to, "(Weight:", edge.weight, ")")
#                 edge = edge.next

#     def print_node(self,i):
#         print("Vertex", i)
#         edge = self.adj_list[i]
#         while edge:
#             print(" ->", edge.to, "(Weight:", edge.weight, ")")
#             edge = edge.next


# num_vertices = count
# graph = Graph(num_vertices)


In [30]:
# for word in word_dict:
#     similarity = model.wv.most_similar(word,topn= 10 )
#     for v,dis in similarity:
#         graph.add_edge(word_dict[word],word_dict[v],dis)

# graph.print_node(0)

In [31]:


class Graph:
    def __init__(self, count, word_dict, model):
        self.count = count
        # self.count = 6
        self.word_dict = word_dict
        self.model = model
        self.graph = [[[] for _ in range(self.count)] for _ in range(self.count)]
        self.pre = [[] for _ in range(self.count)]
        self.dist = [0x3fffff for _ in range(self.count)]
        self.path = []
        self.temp_path = []

    def create_graph(self):
        for node1 in range(self.count):
            word1 = self.word_dict[node1]
            for node2 in range(self.count):
                word2 = self.word_dict[node2]
                self.graph[node1][node2] = self.get_dist(word1, word2)
                self.graph[node2][node1] = self.graph[node1][node2]

        # self.graph[9][3] = 80
        # self.graph[3][9] = 80
        # self.graph = [
        #     [0, 2, 4, -1, -1, -1],
        #     [2, 0, 1, 4, 2, -1],
        #     [4, 1, 0, -1, 3, -1],
        #     [-1, 4, -1, 0, 3, 2],
        #     [-1, 2, 3, 3, 0, 2],
        #     [-1, -1, -1, 2, 2, 0]
        # ]
    
    def get_dist(self, word1, word2):
        v1 = self.model.wv[word1]
        v2 = self.model.wv[word2]
        distance = np.sqrt(np.sum((v1 - v2) ** 2))
        distance = distance ** 2   
        return distance
    
    def dijkstra(self, s, t):
        self.dist[s] = 0
        q = PriorityQueue()
        q.put((0, s))
        while not q.empty():
            dis, n = q.get()
            if dis > self.dist[n]:
                continue
            for i in range(self.count):
                if self.graph[i][n] == -1:
                    continue
                new_dist = self.dist[n] + self.graph[i][n]
                if new_dist < self.dist[i]:
                    self.dist[i] = float(new_dist)
                    if n != i:
                        self.pre[i] = [n]
                    q.put((float(self.dist[i]), i))
                elif new_dist == self.dist[i]:
                    if n != i:
                        self.pre[i].append(n)
        return self.dist[t], self.pre

    def record_shortest_path(self, start, target):
        stack = LifoQueue()
        stack.put((start, [start]))

        while not stack.empty():
            node, cur_path = stack.get()

            if node == target:
                self.path.append(cur_path)
            for child in self.pre[node]:
                    stack.put((child, cur_path + [child]))


In [32]:
graph = Graph(count = count, word_dict = word_dict, model= model)
graph.create_graph()


def enter_word(flag):
    while True:
        if flag == 0:
            str = input("Please enter a term: ")
        else:
            str = input("Please enter another term:")
        if str in reversed_word_dict:
            return reversed_word_dict[str]
        else:
            print("The word is not in the dataset. Please enter again.")

source = enter_word(0)
target = enter_word(1)
# user_input_s = input("please input a term: ")
# user_input_t = input("Please input another term: ")
# source = reversed_word_dict[user_input_s]
# target = reversed_word_dict[user_input_t]
result = graph.dijkstra(source, target)
# result = graph.dijkstra(9,3)
print(result[0])
graph.record_shortest_path(start= target, target= source)
print(graph.path)
tran_word = graph.path


84.45010253944736
[[425, 768, 126, 380, 813, 914, 682, 645, 258]]


In [33]:
res_word = [[word_dict[num] for num in sub_path]for sub_path in tran_word]
reversed_list = []
for sublist in res_word:
    reversed_sublist = sublist[::-1]
    reversed_list.append(reversed_sublist)

for sublist in reversed_list:
    output = " -> ".join(sublist)
    print(output)


goal -> hit -> bar -> distance -> opportunity -> Sung-yueng -> Bony -> attacking -> kick
