In [1]:
import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

nltk.download("stopwords")

from nltk.corpus import stopwords

eng_stopwords = stopwords.words("english")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /home/p1dg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/p1dg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import zipfile

zip_file_path = "archive (1).zip"

extract_path = "datasets"

with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

In [3]:
import numpy as np
import pandas as pd

# raw_data = pd.read_csv("./datasets/All-seasons.csv")
raw_data = pd.read_csv("./datasets/Game_of_Thrones_Script.csv")
raw_data = raw_data.rename(columns={"Name": "Character", "Sentence": "Line"})

raw_data

Unnamed: 0,Release Date,Season,Episode,Episode Title,Character,Line
0,2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,What do you expect? They're savages. One lot s...
1,2011-04-17,Season 1,Episode 1,Winter is Coming,will,I've never seen wildlings do a thing like this...
2,2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,How close did you get?
3,2011-04-17,Season 1,Episode 1,Winter is Coming,will,Close as any man would.
4,2011-04-17,Season 1,Episode 1,Winter is Coming,gared,We should head back to the wall.
...,...,...,...,...,...,...
23906,2019-05-19,Season 8,Episode 6,The Iron Throne,brienne,I think we can all agree that ships take prece...
23907,2019-05-19,Season 8,Episode 6,The Iron Throne,bronn,I think that's a very presumptuous statement.
23908,2019-05-19,Season 8,Episode 6,The Iron Throne,tyrion lannister,I once brought a jackass and a honeycomb into ...
23909,2019-05-19,Season 8,Episode 6,The Iron Throne,man,The Queen in the North!


In [4]:
# удаляем не гланых героев
replica_min_num = 100
main_characters = (
    raw_data["Character"]
    .value_counts()[raw_data["Character"].value_counts() > replica_min_num]
    .index
)

raw_data = raw_data[raw_data["Character"].isin(main_characters)]

raw_data

Unnamed: 0,Release Date,Season,Episode,Episode Title,Character,Line
15,2011-04-17,Season 1,Episode 1,Winter is Coming,jon snow,Go on. Father's watching.
16,2011-04-17,Season 1,Episode 1,Winter is Coming,jon snow,And your mother.
18,2011-04-17,Season 1,Episode 1,Winter is Coming,sansa stark,Thank you.
20,2011-04-17,Season 1,Episode 1,Winter is Coming,eddard stark,And which one of you was a marksman at ten? Ke...
21,2011-04-17,Season 1,Episode 1,Winter is Coming,jon snow,"Don't think too much, Bran."
...,...,...,...,...,...,...
23905,2019-05-19,Season 8,Episode 6,The Iron Throne,bronn,"Well, I imagine he isn't using them properly."
23906,2019-05-19,Season 8,Episode 6,The Iron Throne,brienne,I think we can all agree that ships take prece...
23907,2019-05-19,Season 8,Episode 6,The Iron Throne,bronn,I think that's a very presumptuous statement.
23908,2019-05-19,Season 8,Episode 6,The Iron Throne,tyrion lannister,I once brought a jackass and a honeycomb into ...


In [5]:
raw_data["repeat"] = raw_data["Character"].shift(-1)
raw_data = raw_data[raw_data["Character"] != raw_data["repeat"]]
raw_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data["repeat"] = raw_data["Character"].shift(-1)


Unnamed: 0,Release Date,Season,Episode,Episode Title,Character,Line,repeat
16,2011-04-17,Season 1,Episode 1,Winter is Coming,jon snow,And your mother.,sansa stark
18,2011-04-17,Season 1,Episode 1,Winter is Coming,sansa stark,Thank you.,eddard stark
20,2011-04-17,Season 1,Episode 1,Winter is Coming,eddard stark,And which one of you was a marksman at ten? Ke...,jon snow
21,2011-04-17,Season 1,Episode 1,Winter is Coming,jon snow,"Don't think too much, Bran.",robb stark
22,2011-04-17,Season 1,Episode 1,Winter is Coming,robb stark,Relax your bow arm.,eddard stark
...,...,...,...,...,...,...,...
23905,2019-05-19,Season 8,Episode 6,The Iron Throne,bronn,"Well, I imagine he isn't using them properly.",brienne
23906,2019-05-19,Season 8,Episode 6,The Iron Throne,brienne,I think we can all agree that ships take prece...,bronn
23907,2019-05-19,Season 8,Episode 6,The Iron Throne,bronn,I think that's a very presumptuous statement.,tyrion lannister
23908,2019-05-19,Season 8,Episode 6,The Iron Throne,tyrion lannister,I once brought a jackass and a honeycomb into ...,man


In [6]:
raw_data["Line"] = raw_data["Line"].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data["Line"] = raw_data["Line"].apply(str)


In [7]:
tokenized_sentences = raw_data["Line"].apply(wordpunct_tokenize)
len(tokenized_sentences)
# vectorizer
vectorizer = TfidfVectorizer(
    stop_words=eng_stopwords,
    ngram_range=(1, 3),
    max_features=5024,
    tokenizer=wordpunct_tokenize,
)

In [8]:
matrix_tfidf = vectorizer.fit_transform(raw_data["Character"])
print(matrix_tfidf.shape)

(13288, 92)




In [9]:
class SimpleSearchEngine:
    def __init__(self, text_database: list[str], top_k: int):
        self.raw_procesed_data = [self.preprocess(sample) for sample in text_database]
        self.base = []
        self.retriever = None
        self.inverted_index = {}
        self._init_retriever(text_database)
        self._init_inverted_index(text_database)
        self.top_k = top_k

    @staticmethod
    def preprocess(sentence: str) -> str:
        return sentence

    def _init_retriever(self, text_database: list[str]):
        """
        TfidfVectorizer is used to convert a collection of raw documents into a
        matrix of TF-IDF features.
        Use fit_transform method of TfidfVectorizer to learn the vocabulary and
        idf from the training set and the transformed matrix.
        """
        self.retriever = TfidfVectorizer(
            stop_words=eng_stopwords,
            ngram_range=(2, 5),
            max_features=5024,
            tokenizer=wordpunct_tokenize,
        )

        self.base = self.retriever.fit_transform(text_database)  # train retriever

    def retrieve(self, query: str) -> np.array:
        return self.retriever.transform([query])

    def retrieve_documents(self, query: str, out="best") -> np.array:
        """
        The query needs to be transformed into the same vector space as your
        document base.
        Utilize cosine_similarity to compute the similarity between the query
        vector and all document vectors in the base.
        Remember that cosine_similarity returns a matrix; you might need to
        flatten it to get a 1D array of similarity scores.
        Sort the documents based on their cosine similarity scores to find k
        most relevant ones to the query and return them as answer.
        """
        query_vector = self.retrieve(query)
        cosine_similarities = cosine_similarity(query_vector, self.base).flatten()
        if out == "best":
            relevant_indices = np.argsort(cosine_similarities, axis=0)[::-1][
                : self.top_k
            ]
        elif out == "bad":
            relevant_indices = np.argsort(cosine_similarities, axis=0)[::-1][
                self.top_k :
            ]
            relevant_indices = np.random.choice(
                relevant_indices, self.top_k, replace=False
            )
        return relevant_indices

    def _init_inverted_index(self, text_database: list[str]):
        self.inverted_index = dict(enumerate(text_database))

    def display_relevant_docs(self, query: str, full_database, out="best") -> list[str]:
        docs_indexes = self.retrieve_documents(query, out=out)
        return [self.inverted_index[ind].replace("\n", "") for ind in docs_indexes]

In [10]:
simple_search_engine = SimpleSearchEngine(raw_data["Line"], 3)
query = "What is the best rest?"

best_results = simple_search_engine.display_relevant_docs(
    query, raw_data["Character"], "best"
)
print(f"best_results {best_results}")

bad_results = simple_search_engine.display_relevant_docs(
    query, raw_data["Character"], "bad"
)
print(f"bad_results {bad_results}")

best_results ['And the rest of them?', "There's only one ship. Where are the rest of them? Where are the rest of them?", 'As long as it pleases me. Do you want to see the rest?']
bad_results ["The only way out is through the gates. And they're at the gates.", "Robb's in the Riverlands. My sister's in Deepwood Motte. She'll get here long before they do. And Ned Stark always said 500 men could hold Winterfell again 10,000.", 'The Lord Commander, , we are sworn to protect -']


In [11]:
def window_back(id, win):
    if (id - win) < 0:
        win = id
    return win


def window(id, win, top):
    if (id + win) > top:
        win = id
    return win


CON_WIND = 5  # context_window
CLOSE_REP = 4  # close_reply
TOP_K = 3  # top_k

DF_OUT = raw_data[["Season", "Episode"]]

simple_search_engine = SimpleSearchEngine(raw_data["Line"], TOP_K)



In [13]:
# speaker - тот кто говорит первый
DF_OUT["speaker"] = raw_data["Character"].shift(1)

# Что говорит speaker
DF_OUT["query"] = raw_data["Line"].shift(1)

# Собеседники
DF_OUT["companions"] = [
    raw_data.iloc[id + 1 : id + 1 + window(id, CLOSE_REP, raw_data.shape[0])][
        "Character"
    ].to_list()
    for id in raw_data.index
]

# Ближайшие реплики собеседников
DF_OUT["close_reply"] = [
    raw_data.iloc[id + 1 : id + 1 + window(id, CLOSE_REP, raw_data.shape[0])][
        "Line"
    ].to_list()
    for id in raw_data.index
]

# реплики подобранные на Tf-Idf
DF_OUT["neutral_reply"] = raw_data["Line"].apply(
    lambda query: simple_search_engine.display_relevant_docs(
        query, raw_data["Character"], "best"
    )
)

# плохие реплики подобранные на Tf-Idf
DF_OUT["bad_reply"] = raw_data["Line"].apply(
    lambda query: simple_search_engine.display_relevant_docs(
        query, raw_data["Character"], "bad"
    )
)

# контекст прошлых реплик разговора
DF_OUT["context"] = [
    ". ".join(raw_data.iloc[id - window_back(id, CON_WIND) : id]["Line"].to_list())
    for id in raw_data.index
]

DF_OUT = DF_OUT.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF_OUT["speaker"] = raw_data["Character"].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF_OUT["query"] = raw_data["Line"].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF_OUT["companions"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [14]:
file_name = "talks_dataset.df"
DF_OUT.to_pickle(file_name)

In [15]:
DF_OUT

Unnamed: 0,Season,Episode,speaker,query,companions,close_reply,neutral_reply,bad_reply,context
18,Season 1,Episode 1,jon snow,And your mother.,"[eddard stark, bran stark, eddard stark, bran ...","[But do you understand why I had to kill him?,...","[Thank you., Thank you., Thank you.]","[We held them off., And how many lives have yo...",King of the Andals and the First Men …. Father...
20,Season 1,Episode 1,sansa stark,Thank you.,"[eddard stark, bran stark, eddard stark, bran ...",[The man who passes the sentence should swing ...,[And which one of you was a marksman at ten? K...,[What gods are those? The trees your husband p...,Lord of the Seven Kingdoms and protector of th...
21,Season 1,Episode 1,eddard stark,And which one of you was a marksman at ten? Ke...,"[bran stark, eddard stark, bran stark, eddard ...","[Is it true he saw the White Walkers?, The Whi...","[Don't think too much, Bran., I have to go bac...",[Council business. We all have so much to disc...,You did well.. You understand why I did it?. J...
22,Season 1,Episode 1,jon snow,"Don't think too much, Bran.","[eddard stark, bran stark, eddard stark, jon s...",[The White Walkers have been gone for thousand...,"[Relax your bow arm., Relax your bow arm., You...","[So, the fleet It's been burned, Five., There ...",You understand why I did it?. Jon said he was ...
25,Season 1,Episode 1,robb stark,Relax your bow arm.,"[jon snow, theon greyjoy, eddard stark, theon ...","[What is it?, Mountain lion?, There are no mou...","[You've completely ruined horses for me., Get ...","[No. No, no, no. Not very well., An alliance m...",Our way is the old way?. The man who passes th...
...,...,...,...,...,...,...,...,...,...
23905,Season 8,Episode 6,sam,Uh... the Archmaester is less than enthusiasti...,[],[],"[Well, I imagine he isn't using them properly....","[It's going to be a very dull walk., You been ...",
23906,Season 8,Episode 6,bronn,"Well, I imagine he isn't using them properly.",[],[],[I think we can all agree that ships take prec...,"[Ah. That's a pity., I'll tell you what doesn'...",
23907,Season 8,Episode 6,brienne,I think we can all agree that ships take prece...,[],[],[I think that's a very presumptuous statement....,"[To discuss an armistice., You're torturing th...",
23908,Season 8,Episode 6,bronn,I think that's a very presumptuous statement.,[],[],[I once brought a jackass and a honeycomb into...,"[For which you blame me., Perhaps they should ...",
