In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import spacy
import re

from concurrent.futures import ThreadPoolExecutor # klasa przydatna do multi-threadingu

In [3]:
# Load the small English spaCy pipeline with the NER and parser components
# switched off.
# NOTE(review): Doc.similarity is used further down; spaCy documents that the
# "sm" models ship without word vectors, so the scores may be less reliable
# than with a larger model -- confirm this is acceptable.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Both CSV files use their first column as the row index.
df_timestamps = pd.read_csv('with_timestamps.csv', index_col=0)
df_who_said = pd.read_csv('who_said_splitted.csv', index_col=0)

# Preview the first two rows of each frame, separated by rules, then report
# both shapes.
print(
    df_timestamps.head(2),
    '=' * 80,
    df_who_said.head(2),
    '=' * 50,
    sep='\n',
)
print(df_timestamps.shape, df_who_said.shape)

                                        line         start           end
0       Report to your stations immediately.  00:00:42,800  00:00:44,802
1  This is not a drill. We are under attack!  00:00:44,960  00:00:47,721
     speaker                                 line
0  Announcer  Report to your stations immediately
1  Announcer                  This is not a drill
(1893, 3) (1847, 2)


### Usuwanie interpunkcji, żeby zwiększyć poziom similarity

In [None]:
def remove_punctuation(line: str):
    """Return ``line`` lower-cased and reduced to its alphanumeric tokens.

    Punctuation is removed because it has a strong effect on spaCy's
    similarity score.

    The text is tokenised with the module-level ``nlp`` pipeline; every token
    whose text is not purely alphanumeric is dropped, so besides punctuation
    this also removes pieces such as the "'re" that spaCy splits off
    contractions. The remaining tokens are joined with single spaces.

    Anything that is not a ``str`` (for example a missing value) yields an
    empty string.
    """
    if not isinstance(line, str):
        return ''

    return ' '.join(
        token.text.lower() for token in nlp(line) if token.text.isalnum()
    )


# Run remove_punctuation over the whole "line" column in one go and store the
# result in a new "with_no_punctuation" column.
df_timestamps['with_no_punctuation'] = df_timestamps['line'].apply(remove_punctuation)
print(df_timestamps.loc[:, 'with_no_punctuation'])

0           report to your stations immediately
1       this is not a drill we are under attack
2                           we are under attack
3                                          shit
4                                      language
                         ...                   
1888                  they good they not a team
1889                   let beat them into shape
1890                                   avengers
1891                                       fine
1892                             i do it myself
Name: with_no_punctuation, Length: 1893, dtype: object


In [2]:
def map_to_category(similarity: float):
    """Translate a similarity score into one of four text buckets.

    The buckets are mapped to labels because a categorical column is more
    convenient to work with than a plain object column.

    Args:
        similarity: Similarity score, normally in the range 0.0 .. 1.0.

    Returns:
        ``'100%'`` for scores of 1.0 or more, ``'90% - 99%'`` for
        0.9 <= score < 1.0, ``'80% - 89%'`` for 0.8 <= score < 0.9 and
        ``'< 80%'`` for everything else (including NaN).
    """
    # ">=" instead of "==": a score that overshoots 1.0 through floating-point
    # or rounding error is a perfect match, not a "< 80%" one.
    if similarity >= 1.0:
        return '100%'
    if similarity >= 0.9:
        return '90% - 99%'
    if similarity >= 0.8:
        return '80% - 89%'
    return '< 80%'

In [None]:
def match_lines():
    """Attach the most similar transcript line and its speaker to every subtitle row.

    For each row ``i`` of the module-level ``df_timestamps`` the rows of
    ``df_who_said`` at positions ``i - window .. i + window`` are scored with
    spaCy's ``Doc.similarity`` against the row's ``with_no_punctuation`` text.
    The best candidate is written back to ``df_timestamps`` in the columns
    ``who_said``, ``similarity`` (rounded to two decimals),
    ``similarity_category`` and ``similar_line``. On ties the later candidate
    wins.

    The function works in place and returns ``None``. Rows whose window is
    empty, or whose candidates all score below zero, are left untouched.

    The labels produced by ``df_timestamps.iterrows()`` are used as positions
    in ``df_who_said``, so both frames are assumed to carry a default
    0..n-1 index -- TODO confirm.
    """
    # Normalise and parse every transcript line once, instead of once for every
    # (subtitle, candidate) pair; list position j corresponds to iloc[j].
    candidate_docs = [nlp(remove_punctuation(text)) for text in df_who_said['line']]
    candidate_count = len(candidate_docs)

    for i, row in df_timestamps.iterrows():
        # Index range in which similar sentences (and thus the speaker) are
        # searched for. The two frames drift apart over time, so the window
        # is widened further into the data.
        if i > 1600:
            comparison_range = 60
        elif i > 600:
            comparison_range = 30
        else:
            comparison_range = 20

        # spaCy Doc for the subtitle line.
        time_line = nlp(row['with_no_punctuation'])

        # Clamp the lower bound at 0. The previous fallback used `i` itself,
        # which meant the first rows never looked at any earlier candidate.
        bottom_limit = max(i - comparison_range, 0)
        top_limit = min(i + 1 + comparison_range, candidate_count)

        best_position = None
        highest_similarity = 0
        for j in range(bottom_limit, top_limit):
            similarity = time_line.similarity(candidate_docs[j])
            # ">=" so that, among equally good candidates, the later one wins.
            if similarity >= highest_similarity:
                best_position = j
                highest_similarity = similarity

        if best_position is None:
            continue

        # Read line and speaker from the same positional row so they cannot
        # come from two different records.
        best_row = df_who_said.iloc[best_position]
        rounded = np.round(highest_similarity, 2)
        df_timestamps.at[i, 'who_said'] = best_row['speaker']
        df_timestamps.at[i, 'similarity'] = rounded
        df_timestamps.at[i, 'similarity_category'] = map_to_category(rounded)
        df_timestamps.at[i, 'similar_line'] = best_row['line']
                
                
# Run the matching on a worker thread. Only one task is submitted, so a
# single worker executes it; the pool does not parallelise the work itself.
with ThreadPoolExecutor(max_workers=4) as executor:
    future = executor.submit(match_lines)
    # Without result() an exception raised inside match_lines() stays hidden
    # in the Future and the cell appears to succeed with nothing matched.
    future.result()

In [None]:
# Convert the column's dtype from object to an ordered categorical.
categories = pd.CategoricalDtype(
    categories=['< 80%', '80% - 89%', '90% - 99%', '100%'],
    ordered=True,
)
df_timestamps['similarity_category'] = df_timestamps['similarity_category'].astype(categories)

# For convenience and looks, reorder the columns and store the selection in a
# new DataFrame, then write it to disk.
df = df_timestamps.loc[:, [
    'start', 'end', 'who_said', 'line', 'similar_line', 'similarity', 'similarity_category',
]]
df.to_csv('merged.csv')

In [None]:
# Share of rows that fall into each similarity bucket.
print(df['similarity_category'].value_counts(normalize=True))

100%         0.451664
90% - 99%    0.187005
80% - 89%    0.185420
< 80%        0.175911
Name: similarity_category, dtype: float64


<hr>

### Następnym krokiem będzie polepszenie niektórych wierszy

In [3]:
# Reload the stage-2 result and the speaker transcript it is matched against.
df = pd.read_csv('merged.csv', index_col=0) # this is what stage 2 ended with
compared_df = pd.read_csv('who_said_splitted.csv', index_col=0) # we compare against the same .csv again
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%
...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%


In [4]:
# Keep a copy of the current match so it is still available after the
# re-matching loop further down overwrites "similar_line".
df['previously_similar'] = df['similar_line']
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category,previously_similar
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%,Report to your stations immediately
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%,This is not a drill
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%,We are under attack
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%,Shit
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%,Language
...,...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%,They're not a team
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%,Let's beat 'em into shape
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%,Avengers
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%,"Fine, I'll do it myself"


In [5]:
# Find all anchors; missing matches are searched for between them.
# Provisionally, an anchor is every row whose similarity is at least 0.95.
df['is_anchor'] = df['similarity'].ge(0.95)
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category,previously_similar,is_anchor
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%,Report to your stations immediately,True
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%,This is not a drill,False
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%,We are under attack,True
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%,Shit,True
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%,Language,True
...,...,...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%,They're not a team,True
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%,Let's beat 'em into shape,True
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%,Avengers,True
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%,"Fine, I'll do it myself",False


In [6]:
# Select the rows for which a better match is searched for.
# Provisionally, these are the rows whose similarity is below 0.75.
df['to_replace'] = df['similarity'].lt(0.75)
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category,previously_similar,is_anchor,to_replace
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%,Report to your stations immediately,True,False
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%,This is not a drill,False,False
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%,We are under attack,True,False
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%,Shit,True,False
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%,Language,True,False
...,...,...,...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%,They're not a team,True,False
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%,Let's beat 'em into shape,True,False
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%,Avengers,True,False
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%,"Fine, I'll do it myself",False,True


In [7]:
# Same model and disabled components as the spacy.load call near the top of
# the notebook.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def remove_punctuation(line: str):
    """Return ``line`` lower-cased and reduced to its alphanumeric tokens.

    The text is tokenised with the module-level ``nlp`` pipeline; every token
    whose text is not purely alphanumeric is dropped and the rest are joined
    with single spaces. Anything that is not a ``str`` (for example a
    missing value) yields an empty string.

    This is the same definition as the one earlier in the notebook.
    """
    if not isinstance(line, str):
        return ''

    return ' '.join(
        token.text.lower() for token in nlp(line) if token.text.isalnum()
    )


# Run remove_punctuation over the whole "line" column of both frames and store
# the result in a new "with_no_punctuation" column of each.
for frame in (df, compared_df):
    frame['with_no_punctuation'] = frame['line'].apply(remove_punctuation)

In [8]:
# Index labels of every row flagged as an anchor, as a NumPy array.
anchor_indexes = df.loc[df['is_anchor']].index.to_numpy()

def find_top_anchor(idx: int) -> int:
    """Return the nearest anchor at or before ``idx``.

    Walks from ``idx`` down to 0 and returns the first value found in the
    module-level ``anchor_indexes``.

    Args:
        idx: Row index to start searching from.

    Returns:
        The anchor index, or ``idx - 20`` clamped to 0 when there is no
        anchor at or before ``idx``.
    """
    for candidate in range(idx, -1, -1):
        if candidate in anchor_indexes:
            return candidate
    # Never return a negative bound: it is used as the start of an ``iloc``
    # slice, where a negative value counts from the end of the frame.
    return max(idx - 20, 0)


def find_bot_anchor(idx: int) -> int:
    """Return the nearest anchor at or after ``idx``.

    Scans upwards from ``idx`` through ``df.shape[0]`` (inclusive) and returns
    the first value found in the module-level ``anchor_indexes``. When the
    scan finds nothing, ``idx + 20`` is returned instead.
    """
    for candidate in range(idx, df.shape[0] + 1):
        if candidate in anchor_indexes:
            return candidate
    return idx + 20
    

# Re-match every row flagged in "to_replace", looking only at the candidates
# that lie between the nearest anchors around it.
for i, row in df.iterrows():
    if not row['to_replace']:
        continue

    # Parse only the rows that are actually re-matched.
    time_line = nlp(row['with_no_punctuation'])

    # The search window.
    # NOTE(review): the anchors are row labels of `df`, but they slice
    # `compared_df` by position. The two frames have different lengths, so
    # the window may be shifted, or empty near the end -- confirm this is
    # acceptable.
    top_anchor = find_top_anchor(i)
    bottom_anchor = find_bot_anchor(i)
    subset = compared_df.iloc[top_anchor:bottom_anchor]

    best_candidate = None
    highest_similarity = 0
    for j, r in subset.iterrows():
        compared_line = nlp(r['with_no_punctuation'])
        similarity = time_line.similarity(compared_line)
        # ">=" so that, among equally good candidates, the later one wins.
        if similarity >= highest_similarity:
            best_candidate = r
            highest_similarity = similarity

    if best_candidate is None:
        # Empty window (or only negative scores): keep the existing match.
        continue

    # Compare against `row` (label based) rather than df.iloc[i], which
    # treated the label as a position.
    # NOTE(review): the flag says whether the chosen candidate's normalised
    # text differs from the subtitle's, not whether it differs from
    # "previously_similar" -- confirm this is the intended meaning.
    if row['with_no_punctuation'] == best_candidate['with_no_punctuation']:
        df.at[i, 'was_replaced'] = None # in theory this should be False, but None keeps the later mapping short
    else:
        df.at[i, 'was_replaced'] = True

    rounded = np.round(highest_similarity, 2)
    df.at[i, 'who_said'] = best_candidate['speaker']
    df.at[i, 'similarity'] = rounded
    df.at[i, 'similarity_category'] = map_to_category(rounded)
    df.at[i, 'similar_line'] = best_candidate['line']

  similarity = time_line.similarity(compared_line)
  similarity = time_line.similarity(compared_line)
  similarity = time_line.similarity(compared_line)


In [9]:
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category,previously_similar,is_anchor,to_replace,with_no_punctuation,was_replaced
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%,Report to your stations immediately,True,False,report to your stations immediately,
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%,This is not a drill,False,False,this is not a drill we are under attack,
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%,We are under attack,True,False,we are under attack,
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%,Shit,True,False,shit,
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%,Language,True,False,language,
...,...,...,...,...,...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%,They're not a team,True,False,they good they not a team,
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%,Let's beat 'em into shape,True,False,let beat them into shape,
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%,Avengers,True,False,avengers,
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%,"Fine, I'll do it myself",False,True,fine,


In [10]:
def mapping(x):
    """Normalise a ``was_replaced`` cell to a plain boolean.

    Args:
        x: Cell value; may be a ``bool``, ``None``, NaN or anything else.

    Returns:
        ``True`` only when ``x`` is the boolean ``True``; ``False`` for a
        boolean ``False`` and for every non-boolean value.
    """
    # Checking the type alone reported a literal False as True.
    return isinstance(x, bool) and x
# Pass every "was_replaced" cell through mapping() so the column holds only
# True/False values, then display the frame.
df['was_replaced'] = df['was_replaced'].apply(mapping)
df

Unnamed: 0,start,end,who_said,line,similar_line,similarity,similarity_category,previously_similar,is_anchor,to_replace,with_no_punctuation,was_replaced
0,"00:00:42,800","00:00:44,802",Announcer,Report to your stations immediately.,Report to your stations immediately,1.00,100%,Report to your stations immediately,True,False,report to your stations immediately,False
1,"00:00:44,960","00:00:47,721",Announcer,This is not a drill. We are under attack!,This is not a drill,0.90,90% - 99%,This is not a drill,False,False,this is not a drill we are under attack,False
2,"00:00:49,120","00:00:51,088",Announcer,We are under attack!,We are under attack,1.00,100%,We are under attack,True,False,we are under attack,False
3,"00:01:52,200","00:01:54,487",Tony Stark,Shit!,Shit,1.00,100%,Shit,True,False,shit,False
4,"00:01:52,200","00:01:54,487",Steve Rogers,Language!,Language,1.00,100%,Language,True,False,language,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1888,"02:10:42,400","02:10:44,801",Steve Rogers,They're good. They're not a team.,They're not a team,0.97,90% - 99%,They're not a team,True,False,they good they not a team,False
1889,"02:10:44,960","02:10:46,610",Natasha Romanoff,Let's beat them into shape.,Let's beat 'em into shape,0.96,90% - 99%,Let's beat 'em into shape,True,False,let beat them into shape,False
1890,"02:11:05,080","02:11:06,605",Steve Rogers,Avengers...,Avengers,1.00,100%,Avengers,True,False,avengers,False
1891,"02:13:25,440","02:13:26,885",Thanos,Fine.,"Fine, I'll do it myself",0.46,< 80%,"Fine, I'll do it myself",False,True,fine,False


In [23]:
# Keep only the result columns and write them to disk.
output = df.loc[:, [
    'start', 'end', 'who_said', 'line', 'similar_line', 'similarity', 'similarity_category',
]]
output.to_csv('mergeAfterAnchoring.csv')

In [19]:
# Rows flagged as replaced, reduced to the columns needed to review the change.
replaced = df.loc[
    df['was_replaced'] == True,
    ['who_said', 'line', 'similar_line', 'previously_similar'],
].copy()

In [21]:
replaced

Unnamed: 0,who_said,line,similar_line,previously_similar
45,Iron Legion,Costel!,We wish to avoid collateral damage and will in...,"Stark, we really need to get inside"
48,Iron Legion,"Avengers, go home!",We are here to help,You didn't see that coming
287,Tony Stark,designed by Mr Stark.,"Thanks, buddy","You are Ultron, a global peacekeeping initiati..."
296,JARVIS,Mr Stark?,Stark,Stark
316,Ultron,May I...,they can't mean,I am contacting Mr
...,...,...,...,...
1781,Natasha Romanoff,Zrinka!,Right,They're good
1782,Nick Fury,Costel! Baby!,"But with Stark's stealth tech, we still can't ...",And you're gonna miss me
1784,Natasha Romanoff,Fine.,Right,I'm fine
1817,Tony Stark,Well...,Triple Yahtzee,"Well, the Vision's artificial intelligence"


In [22]:
# Print a small report for every row of `replaced`: speaker, subtitle line,
# the new match ("Na podstawie" = "Based on") and the earlier match
# ("Wcześniejsza opcja" = "Previous option").
for i, row in replaced.iterrows():
    print('''
        [{}]
        "{}"
        
        Na podstawie:
        {}
        
        Wcześniejsza opcja:
        {}  
    '''.format(row['who_said'], row['line'], row['similar_line'], row['previously_similar']))


        [Iron Legion]
        "Costel!"
        
        Na podstawie:
        We wish to avoid collateral damage and will inform you when this current conflict is resolved
        
        Wcześniejsza opcja:
        Stark, we really need to get inside  
    

        [Iron Legion]
        "Avengers, go home!"
        
        Na podstawie:
        We are here to help
        
        Wcześniejsza opcja:
        You didn't see that coming  
    

        [Tony Stark]
        "designed by Mr Stark."
        
        Na podstawie:
        Thanks, buddy
        
        Wcześniejsza opcja:
        You are Ultron, a global peacekeeping initiative designed by Mr  
    

        [JARVIS]
        "Mr Stark?"
        
        Na podstawie:
        Stark
        
        Wcześniejsza opcja:
        Stark  
    

        [Ultron]
        "May I..."
        
        Na podstawie:
        they can't mean
        
        Wcześniejsza opcja:
        I am contacting Mr  
    

        [Ultron]
   