In [1]:
import srt

import pandas as pd
import numpy as np

from thefuzz import fuzz
from thefuzz import process
from os import listdir
from os.path import isfile, join, basename

In [2]:
complete_info_df = pd.read_csv('../complete_info.csv')
films = complete_info_df['film'].unique()

In [3]:
print(films)

['the hours' 'the reader' 'falling down' 'a beautiful mind'
 'the interpreter' 'bicentennial man' 'cinderella man' 'the impossible'
 'fligthplan' 'casinò' 'saw3' 'titanic' 'poltergeist' 'dancer in the dark'
 'mystic river' 'the others' 'my life without me' 'the devil wears prada'
 'crash' 'side effects' 'big fish' 'atonement' 'brokeback mountain'
 'patch adams' 'good will hunting' 'antichrist' 'philadelphia'
 'the sixth sense' 'finding neverland' 'notting hill'
 'la vida secreta de las palabras' 'hook' 'pretty woman' 'joe black'
 'matrimonio greco' 'million dollar baby' 'american history x'
 'the lives of others' 'lost in translation' 'meet joe black' 'saw'
 'intelligenza artificiale' "l'uomo che sussurrava" 'million dolar baby']


In [4]:
subtitles_path = '../subs/'
subtitle_files = [join(subtitles_path, f) for f in listdir(
    subtitles_path) if isfile(join(subtitles_path, f))]
print(subtitle_files)

['../subs/american_history_x_en.srt', '../subs/american_history_x_es.srt', '../subs/antichrist_es.srt', '../subs/atonement_en.srt', '../subs/atonement_es.srt', '../subs/a_beautiful_mind_en.srt', '../subs/a_beautiful_mind_es.srt', '../subs/a_beautiful_mind_it.srt', '../subs/big_fish_en.srt', '../subs/brokeback_mountain_en.srt', '../subs/brokeback_mountain_es.srt', '../subs/casino_en.srt', '../subs/casino_es.srt', '../subs/cinderella_man_en.srt', '../subs/cinderella_man_es.srt', '../subs/dancer_in_the_dark_en.srt', '../subs/dancer_in_the_dark_es.srt', '../subs/falling_down_en.srt', '../subs/falling_down_es.srt', '../subs/finding_neverland_en.srt', '../subs/finding_neverland_es.srt', '../subs/good_will_hunting_en.srt', '../subs/good_will_hunting_es.srt', '../subs/hook_en.srt', '../subs/hook_es.srt', '../subs/hook_it.srt', '../subs/lost_in_translation_en.srt', '../subs/lost_in_translation_es.srt', '../subs/meet_joe_black_en.srt', '../subs/meet_joe_black_es.srt', '../subs/million_dollar_bab

In [5]:
# with open('../subs/the_hours_en.srt', 'r', encoding='utf-8') as f:
#     subtitles_text = f.read()

# subtitles = list(srt.parse(subtitles_text))

In [6]:
film_info_df = complete_info_df.loc[complete_info_df['film'] == 'the hours']
film_info_df.head(2)

Unnamed: 0,file,speaker,film,emotion,language,transcritpion
0,f_ans001aen,Meryl Streep,the hours,ans,en,"[non-verbal] I don't know what's happening, I'..."
1,f_ans001aes,Rosa Guiñón,the hours,ans,es,"No sé lo que me pasa, lo siento."


In [7]:
from exceptiongroup import catch


def get_phrase_time(subtitles_text: str, film: str):

    if len(subtitles_text) == 0:
        return
    
    try:

        subtitles = list(srt.parse(subtitles_text))
    except srt.SRTParseError as e:
        print(f'Could not parse subtitles for {film}. expected start: {e.expected_start} - actual start {e.actual_start}')
        subtitles = list(srt.parse(subtitles_text[e.actual_start:]))

    film_info_df = complete_info_df.loc[complete_info_df['film'] == film]


    for index, row in film_info_df.iterrows():
        s1 = row['transcritpion']
        if len(s1) > 50:
            s1 = s1[0:50]
        for s in subtitles:
            if fuzz.ratio(s1, s.content) >= 70:
                # print(srt.timedelta_to_srt_timestamp(s.start), s.content)
                complete_info_df.at[index, 'timestamp'] = srt.timedelta_to_srt_timestamp(
                    s.start)
                subtitles.remove(s)
                break



def get_closest_film(filepath: str, films) -> str:
    closest = ''

    name = filepath.replace('_', ' ')[:-7] 

    max = 0
    for film in films:

        ratio = fuzz.ratio(name, film)

        if (ratio > max):
            closest = film

            max = ratio

    return closest
    

In [15]:
from pathlib import Path


subtitles_text = '' 
complete_info_df['timestamp'] = pd.Series()

for file in subtitle_files:

    with open(file) as f:
        
        try:
            subtitles_text = f.read()
        except Exception as e:
            print(f'Could not open file {f.name}. {e}') 

    film = get_closest_film(file, films)
    get_phrase_time(subtitles_text, film)

    

Could not parse subtitles for antichrist. expected start: 0 - actual start 3
Could not parse subtitles for casinò. expected start: 0 - actual start 3
Could not parse subtitles for the others. expected start: 0 - actual start 3
Could not parse subtitles for titanic. expected start: 0 - actual start 3


In [20]:
complete_info_df

Unnamed: 0,file,speaker,film,emotion,language,transcritpion,timestamp
0,f_ans001aen,Meryl Streep,the hours,ans,en,"[non-verbal] I don't know what's happening, I'...","00:54:58,518"
1,f_ans001aes,Rosa Guiñón,the hours,ans,es,"No sé lo que me pasa, lo siento.","00:54:59,350"
2,f_ans001ait,Maria Pia di Meo,the hours,ans,it,Non so che cosa mi succede scusami.,
3,f_ans002aes,Rosa Guiñón,the hours,ans,es,"Lo siento, he sido muy grosera.",
4,f_ans002ait,Maria Pia di Meo,the hours,ans,it,Scusa è davvero scortese da parte mia.,
...,...,...,...,...,...,...,...
1110,m_tri040aen,Billy Crudup,big fish,tri,en,Maybe he never wanted a family.,
1111,m_tri041aen,Billy Crudup,big fish,tri,en,"Whatever it is, he likes his second life ...","01:16:12,610"
1112,m_tri041ait,Vittorio de Angelis,big fish,tri,it,Comunque sia lui preferisce la sua seconda vita.,
1113,m_tri041ben,Billy Crudup,big fish,tri,en,... better and the reason that he tells his st...,


In [17]:
complete_info_df.to_csv('../complete_info_t.csv',  mode='w', index=False)

In [18]:
total = len(complete_info_df)
missing = complete_info_df['timestamp'].isnull().sum()

In [19]:
print(f"total entries: {total}\nhas timestamp: {total - missing}\nmissing: {missing}")

total entries: 1115
has timestamp: 347
missing: 768


In [13]:
# s1 = 'the_devil_wears_prada_en.srt'
# fuzz.ratio(s1.replace('_', ' '), 'the devil wears prada')

In [14]:
# film_info_df = complete_info_df.loc[complete_info_df['film'] == 'the hours']