In [1]:
from sklearn.linear_model import Ridge
import numpy as np
import time
import string
# from copy import copy
import pickle
import pandas as pd
import nibabel as nib
import matplotlib.pyplot as plt
import IPython
import itertools
from collections import defaultdict
from sklearn.cross_validation import cross_val_score
import sklearn.metrics

%pylab inline

Populating the interactive namespace from numpy and matplotlib




# Подготовка субтитров

In [4]:
umlautDictionary = {u'Ä': 'Ae', u'Ö': 'Oe', u'Ü': 'Ue',  u'ä': 'ae', u'ö': 'oe', u'ü': 'ue', u'ß':u'ss'}
with open('/home/anya/Semantic-Atlas/data/forrest_srt/forrest_from_3.srt') as f_srt:
    text = f_srt.read()
    text = text.decode('utf-8')
    
for item, val in umlautDictionary.iteritems():
    text = text.replace(item, val)

text = text.split('\r\n')
text = filter(lambda s: s != u'', text)
print text[:5]

[u'1', u'00:03:10,160 --> 00:03:14,631', u'Hallo. Mein Name ist Forrest.', u'Forrest Gump.', u'2']


In [5]:
timestamps = filter(lambda s: '>' in s, text)
number_indecies = filter(lambda s: text[s].isdigit(), range(len(text)))
timestamp_indecies = filter(lambda s: text[s] in timestamps, range(len(text)))
text_indecies = filter(lambda s: not (s in number_indecies or s in timestamp_indecies) , range(len(text)))

assert len(timestamps) == len(number_indecies)

In [6]:
def timestamp_str_to_seconds(timestamp):
    
    def time_to_sec(time):
        time = time.split(':')
        return float(time[0]) * 60 * 60 + float(time[1]) * 60 + float(time[2])
    
    time_start, time_end = timestamp.split(' --> ')
    return time_to_sec(time_start), time_to_sec(time_end)

In [7]:
parsed_timestamps = map(lambda s: [time.strptime(s[:12], '%H:%M:%S,%f'), 
                            time.strptime(s[17:], '%H:%M:%S,%f')], timestamps)
print timestamps[0]
print parsed_timestamps[0][0].tm_hour, parsed_timestamps[0][0].tm_min, parsed_timestamps[0][0].tm_sec
print parsed_timestamps[0][1].tm_hour, parsed_timestamps[0][1].tm_min, parsed_timestamps[0][1].tm_sec

00:03:10,160 --> 00:03:14,631
0 3 10
0 3 14


In [8]:
timestamps = map(lambda s: s.replace(',', '.'), timestamps)
seconds = map(timestamp_str_to_seconds, timestamps)
print seconds[:5]

[(190.16, 194.631), (200.32, 202.151), (204.48, 208.837), (208.92000000000002, 213.63), (216.0, 219.31)]


In [9]:
table = string.maketrans("", "")
parsed_phrases = []

for i, phrase in enumerate(text):
    if i not in text_indecies:
        continue
    
    phrase = phrase.encode('utf-8')
    phrase = phrase.translate(table,string.punctuation)
    phrase = phrase.translate(table, string.digits)
    phrase = phrase.lower()
    #phrase = phrase.split(' ')
    phrase = filter(lambda s: s != u'', phrase)
    
    if phrase != []:
        if i - 1 in text_indecies:
            parsed_phrases[-1] += " " + phrase
        else:
            parsed_phrases.append(phrase)

In [10]:
assert len(parsed_phrases) == len(timestamps)

In [11]:
srt = pd.DataFrame(columns=['start', 'end', 'text'])
srt.start = [seconds[i][0] for i in range(len(seconds))]
srt.end = [seconds[i][1] for i in range(len(seconds))]
srt.text = [parsed_phrases[i] for i in range(len(seconds))]

# Удаление сцен

In [12]:
# http://www.nature.com/articles/sdata20143/tables/1

times_to_stay = [
'00:00:00.00 --> 00:21:32.12',
'00:24:13.24 --> 00:38:31.23', 
'00:38:58.20 --> 00:57:19.22', 
'00:59:31.17 --> 01:18:14.00', 
'01:20:24.16 --> 01:34:18.06', 
'01:37:14.19 --> 01:41:30.19',
'01:42:49.19 --> 02:09:51.17']

times_to_stay = map(timestamp_str_to_seconds, times_to_stay)
times_to_stay

[(0.0, 1292.12),
 (1453.24, 2311.23),
 (2338.2, 3439.22),
 (3571.17, 4694.0),
 (4824.16, 5658.06),
 (5834.19, 6090.19),
 (6169.19, 7791.17)]

In [13]:
times_to_delete = [(times_to_stay[i][1], times_to_stay[i+1][0]) for i in range(0, len(times_to_stay)-1)]

In [14]:
def is_in_stimulus(record, times_to_stay=times_to_stay):
    for time_moment_start, time_moment_end in times_to_delete:
        if record.start >= time_moment_start and record.end <= time_moment_end:
            return False
        
    return True

In [15]:
srt = srt.ix[[i for i in range(len(srt)) if is_in_stimulus(srt.ix[i])]]

# Пересчет времени

In [16]:
def recount_time(time_moment):

    index_to_delete = 0
    for end_moment in times_to_delete[:, 0]:
        if time_moment > end_moment:
            index_to_delete += 1
        else:
            break
            
    return time_moment - deleted_contin[index_to_delete]

In [17]:
deleted_contin = np.cumsum(np.array([t[1] - t[0] for t in times_to_delete]))
deleted_contin = np.hstack([[0], deleted_contin])
times_to_delete = np.array(times_to_delete)

srt['start'] = srt.start.apply(recount_time)
srt['end'] = srt.end.apply(recount_time)

#data['phrase'] = data.text.apply(lambda x: ' '.join(x))
#srt = srt[['start', 'end', 'text']]
srt.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/recounted_times.csv', encoding='utf-8')

# Подготовка описания сцен

In [18]:
description = pd.read_csv('/home/anya/Semantic-Atlas/data/forrest_srt/german_audio_description.csv')
umlautDictionary = {u'Ä': 'Ae', u'Ö': 'Oe', u'Ü': 'Ue',  u'ä': 'ae', u'ö': 'oe', u'ü': 'ue', u'ß':u'ss'}

for i, phrase in enumerate(description.text):
    phrase = description.text[i].lower()
    phrase = phrase.translate(table,string.punctuation)
    phrase = phrase.translate(table, string.digits)
    phrase = phrase.decode('utf-8')
    for uml, norm_let in umlautDictionary.iteritems():
        phrase = phrase.replace(uml, norm_let)
    
    #phrase = phrase.split()
    description.text[i] = 'ss ' + phrase

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Соединение субтитров и описания сцен

In [19]:
data = pd.concat([description, srt])
data.sort(columns='start', inplace=True)
data.index = range(len(data))
data

  from ipykernel import kernelapp as app


Unnamed: 0,start,end,text
0,1.30,10.500,ss eine computeranimation auf einen schroffen ...
1,15.90,19.500,ss vor bewoelktem himmel schwebt eine grauweis...
2,22.60,25.800,ss in einer stadt segelt sie ueber die baumkro...
3,31.70,36.700,ss ein robertzemeckisfilm tom hanks als forres...
4,40.70,61.000,ss die feder fliegt ein paar meter ueber den w...
5,68.70,73.000,ss in der naehe eines kirchturms sinkt die fed...
6,77.00,91.500,ss die hoerfilmfassung wurde produziert vom ba...
7,93.00,101.000,ss die feder trudelt zu einer strasse hinunter...
8,105.60,124.900,ss der mann ueberquert die fahrbahn die feder ...
9,126.20,145.700,ss der mann hebt sie auf und betrachtet sie e...


# Разбиение на сегменты

In [20]:
# http://www.nature.com/articles/sdata20143/#f2
segments_time = [902., 882., 876., 976., 924., 878., 1084., 675.]

In [21]:
data['duration'] = data.end - data.start

In [22]:
def print_subs(data, filename, filepath, delay=0.):
    start = data.ix[data.index[0]].start
    with open(filepath + filename, 'w') as f_out:
        for item in data.iterrows():
            f_out.write('{}\t{}\t{}\n'.format(item[1].start - start + delay,
                                              item[1].end - start + delay,
                                              item[1].text.encode('utf-8')))

In [24]:
def write_script_for_cut_audio(segment_start, segment_end, filepathname, segment_numb, data=data, delay=0.):
    data1 = data[data.start > segment_start]
    data1 = data1[data1.end < segment_end]
    
    moment_start = data1.ix[data1.index[0]].start
    data1.start = data1.start - moment_start
    data1.end = data1.end - moment_start
    
    with open(filepathname, 'w') as do:
        for ix, row in data1.iterrows():
            start, end = row.start, row.end
            command = ('ffmpeg -i /home/anya/Semantic-Atlas/audio/forrest_segments/fg_ad_seg{segment_id}.mkv -ss {start} -t {duration} '
                '-acodec pcm_s16le -ar 16000 -ac 1 -y /home/anya/Semantic-Atlas/data/aligned_segments/segment{segment_id}/audio_phrases/{index}.wav').format(
                start=start + delay,
                duration=end - start,
                index=ix,
                segment_id=segment_numb)
            do.write(command + '\n')

In [23]:
data1 = data[data.start > 885]
data1 = data1[data1.end < 1768]
print_subs(data1, 'segment1_phrases.txt', '/home/anya/Semantic-Atlas/data/forrest_srt/')
print_subs(data1, 'segment1_phrases.txt', '/home/anya/Semantic-Atlas/data/aligned_segments/segment1/')
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment1_phrases.csv', encoding='utf-8')

In [27]:
write_script_for_cut_audio(885, 1768, '/home/anya/Semantic-Atlas/sh/segm1_cut.sh', 1)

In [None]:
data1 = data[data.start > 1750]
data1 = data1[data1.end < 2630]
data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment2/segment2_phrases.csv', encoding='utf-8')
print_subs(data1, 'segment2_phrases.txt', '/home/anya/Semantic-Atlas/data/aligned_segments/segment2/', delay=-0.5)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment2_phrases.csv', encoding='utf-8')
write_script_for_cut_audio(1750, 2638, '/home/anya/Semantic-Atlas/sh/segm2_cut.sh', segment_numb=2,delay=-0.5)

In [None]:
data1 = data[data.start > 2610]
data1 = data1[data1.end < 3590]
data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment3/segment3_phrases.csv', encoding='utf-8')
print_subs(data1, 'segment3_phrases.txt', '/home/anya/Semantic-Atlas/data/aligned_segments/segment3/', delay=-0.7)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment3_phrases.csv', encoding='utf-8')
write_script_for_cut_audio(2610, 3590, '/home/anya/Semantic-Atlas/sh/segm3_cut.sh', segment_numb=3, delay=-0.7)
data1.tail()

In [None]:
start = 3570
end = 4505
data1 = data[data.start > start]
data1 = data1[data1.end < end]
segment_id = 4
delay = 2.2

data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/segment{}_phrases.csv'.format(segment_id,
                                                                                                     segment_id),
             encoding='utf-8')
print_subs(data1, 'segment{}_phrases.txt'.format(segment_id),
           '/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/'.format(segment_id),
          delay=delay)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment{}_phrases.csv'.format(segment_id),
             encoding='utf-8')
write_script_for_cut_audio(start, end, '/home/anya/Semantic-Atlas/sh/segm{}_cut.sh'.format(segment_id),
                           segment_numb=4,delay=delay)
data1.tail()

In [75]:
start = 4480
end = 5355
data1 = data[data.start > start]
data1 = data1[data1.end < end]
segment_id = 5
delay = 0.

data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/segment{}_phrases.csv'.format(segment_id,
                                                                                                     segment_id),
             encoding='utf-8')
print_subs(data1, 'segment{}_phrases.txt'.format(segment_id),
           '/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/'.format(segment_id),
          delay=delay)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment{}_phrases.csv'.format(segment_id),
             encoding='utf-8')
write_script_for_cut_audio(start, end, '/home/anya/Semantic-Atlas/sh/segm{}_cut.sh'.format(segment_id),
                           segment_numb=segment_id,delay=delay)
data1.tail()

Unnamed: 0,start,end,text
1346,5317.4,5327.3,ss er sieht jenny im weissen kleid ueber den r...
1347,5329.8,5331.4,ss bedrueckt dreht sich forrest weg
1348,5334.7,5336.0,ss er schlurft zur tuer…
1349,5339.8,5341.7,ss … und geht zoegernd zurueck ins haus
1350,5349.1,5354.0,ss es ist tag neben seinem haus sitzt forrest ...


In [74]:
start = 5349
end = 6430
segment_id = 6
delay = 7.


data1 = data[data.start > start]
data1 = data1[data1.end < end]
data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/segment{}_phrases.csv'.format(segment_id,
                                                                                                     segment_id),
             encoding='utf-8')
print_subs(data1, 'segment{}_phrases.txt'.format(segment_id),
           '/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/'.format(segment_id),
          delay=delay)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment{}_phrases.csv'.format(segment_id),
             encoding='utf-8')
write_script_for_cut_audio(start, end, '/home/anya/Semantic-Atlas/sh/segm{}_cut.sh'.format(segment_id),
                           segment_numb=segment_id,delay=delay)
data1.tail()

Unnamed: 0,start,end,text
1588,6399.39,6402.826,was guckst du da an ernie und bert
1589,6401.2,6402.6,ss forrest setzt sich neben ihn
1590,6409.0,6410.8,ss jenny beobachtet die beiden geruehrt
1591,6417.6,6421.9,ss auf einem spielplatz in einem park schaukel...
1592,6424.35,6428.628,forrest ich bin krank


In [73]:
start = 6410
end = 9000
segment_id = 7
delay = 5.


data1 = data[data.start > start]
data1 = data1[data1.end < end]
data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/segment{}_phrases.csv'.format(segment_id,
                                                                                                     segment_id),
             encoding='utf-8')
print_subs(data1, 'segment{}_phrases.txt'.format(segment_id),
           '/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/'.format(segment_id),
          delay=delay)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment{}_phrases.csv'.format(segment_id),
             encoding='utf-8')
write_script_for_cut_audio(start, end, '/home/anya/Semantic-Atlas/sh/segm{}_cut.sh'.format(segment_id),
                           segment_numb=segment_id,delay=delay)
data1.head()

Unnamed: 0,start,end,text
1591,6417.6,6421.9,ss auf einem spielplatz in einem park schaukel...
1592,6424.35,6428.628,forrest ich bin krank
1593,6431.07,6434.426,was hast du einen husten wegen einer erkaeltung
1594,6434.51,6439.584,ich habe einen virus die aerzte wissen nicht w...
1595,6439.67,6442.707,und es gibt nichts was sie dagegen tun koennen


In [35]:
start = 0
end = 903
segment_id = 0
delay = 1.


data1 = data[data.start > start]
data1 = data1[data1.end < end]
data1.to_csv('/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/segment{}_phrases.csv'.format(segment_id,
                                                                                                     segment_id),
             encoding='utf-8')
print_subs(data1, 'segment{}_phrases.txt'.format(segment_id),
           '/home/anya/Semantic-Atlas/data/aligned_segments/segment{}/'.format(segment_id),
          delay=delay)
data1.to_csv('/home/anya/Semantic-Atlas/data/forrest_srt/segment{}_phrases.csv'.format(segment_id),
             encoding='utf-8')
write_script_for_cut_audio(start, end, '/home/anya/Semantic-Atlas/sh/segm{}_cut.sh'.format(segment_id),
                           segment_numb=segment_id,delay=delay)
data1.tail()

Unnamed: 0,start,end,text,duration
208,890.0,892.5,ss die krankenschwester sieht ungeruehrt in ih...,2.5
209,892.6,896.309,meine mama sagte immer dass wunder jeden tag p...,3.709
210,896.2,897.3,ss ss die schwester blickt hoch,1.1
211,896.4,899.949,manche leute glauben das nicht aber es stimmt,3.549
212,900.2,902.8,ss ss forrest und jenny auf einem weg er krieg...,2.6
