In [None]:
# Mount Google Drive at /content/drive so the files under
# /content/drive/MyDrive used below are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json
import pandas as pd

# Load the CSV to be filled in; its rows are matched to the transcription
# JSON files through the 'source_video_id' column.
csv_path = '/content/drive/MyDrive/to_fill.csv'
df = pd.read_csv(csv_path)

# Define a function to extract the body, start time, and end time from a JSON file
def extract_info(json_path):
    """Read a transcription JSON file and return its text and time span.

    Parameters
    ----------
    json_path : str or path-like
        Path to a JSON file holding a ``'text'`` entry and a ``'words'``
        list whose items carry ``'start'`` and ``'end'`` values.

    Returns
    -------
    tuple
        ``(body, start, end)``: ``body`` is the value stored under
        ``'text'``, ``start`` is the ``'start'`` of the first word and
        ``end`` is the ``'end'`` of the last word. ``start`` and ``end``
        are ``None`` when the file has no (or an empty) ``'words'`` list.

    Raises
    ------
    KeyError
        If ``'text'`` is missing, or the first/last word entry lacks
        ``'start'``/``'end'``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default locale encoding, which can mangle non-ASCII transcripts.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    body = json_data['text']

    # A missing or empty word list would otherwise raise on words[0].
    words = json_data.get('words') or []
    if not words:
        return body, None, None

    return body, words[0]['start'], words[-1]['end']


# Define a function to extract the first 8 and last 8 words from a story
def extract_first_last_words(row, n_words=8):
    """Return the leading and trailing words of a row's ``'body'`` text.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['body']`` (e.g. a DataFrame row).
    n_words : int, default 8
        Number of whitespace-separated words to take from each end.

    Returns
    -------
    tuple of str
        ``(first_words, last_words)``, each joined with single spaces.
        When the body has fewer than ``n_words`` words, both hold the
        whole text. Both are empty strings when the body is not a string
        (e.g. NaN or None) or when ``n_words`` is not positive.
    """
    body = row['body']
    # A body that was never filled in is typically NaN (a float) and has
    # no .split(); treat any non-string body like an empty one.
    if not isinstance(body, str) or n_words <= 0:
        return '', ''

    words = body.split()
    first_words = ' '.join(words[:n_words])
    # n_words > 0 is guaranteed here, so this slice can never degenerate
    # into words[-0:], which would select the whole list.
    last_words = ' '.join(words[-n_words:])
    return first_words, last_words



# Transcription JSON files; each file name (without its extension) is the
# source_video_id of the dataframe rows it fills in.
json_paths = ['/content/drive/MyDrive/18246.json',
              '/content/drive/MyDrive/16859.json',
              '/content/drive/MyDrive/12387.json']

for json_path in json_paths:
    # e.g. '/content/drive/MyDrive/18246.json' -> '18246'
    transcription_id = json_path.split('/')[-1].split('.')[0]
    body, start, end = extract_info(json_path)

    # Build the row selector once instead of re-parsing the id and
    # re-comparing the whole column for each assigned column.
    matching_rows = df['source_video_id'] == int(transcription_id)
    df.loc[matching_rows, 'body'] = body
    df.loc[matching_rows, 'start'] = start
    df.loc[matching_rows, 'end'] = end

# apply() yields one (first_words, last_words) tuple per row; zip(*...)
# transposes those tuples into the two new columns.
df['first_words'], df['last_words'] = zip(*df.apply(extract_first_last_words, axis=1))

# Display the first rows of the resulting dataframe
print(df.head())


                                         first_words  \
0              And that's just what Guy Al has done.   
1  Bit unusual. Ktv crime reporter Henry Lee joining   
2       Yesterday. We're on 115 for those hot spots,   
3              And that's just what Guy Al has done.   
4       Yesterday. We're on 115 for those hot spots,   

                                        last_words  source_video_id  \
0  the country. Mandela's visit comes as the most.            18246   
1        win. It's time to make this thing happen.            12387   
2   With your good credit, celebrate the start of.            16859   
3  the country. Mandela's visit comes as the most.            18246   
4   With your good credit, celebrate the start of.            16859   

                                                body  start        end  
0  And that's just what Guy Al has done. She take...   70.0   717620.0  
1  Bit unusual. Ktv crime reporter Henry Lee join...  190.0   717560.0  
2  Yesterday. We'

In [None]:
# Inspect the 'body' column (the transcript text assigned per source_video_id).
print(df['body'])

0     And that's just what Guy Al has done. She take...
1     Bit unusual. Ktv crime reporter Henry Lee join...
2     Yesterday. We're on 115 for those hot spots, s...
3     And that's just what Guy Al has done. She take...
4     Yesterday. We're on 115 for those hot spots, s...
5     And that's just what Guy Al has done. She take...
6     And that's just what Guy Al has done. She take...
7     Bit unusual. Ktv crime reporter Henry Lee join...
8     Yesterday. We're on 115 for those hot spots, s...
9     Yesterday. We're on 115 for those hot spots, s...
10    Yesterday. We're on 115 for those hot spots, s...
11    And that's just what Guy Al has done. She take...
12    And that's just what Guy Al has done. She take...
13    And that's just what Guy Al has done. She take...
14    And that's just what Guy Al has done. She take...
15    Bit unusual. Ktv crime reporter Henry Lee join...
16    Bit unusual. Ktv crime reporter Henry Lee join...
17    Bit unusual. Ktv crime reporter Henry Lee 

In [None]:
# Inspect the 'start' column (the 'start' value of each transcript's first word).
print(df['start'])

0      70.0
1     190.0
2     790.0
3      70.0
4     790.0
5      70.0
6      70.0
7     190.0
8     790.0
9     790.0
10    790.0
11     70.0
12     70.0
13     70.0
14     70.0
15    190.0
16    190.0
17    190.0
Name: start, dtype: float64


In [None]:
# Inspect the 'end' column (the 'end' value of each transcript's last word).
print(df['end'])


0      717620.0
1      717560.0
2     1316540.0
3      717620.0
4     1316540.0
5      717620.0
6      717620.0
7      717560.0
8     1316540.0
9     1316540.0
10    1316540.0
11     717620.0
12     717620.0
13     717620.0
14     717620.0
15     717560.0
16     717560.0
17     717560.0
Name: end, dtype: float64
