In [None]:
import os
import pandas as pd
import numpy as np
import ast
import math
import json

# Root folder that is scanned for per-video sub-folders. It is walked with os.walk;
# the root itself is skipped and only sub-folders that contain a stream_details.txt
# file are processed.
rootdir = "./"  # Set Root folder directory.

In [None]:
# Helper Functions

def get_advert_buffer(video_id, subdir, values):
    """Collect advert buffer readings taken at and just after the skip point.

    For a non-skippable advert the advert's ``Duration`` is returned unchanged.
    For a skippable advert, ``AdvertBufferState.txt`` in ``subdir`` is parsed as
    JSON (its keys are whitespace-stripped) and, for every logged showing of the
    advert, the buffer log is scanned once for each of the four thresholds
    ``SkipDuration`` .. ``SkipDuration + 3``. The first sample whose player
    position is not below the threshold contributes
    ``float(buffer value) + threshold`` to the result.

    Args:
        video_id: Advert id, matched against the stripped keys of the file.
        subdir: Folder that contains ``AdvertBufferState.txt``.
        values: Advert details; the keys ``'Skippable'``, ``'Duration'``,
            ``'Count'`` and ``'SkipDuration'`` are read.

    Returns:
        ``values['Duration']`` for a non-skippable advert, otherwise a list of
        floats. The list may be empty when no sample reaches any threshold.

    Raises:
        KeyError: If one of the showing ids is missing from the file.
    """
    if not values['Skippable']:
        # Non-skippable ads are buffered completely, so the duration replaces
        # the buffer reading and gives a 1/1 ratio later on.
        return values['Duration']

    filepath = os.path.join(subdir, 'AdvertBufferState.txt')
    count = values['Count']

    # The first showing is stored under the bare id, repeats under "<id>_<n>".
    # The range starts at 2 so that every repeat is included; starting it at
    # `count` would only ever pick up the last one.
    # NOTE(review): assumes repeats are logged with consecutive suffixes
    # _2 .. _<Count> - confirm against the logger that writes this file.
    id_list = [video_id]
    id_list.extend(f"{video_id}_{showing}" for showing in range(2, count + 1))

    with open(filepath) as data:
        buffer_dict = json.loads(data.read())
    stripped_dict = {key.strip(): value for key, value in buffer_dict.items()}

    ad_skip_duration = int(values['SkipDuration'])
    # Player positions (seconds) at which the buffer is sampled.
    ahead_buffer = [ad_skip_duration + offset for offset in range(4)]

    advert_buffer = []
    for vid_id in id_list:
        buffer = stripped_dict[vid_id]['buffer']
        for val in ahead_buffer:
            for datapoint in buffer:
                current_buffer, video_player_in_seconds, _resolution = datapoint
                if float(video_player_in_seconds) >= float(val):
                    # Buffer read at the threshold plus the seconds already
                    # played ("spent" buffer).
                    advert_buffer.append(float(current_buffer) + val)
                    break  # Only the first sample at/after the threshold counts.

    return advert_buffer


def get_buffer_index(buffer, timestamp):
    """Return the index of the first entry whose second field equals ``timestamp``.

    Every entry of ``buffer`` visited is unpacked as a pair; only the second
    element is compared. ``None`` is returned when nothing matches.
    """
    for position, (_, entry_timestamp) in enumerate(buffer):
        if entry_timestamp == timestamp:
            return position
    return None


def get_lost_buffer(video_id, subdir, main_vid_duration):
    """Sum the main-video buffer that shrank across each occurrence of an advert.

    ``BufferAdvert.txt`` in ``subdir`` is read as a JSON list of
    ``[video id, buffer info, main timestamp]`` records and
    ``buffer_details.txt`` as a JSON list of ``[buffer value, timestamp]``
    pairs. For every record whose stripped id equals ``video_id`` the loss is
    the recorded buffer size minus the value of the sample that follows the
    record's timestamp in ``buffer_details.txt``; negative differences are
    reported on stdout and counted as 0.

    TODO: Check validity against a file that has ad count > 2

    Args:
        video_id: Advert id (compared with the stripped ids in the file).
        subdir: Folder holding ``BufferAdvert.txt`` and ``buffer_details.txt``.
        main_vid_duration: Total duration of the main video.

    Returns:
        ``-1`` when a matching record lies within one second of the end of the
        main video or carries a scalar zero as buffer info (ad at the start of
        the video); otherwise the total loss as a ``float``.

    Raises:
        ValueError: If a record's timestamp is not present in
            ``buffer_details.txt``.
        IndexError: If no sample follows the matched one, or if no record
            matches ``video_id`` at all.
    """
    with open(os.path.join(subdir, 'BufferAdvert.txt')) as f:
        data = json.loads(f.read())

    with open(os.path.join(subdir, 'buffer_details.txt')) as f:
        buffer = json.loads(f.read())  # Load the buffer into memory.

    wasted_buffer = []
    for dp_video_id, buff_info, main_timestamp in data:
        if dp_video_id.strip() != video_id:
            continue

        # NOTE(review): both -1 returns below discard any loss already
        # collected from earlier occurrences of the same ad - confirm intended.
        if math.floor(abs(main_vid_duration - main_timestamp)) <= 1:
            return -1  # Ad was at the end of the video

        # Ad at the start of the video: buffer info is a bare zero instead of a
        # pair. JSON may deliver it as int 0 or float 0.0, so accept both.
        is_scalar = isinstance(buff_info, (int, float)) and not isinstance(buff_info, bool)
        if is_scalar and buff_info == 0:
            return -1

        buff_size, timestamp = buff_info
        datapoint_index_in_buffer = get_buffer_index(buffer, timestamp)
        if datapoint_index_in_buffer is None:
            raise ValueError(
                f"Timestamp {timestamp!r} of ad {dp_video_id!r} not found in "
                f"buffer_details.txt (folder: {subdir})")
        if datapoint_index_in_buffer + 1 >= len(buffer):
            raise IndexError(
                f"No buffer sample follows timestamp {timestamp!r} of ad "
                f"{dp_video_id!r} (folder: {subdir})")

        next_buffer_value, _next_timestamp = buffer[datapoint_index_in_buffer + 1]
        inter_value = float(buff_size - next_buffer_value)
        if inter_value < 0:
            print("Negative Value of Buffer Lost: ",
                  dp_video_id, inter_value, "Folder: ", subdir)
            inter_value = 0

        wasted_buffer.append(inter_value)

    if not wasted_buffer:
        raise IndexError(
            f"No BufferAdvert.txt record matches ad {video_id!r} (folder: {subdir})")

    # For ads that came multiple times, add all the buffer that was wasted.
    return float(sum(wasted_buffer))

In [None]:
all_data = []

# For each folder below rootdir: read the stream details, enrich every advert
# entry with its buffer statistics and timestamp, and collect one dict per folder.
for subdir, _dirs, _files in os.walk(rootdir):
    if subdir == rootdir:
        continue

    # Get path to stream details; folders without it are not recordings.
    path = os.path.join(subdir, "stream_details.txt")
    if not os.path.exists(path):
        continue

    with open(path) as f:
        data_json = json.loads(f.read())  # Load Dictionary

    duration_path = os.path.join(subdir, "BufferAdvert.txt")
    with open(duration_path) as f:
        duration_json = json.loads(f.read())

    main_vid_duration = data_json['Main_Video']['Total Duration']

    new_dict = {}
    for key, value in data_json.items():
        # Ids are stripped once and the stripped form is used everywhere (helper
        # calls and dictionary keys), so the lookup against BufferAdvert.txt
        # below does not depend on both files carrying identical whitespace.
        ad_id = key.strip()

        if ad_id == "empty_video":
            continue

        # The main video needs no processing; store it as is.
        if ad_id == "Main_Video":
            new_dict["Main_Video"] = value
            continue

        print("Getting advert buffer for key:", subdir, value)

        # NOTE(review): 999 looks like a sentinel for "no skip point" - confirm.
        # Every other skip duration is normalised to 5.
        if value['SkipDuration'] != 999:
            value['SkipDuration'] = 5

        advert_buffers = get_advert_buffer(ad_id, subdir, value)
        lost_while_played = get_lost_buffer(ad_id, subdir, main_vid_duration)

        if isinstance(advert_buffers, list):
            # Only the first reading is kept. An empty list (no sample reached
            # the skip point) is recorded as missing instead of raising.
            value['Advert_Buffer'] = advert_buffers[0] if advert_buffers else None
        else:
            value['Advert_Buffer'] = advert_buffers

        value['Seconds_Lost_To_Ad'] = lost_while_played
        new_dict[ad_id] = value

    for record in duration_json:
        print(record)
        key = record[0]
        ad_id = key.strip()

        if ad_id == "empty_video":
            continue
        print(key)

        # record[1] is either a pair whose second item is the timestamp or a
        # bare number; in the latter case the value in record[2] is used.
        # Only the indexing failure is caught, so an unknown ad id still
        # surfaces as a KeyError instead of being masked.
        try:
            timestamp = record[1][1]
        except (TypeError, IndexError):
            timestamp = record[2]
        new_dict[ad_id]['Timestamp'] = timestamp

    all_data.append(new_dict)

In [None]:
# Build the CSV header: six fixed main-video columns followed by one group of
# ten columns per advert slot.

# The number of advert slots is the largest 'UniqueAds' value of any main video
# (-1, i.e. no slots, when nothing was collected).
max_number_of_ads = max(
    [-1] + [entry["Main_Video"]["UniqueAds"]
            for entry in all_data if "Main_Video" in entry]
)

column_names = [
    "Main_Video_Url",
    "Unique_Ads",
    "Total Number of Ads",
    "Duration(s)",
    "Size Original (Bytes)",
    "Resolution",
]

print(all_data)

# One template per advert field, in the order the row builder emits the values.
# NOTE(review): "Durationn(s)" and "Avertisement" are misspelt; they are kept
# verbatim because they end up as headers of the exported CSV.
ad_column_templates = (
    "Advertisement {} ID",
    "Advertisement {} Count",
    "Advertisement {} Skippable",
    "Advertisement {} Skip Duration",
    "Advertisement {} Resolution",
    "Advertisement {} Size(Bytes)",
    "Advertisement {} Durationn(s)",
    "Advertisement {} Advert_Buffer(s)",
    "Advertisement {} Main_Buffer_Lost(s)",
    "Avertisement {} Timestamp",
)

add_columns = [
    template.format(slot)
    for slot in range(1, max_number_of_ads + 1)
    for template in ad_column_templates
]

new_columns = column_names + add_columns

In [None]:
# Generate the rows: one flat list per collected folder, laid out as
# [main-video fields..., then ten values per advert].
main_fields_before_total = ('Url', 'UniqueAds')
main_fields_after_total = ('Total Duration', 'Size', 'Resolution')
ad_fields = ('Count', 'Skippable', 'SkipDuration', 'Resolution', 'Size',
             'Duration', 'Advert_Buffer', 'Seconds_Lost_To_Ad', 'Timestamp')

all_data_rows = []
for video_entry in all_data:
    main_video = video_entry['Main_Video']

    row = [main_video[field] for field in main_fields_before_total]
    row.append(0)  # Placeholder for the total number of ads (index 2).
    row.extend(main_video[field] for field in main_fields_after_total)

    total_ads = 0
    for ad_id, ad in video_entry.items():
        if ad_id == "Main_Video":
            continue
        row.append(ad_id)
        row.extend(ad[field] for field in ad_fields)
        total_ads += ad['Count']

    row[2] = total_ads  # Fill the placeholder once every ad has been counted.
    all_data_rows.append(row)

In [None]:
# Assemble the collected rows under the generated header and export them.
# The DataFrame index is written as the first (unnamed) CSV column.
df = pd.DataFrame(data=all_data_rows, columns=new_columns)
df.to_csv(path_or_buf='PakistanTrending.csv')