In [2]:
import os
import re
import tiktoken

# Bound `encode` method of the tokenizer tiktoken associates with gpt-3.5-turbo.
encode = tiktoken.encoding_for_model("gpt-3.5-turbo").encode


def count_token(x):
    """Return how many tokens `x` produces when passed through `encode`."""
    return len(encode(x))

def remove_ignored_dirs(dirs, ignore_dirs):
    """Prune `dirs` in place, dropping every name contained in `ignore_dirs`.

    The result is written back through slice assignment, so the caller's list
    object itself is modified (nothing is returned). Callers in this file pass
    the `dirs` list yielded by `os.walk`.
    """
    kept = []
    for name in dirs:
        if name not in ignore_dirs:
            kept.append(name)
    dirs[:] = kept


def generate_tree(start_path: str, ignore_dirs: list, indent_size: int = 1) -> str:
    """Render the directory tree under `start_path` as indented text.

    Each directory is printed as ``name/`` followed by its files, then its
    sub-directories; every nesting level is indented by `indent_size` spaces.

    Args:
        start_path: Directory to walk. A trailing path separator is tolerated.
        ignore_dirs: Directory names that are neither listed nor descended into.
        indent_size: Number of spaces added per nesting level.

    Returns:
        The tree as a single newline-joined string (no trailing newline).
    """
    tree_lines = []
    for root, dirs, files in os.walk(start_path):
        # Prune in place so os.walk skips ignored directories, and sort so the
        # traversal order does not depend on the file system's listing order.
        dirs[:] = sorted(d for d in dirs if d not in ignore_dirs)

        # Depth is derived from the path relative to the start directory
        # rather than from a substring replace, which miscounts when
        # start_path ends with a separator or recurs inside root.
        rel_root = os.path.relpath(root, start_path)
        level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1

        indent = ' ' * indent_size * level
        # normpath drops a trailing separator so basename is never empty.
        dir_name = os.path.basename(os.path.normpath(root))
        tree_lines.append(f"{indent}{dir_name}/")

        sub_indent = ' ' * indent_size * (level + 1)
        tree_lines.extend(f"{sub_indent}{f}" for f in sorted(files))

    # Every collected line is kept; slicing off the final entry would hide the
    # last file (or directory) of the tree.
    return '\n'.join(tree_lines)

# Maps a file extension (leading dot included) to the language tag that labels
# the file's fenced code block in the generated output. Extensions missing
# here fall back to the extension text itself (see get_lang_from_extension).
FILE_EXTENSION_LANG_MAP = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp",
    ".cs": "csharp",
    ".php": "php",
    ".rb": "ruby",
    ".swift": "swift",
    ".go": "go",
    ".r": "r",
    ".m": "objective-c",
    ".pl": "perl",
    ".md": "markdown",
    ".tsx": "typescript",
    ".jsx": "javascript",
}
# Maps a language tag to the token that starts a comment in that language.
# process_file returns it so the file path can be written as a comment line,
# and uses it to strip comments in no_formatting mode. Languages missing here
# (e.g. "markdown") get "//" through the lookup default in process_file.
COMMENT_SYMBOL_MAP = {
    "python": "#",
    "javascript": "//",
    "typescript": "//",
    "java": "//",
    "c": "//",
    "cpp": "//",
    "csharp": "//",
    "php": "//",
    "ruby": "#",
    "swift": "//",
    "go": "//",
    "r": "#",
    "objective-c": "//",
    "perl": "#",
    # Reached only through the extension fallback: FILE_EXTENSION_LANG_MAP has
    # no ".css" entry, so ".css" resolves to the tag "css".
    "css": "/*"
}

def get_lang_from_extension(file_extension: str) -> str:
    """Translate a file extension such as ".py" into a language tag.

    Extensions absent from FILE_EXTENSION_LANG_MAP fall back to the extension
    with its first character removed (so ".css" yields "css" and "" yields "").
    """
    if file_extension in FILE_EXTENSION_LANG_MAP:
        return FILE_EXTENSION_LANG_MAP[file_extension]
    return file_extension[1:]

def process_file(root: str, file: str, relative_path: str, no_formatting: bool, print_token_counts: bool):
    """Read one file and package its content for the combined output.

    Args:
        root: Directory that contains the file.
        file: File name inside `root`.
        relative_path: Path used to label the file in the output and in
            error messages.
        no_formatting: When true, strip line comments and collapse the content
            onto a single line to shrink it.
        print_token_counts: When true, count the tokens of the content plus
            `relative_path`; otherwise the count is reported as 0.

    Returns:
        A dict with the keys "path", "token_count", "data", "comment_symbol"
        and "lang", or None when the file cannot be opened or is not valid
        UTF-8 (a message is printed in that case).
    """
    lang = get_lang_from_extension(os.path.splitext(file)[-1])
    comment_symbol = COMMENT_SYMBOL_MAP.get(lang, "//")

    try:
        with open(os.path.join(root, file), "r", encoding='utf-8') as infile:
            data = infile.read()
    except (UnicodeDecodeError, OSError):
        # Best effort: one undecodable or unreadable file (permissions,
        # dangling symlink, ...) is skipped instead of aborting the whole walk.
        print(f"Error reading {relative_path}")
        return None

    if no_formatting:
        # Drop everything from the comment symbol to the end of each line.
        # NOTE: the symbol is matched anywhere on the line, so text such as
        # "https://..." inside a string literal is truncated as well.
        comment_pattern = re.escape(comment_symbol) + '.*$'
        data = re.sub(comment_pattern, ' ', data, flags=re.MULTILINE)
        # Empty out lines that contain only spaces/tabs.
        data = re.sub(r'(?<=\n)[ \t]*(?=\n)', '', data)
        # remove all newlines
        data = re.sub(r'\n', ' ', data)
        # Normalise runs of semicolons, then runs of spaces.
        data = re.sub(r';+', '; ', data)
        data = re.sub(r' +', ' ', data)

    if print_token_counts:
        token_count = count_token(data) + count_token(relative_path)
    else:
        token_count = 0

    return {
        "path": relative_path,
        "token_count": token_count,
        "data": data,
        "comment_symbol": comment_symbol,
        "lang": lang
    }

def _as_suffix_tuple(value):
    """Normalise an extension setting to the tuple form `str.endswith` accepts."""
    if isinstance(value, str):
        value = (value,)
    # Empty suffixes would match every file name, so they are dropped.
    return tuple(suffix for suffix in value if suffix)


def walk_files(config: dict):
    """Collect the selected files under `config['path']` into one text file.

    The output starts with the total token count, followed by the directory
    tree and then one fenced code block per file (sorted by file extension),
    separated by horizontal rules.

    Keys read from `config`:
        path: Directory to scan.
        output_file: File the result is written to.
        target_files: File names to include; empty means all files.
        allowed_extensions: Suffixes a file must end with; empty means any.
        ignored_extensions: Suffixes that exclude a file.
        ignore_dirs: Directory names that are not descended into.
        ignore_files: File names that are always skipped.
        no_formatting, print_token_counts: Forwarded to `process_file`.
    """
    cwd = os.getcwd()
    outputs = []

    target_files = config["target_files"]
    allowed_extensions = _as_suffix_tuple(config["allowed_extensions"])
    ignored_extensions = _as_suffix_tuple(config["ignored_extensions"])

    for root, dirs, files in os.walk(config['path']):
        remove_ignored_dirs(dirs, config['ignore_dirs'])

        for file in files:
            if file in config['ignore_files']:
                continue
            if allowed_extensions and not file.endswith(allowed_extensions):
                continue
            # Checked independently of the allow-list so ignored extensions
            # are honoured even when no allow-list is configured.
            if file.endswith(ignored_extensions):
                continue
            if target_files and file not in target_files:
                continue

            # Path relative to the working directory, always with forward
            # slashes, computed with path arithmetic rather than a substring
            # replace on `root`.
            relative_path = os.path.relpath(os.path.join(root, file), cwd).replace(os.sep, '/')

            output = process_file(root, file, relative_path, config['no_formatting'], config['print_token_counts'])
            # Unreadable and empty files are left out.
            if output and output['data']:
                outputs.append(output)

    # Group files of the same type together; sorted() is stable, so the walk
    # order is kept within each extension.
    outputs = sorted(outputs, key=lambda x: os.path.splitext(x['path'])[-1])
    sept = '\n\n---\n\n'
    tree = generate_tree(config['path'], config['ignore_dirs'], indent_size=2) + sept

    def generate_output_content(x):
        # Fenced block whose first line is the file path written as a comment.
        return f"```{x['lang']}\n{x['comment_symbol']} {x['path']}\n{x['data']}\n```"

    if config['print_token_counts']:
        out_string = tree + sept.join([f"count: {x['token_count']}\n{generate_output_content(x)}" for x in outputs])
    else:
        out_string = tree + sept.join([generate_output_content(x) for x in outputs])

    total_tokens = count_token(out_string)

    with open(config['output_file'], "w", encoding='utf-8') as outfile:
        outfile.write(f'total: {total_tokens}\n\n{out_string}')

# Settings consumed by walk_files.
config = {
    'path': os.getcwd(),  # directory to scan, e.g. "src/components/game"
    'output_file': "output.txt",
    'target_files': ["App.tsx", "index.tsx"],  # [] selects every file
    'allowed_extensions': (),  # () disables the allow-list
    'ignored_extensions': (),
    'print_token_counts': False,
    'no_formatting': True,
    'ignore_dirs': ["favicon", ".husky", ".next", "node_modules", ".git", ".vscode", "__pycache__", "old", "test_page", "dist"],
    'ignore_files': ["example.ts", "output.txt", "yarn.lock", "package-lock.json", "package.json", "tsconfig.json", "run.ipynb", "tailwind.config.cjs"],
}

walk_files(config)
# Open the generated file with the `code` command. The name comes from the
# config so it cannot drift from the file walk_files actually wrote.
os.system('code "{}"'.format(config['output_file']))

0

In [None]:
# url = 'https://ls7ojkm1lh.execute-api.us-west-2.amazonaws.com/Prod'
# headers = {
#     "Content-Type": "application/json",
#     "x-api-key": os.environ["API_KEY"],  # secret redacted: never commit API keys; rotate the key that was exposed here
# }

In [2]:

import requests
import json
import base64

# Load the base64-encoded sample audio that is sent to the service.
with open('old/devEnv/base64/example.txt', 'r') as f:
    base64_audio = f.read()

print(len(base64_audio))

# Target language of the request; switch by (un)commenting.
# lang = 'hi'
lang = 'pt'
# lang = 'ko'

url = 'https://dyhweydzb4ws7peuxwmy5dzifm0zhnww.lambda-url.us-west-2.on.aws/'

headers = {
    "Content-Type": "application/json",
}

data = {
    "audioData": base64_audio,
    "startTime": 0,
    "endTime": 10,
    "sourceLanguage": "en",
    "targetLanguage": lang,
    "videoURL": "http://example.com/video.mp4",
    "videoId": "test"
}

jsonData = json.dumps(data)
# (connect, read) timeouts in seconds: without them requests waits forever on
# an unresponsive endpoint. Raise the read timeout if the service is slower.
response = requests.post(url, headers=headers, data=jsonData, timeout=(10, 300))


if response.status_code == 200:
    content = response.json()
    # Show only the response keys; dumping the whole response object would
    # print the entire base64 payload.
    print(response.status_code, sorted(content))
    # convert back into binary
    audioDataTrans = base64.b64decode(content['audioData'])
    # save to mp3 file
    output_file_name = f'old/devEnv/test_audio_{lang}.mp3'
    with open(output_file_name, 'wb') as f:
        f.write(audioDataTrans)
    print('success')
else:
    # The error body is not guaranteed to be JSON; fall back to the raw text.
    try:
        print(response.json())
    except ValueError:
        print(response.status_code, response.text)

159416
{'error': 'Unexpected error: can only concatenate str (not "NoneType") to str'}


In [6]:
import base64
import filetype
import re

def identify_file_type(base64_string):
    """Guess the file type of base64-encoded binary data.

    Args:
        base64_string: Base64 text, optionally preceded by a comma-terminated
            prefix such as a data-URI header ("data:...;base64,"). Whitespace
            inside the payload (e.g. a trailing newline from a text file) is
            ignored.

    Returns:
        The extension reported by `filetype.guess`, "Unknown file type" when
        no type is recognised, or "Invalid base64 string" when the payload is
        not standard-alphabet base64.
    """
    # Keep only what follows the last comma (drops a data-URI style prefix).
    actual_base64_data = base64_string.split(",")[-1]

    # Remove whitespace first: otherwise a trailing newline or wrapped lines
    # skew the padding arithmetic and fail the validation below.
    actual_base64_data = re.sub(r'\s+', '', actual_base64_data)

    # Check if padding adjustment is necessary
    missing_padding = len(actual_base64_data) % 4
    if missing_padding:
        actual_base64_data += '=' * (4 - missing_padding)

    # Ensure the base64 string is valid (at most two padding characters, so a
    # payload whose length is 1 mod 4 is rejected here).
    if not re.fullmatch(r'[A-Za-z0-9+/]*={0,2}', actual_base64_data):
        return "Invalid base64 string"

    binary_data = base64.b64decode(actual_base64_data)

    kind = filetype.guess(binary_data)
    if kind is None:
        return "Unknown file type"
    return kind.extension


# Read the base64 sample from disk and display the type detected for it
# (the call is the cell's last expression, so its return value is the output).
with open('old/devEnv/base64/example.txt', 'r') as f:
    base64_audio = f.read()
    
identify_file_type(base64_audio)


'Unknown file type'

In [None]:
def get_audio_extension(mime_type_str):
    """Pick a file extension for a MIME type string.

    Args:
        mime_type_str: A MIME type of the form "type/subtype", optionally
            followed by ";"-separated parameters, e.g.
            'video/webm; codecs="vp9, opus"'.

    Returns:
        The extension (leading dot included). When a ``codecs=`` parameter is
        present, the extension mapped to the audio codec is returned; when
        several codecs are listed the second one is taken to be the audio
        codec. Without a codec, or for a codec missing from the table, the
        result falls back to "." + subtype.

    Raises:
        ValueError: If the part before the first ";" does not contain exactly
            one "/".
    """
    codec_to_extension_map = {
        'opus': '.opus',
        'vorbis': '.ogg',
        'mp4a.40.2': '.m4a',
        'mpeg3': '.mp3',
        'flac': '.flac',
        'alac': '.m4a',
        '1': '.wav',    # PCM codec for wav
        'samr': '.amr',
        'mp4v.20.8': '.mp4',
        'XVID': '.avi',
        'wmv3': '.wmv',
        'vp9': '.webm',
        'vp8': '.webm',
        'V_MPEG4/ISO/AVC': '.mkv',
        'theora': '.ogg',
        'mpeg1video': '.mpg',
        'A_AAC': '.aac',
        'oga': '.oga'    # Ogg Vorbis audio
    }

    # Split on semicolon to get the base mime type and its parameters
    base_mime_type, *params = mime_type_str.split(';')
    _, subtype = base_mime_type.split('/')
    subtype = subtype.strip()

    # Look at every parameter, not just the first, so a codecs entry that
    # follows another parameter (e.g. "; rate=48000; codecs=opus") is found.
    for param in params:
        if 'codecs=' not in param:
            continue
        codecs = param.split('codecs=')[1].replace('"', '').split(',')
        # if there are multiple codecs, we assume the second one is the audio codec
        audio_codec = (codecs[1] if len(codecs) > 1 else codecs[0]).strip()
        return codec_to_extension_map.get(audio_codec, f".{subtype}")

    # No codec given: guess from the subtype
    return codec_to_extension_map.get(subtype, f".{subtype}")



In [None]:
# Sample video MIME types; every entry carries a codecs parameter, most with a
# video codec followed by an audio codec.
video_mime_types = [
    'video/webm; codecs="vp9, opus"',
    'video/mp4; codecs="avc1.42E01E, mp4a.40.2"',
    'video/avc; codecs="avc1.42E01E, mp4a.40.2"',
    'video/webm; codecs="vp8, vorbis"',
    'video/mp4; codecs="hev1.1.2.L93.B0, mp4a.40.2"',
    'video/quicktime; codecs="avc1.42E01E, mp4a.40.2"',
    'video/x-matroska; codecs="V_MPEG4/ISO/AVC, A_AAC"',
    'video/ogg; codecs="theora, vorbis"',
    'video/3gpp; codecs="mp4v.20.8, samr"',
    'video/mpeg; codecs="mpeg1video"',
    'video/x-m4v; codecs="avc1.42E01E, mp4a.40.2"',
    'video/x-msvideo; codecs="XVID"',
    'video/x-ms-wmv; codecs="wmv3"',
]

# Sample audio MIME types; the entries without a codecs parameter exercise the
# subtype-only path of get_audio_extension.
# NOTE(review): camelCase here vs. snake_case for video_mime_types above.
audioMimeTypes = [
  "audio/webm; codecs=opus",          
  "audio/mp4; codecs=mp4a.40.2",      
  "audio/aac; codecs=mp4a.40.2",      
  "audio/mpeg; codecs=mpeg3",
  "audio/ogg; codecs=vorbis",
  "audio/flac",
  "audio/alac",
  "audio/wav; codecs=1", 
  "audio/amr",
]

# test every mime type and print the result
for mime_type_str in video_mime_types + audioMimeTypes:
    audio_extension = get_audio_extension(mime_type_str)
    print(f'{mime_type_str} -> {audio_extension}')

In [None]:
import pytube

# NOTE(review): rebinds the notebook-global `url`, which the translation
# request cell also assigns; re-run that cell before using `url` there.
url = 'https://www.youtube.com/watch?v=VTOO_9_ECA8'

video = pytube.YouTube(url)
# NOTE(review): `streams` is not referenced again in this cell.
streams = video.streams

# download the video
video.streams.get_highest_resolution().download()




In [None]:
import moviepy
# Input file: vid.mp4

# Keep the span from 59 s to 1:22 (= 82 s)

# load with moviepy
from moviepy.editor import VideoFileClip
clip = VideoFileClip("vid.mp4").subclip(59, 82)

# save as mp4
# NOTE(review): the clip is never closed explicitly — consider clip.close()
# once the file has been written.
clip.write_videofile("vid2.mp4")

In [None]:
# Define the magic numbers for common audio file formats: each key is a file
# type, each value a list of lowercase hex prefixes to look for at the start
# of the file.
# NOTE: "fff3" is listed under both "mp3" and "aac"; the lookup below returns
# the first matching key in insertion order, so that prefix resolves to "mp3".
magic_numbers = {
    "mp3": ["494433", "fff3", "fffb"],
    "wav": ["52494646"],
    "ogg": ["4f676753"],
    "flac": ["664c6143"],
    "aac": ["fff1", "fff9", "fff3", "fffd"],
    "wma": ["3026b275"],
    "midi": ["4d546864"],
    "aiff": ["464f524d"],
}

# Convert the first four bytes of the binary content to hexadecimal.
# NOTE(review): `binary_content` is not defined in this cell — it must already
# exist in the kernel (presumably bytes; confirm where it is created).
hex_signature = binary_content[:4].hex()

# Function to identify the file type from the magic number
def identify_file_type(hex_signature, magic_numbers):
    """Look up the file type whose magic number starts `hex_signature`.

    Args:
        hex_signature: Hex string of the leading bytes of a file.
        magic_numbers: Mapping of file type -> list of hex prefixes.

    Returns:
        The first file type (in the mapping's iteration order) that has a
        prefix matching the start of `hex_signature`, or "Unknown file type"
        when nothing matches.

    NOTE: an earlier cell defines identify_file_type(base64_string); running
    this cell rebinds the name to this two-argument version.
    """
    for file_type in magic_numbers:
        for sig in magic_numbers[file_type]:
            if hex_signature.startswith(sig):
                return file_type
    return "Unknown file type"

# Identify the file type from the signature computed above; the bare name on
# the last line makes the result the cell's displayed output.
file_type = identify_file_type(hex_signature, magic_numbers)
file_type


// Define the magic numbers for common audio file formats: each key is a file
// type, each value a list of lowercase hex prefixes expected at the start of
// the file. "fff3" appears under both mp3 and aac; the lookup below returns
// the first matching key, so that prefix resolves to mp3.
const magicNumbers: { [key: string]: string[] } = {
    mp3: ["494433", "fff3", "fffb"],
    wav: ["52494646"],
    ogg: ["4f676753"],
    flac: ["664c6143"],
    aac: ["fff1", "fff9", "fff3", "fffd"],
    wma: ["3026b275"],
    midi: ["4d546864"],
    aiff: ["464f524d"],
};

// Assume binaryContent is a Node.js Buffer: toString('hex') only yields hex
// for a Buffer. A plain ArrayBuffer must be wrapped first, e.g.
// Buffer.from(arrayBuffer).
// Convert the first four bytes of the binary content to hexadecimal
const hexSignature = binaryContent.slice(0, 4).toString('hex');

// Function to identify the file type from the magic number
/**
 * Returns the first file type (in the table's key order) that lists a magic
 * number matching the start of `hexSignature`, or "Unknown file type" when
 * none matches.
 */
function identifyFileType(hexSignature: string, magicNumbers: { [key: string]: string[] }): string {
    for (const fileType of Object.keys(magicNumbers)) {
        for (const sig of magicNumbers[fileType]) {
            if (hexSignature.startsWith(sig)) {
                return fileType;
            }
        }
    }
    return "Unknown file type";
}

// Identify the file type from the signature computed above and log it
const fileType = identifyFileType(hexSignature, magicNumbers);
console.log(fileType);


In [None]:
pip install filetype

In [14]:
from typing import List, Tuple
from itertools import groupby

def find_buffer_sizes(sample_rate: int, max_chunk_duration: int = 10, max_exponent: int = 14) -> List[Tuple[int, List[int]]]:
    """List the power-of-two buffer sizes that evenly divide each chunk length.

    For every whole chunk duration from 1 to `max_chunk_duration`, the chunk
    holds ``sample_rate * duration`` samples; a buffer size 2**n qualifies
    when it divides that sample count without remainder.

    Args:
        sample_rate: Number of samples per unit of chunk duration.
        max_chunk_duration: Largest chunk duration to consider (inclusive).
        max_exponent: Largest exponent n tried for the buffer size 2**n
            (inclusive); sizes start at 2**1.

    Returns:
        A list of ``(chunk_duration, [buffer_size, ...])`` tuples in ascending
        duration order, sizes ascending. Durations for which no size
        qualifies are omitted.
    """
    candidate_sizes = [2**n for n in range(1, max_exponent + 1)]
    solutions = []

    # Iterate through the possible chunk durations
    for chunk_duration in range(1, max_chunk_duration + 1):
        required_samples = sample_rate * chunk_duration

        # Keep the buffer sizes that evenly divide the required samples
        sizes = [size for size in candidate_sizes if required_samples % size == 0]
        if sizes:
            solutions.append((chunk_duration, sizes))

    return solutions

# Sample rate under test (256,000 here — presumably samples per second)
sample_rate = 256000

# Finding possible solutions for chunk durations 1 through 20
solutions = find_buffer_sizes(sample_rate, max_chunk_duration=20)
solutions


[(1, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (2, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]),
 (3, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (4, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]),
 (5, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (6, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]),
 (7, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (8, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]),
 (9, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (10, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]),
 (11, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (12, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]),
 (13, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (14, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]),
 (15, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]),
 (16, [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]),
 (17, [2, 4

In [18]:
# Presumably the duration in seconds of an 8192-sample buffer at 48 kHz — confirm intent
8192/48_000

0.17066666666666666