In [3]:
%pip install --upgrade tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [5]:
num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [6]:
import os
import pandas as pd
from datetime import datetime

# Get the file names of all txt files in the 'data_ingested' directory
file_names = [f for f in os.listdir('data_ingested') if f.endswith('.txt')]

# Initialize an empty list to store the DataFrame generated at each iteration step
dfs = []

# Process each file one by one
for file_name in file_names:
    # Parse the file name
    parts = file_name.split('_')
    file_type = parts[0]
    
    # Handle the file type
    if file_type == 'doc':
        main_chapter = parts[1]
        lesson_title = parts[2].split('.')[0]
    else:
        lesson_title = parts[1][8:].replace('-', ' ').title()
    
    # Read the file content
    with open(os.path.join('data_ingested', file_name), 'r') as file:
        content = file.read()

    token_count = num_tokens_from_string(content, "cl100k_base")
    
    # Create a temporary DataFrame
    temp_df = pd.DataFrame({
                            'Level': ['Foundation'],
                            'Main Chapter': [main_chapter if file_type == 'doc' else None],
                            'Lesson Title': [lesson_title],
                            'Source Type': [file_type],
                            'Source URL': None,
                            'Content': [content],
                            'Token Count': [token_count],
                            'Access': ['public'],
                            'Last Updated': [datetime.now()],
                            'Last Index Time': [None],
                            })
    
    # Add the temporary DataFrame to the list
    dfs.append(temp_df)


# Concatenate all temporary DataFrames into one DataFrame
df = pd.concat(dfs, ignore_index=True)

df.to_csv('devElearning_Foundation.csv', index=False)


In [7]:
df

Unnamed: 0,Level,Main Chapter,Lesson Title,Source Type,Source URL,Content,Token Count,Access,Last Updated,Last Index Time
0,Foundation,,Referencing Media Library Files Within Applic...,transcription,,hello and welcome to the sea learning video th...,557,public,2024-05-16 14:34:35.161563,
1,Foundation,,Diagram User Group Role Relationship.Txt,text,,\n\n,1,public,2024-05-16 14:34:35.163977,
2,Foundation,,Introduction Presentation.Txt,transcription,,Neptune software currently supports two kinds ...,543,public,2024-05-16 14:34:35.166483,
3,Foundation,,Exercise Generate A Connector.Txt,text,,\nExercise:Generate a connector based on a tab...,48,public,2024-05-16 14:34:35.169167,
4,Foundation,,App Builder Introduction.Txt,text,,\nWelcome to the App Builder - this is a uniqu...,212,public,2024-05-16 14:34:35.170814,
...,...,...,...,...,...,...,...,...,...,...
169,Foundation,,Understanding The Different Application Types...,text,,"\nApplication, Adaptive Template, Building Blo...",278,public,2024-05-16 14:34:35.296408,
170,Foundation,,First Look How To Navigate The Developer Cock...,text,,\n\nThere is a search bar at the top of the co...,321,public,2024-05-16 14:34:35.296778,
171,Foundation,,Responsive Design Principles.Txt,text,,\nFoundation eLearning – Responsive Design Pri...,529,public,2024-05-16 14:34:35.297261,
172,Foundation,,Headers And Footers Bar Bar Content.Txt,text,,\nThere are pre-made building blocks for Heade...,112,public,2024-05-16 14:34:35.297557,
