# Useful snippets from other notebooks

## Printing enhancement

In [4]:
import pprint

# Compare the default list repr with pprint's wrapped, compact layout.
# See docs at https://docs.python.org/3/library/pprint.html
results = ['spam', 'eggs', 'lumberjack', 'knights', 'ni']
print(results)

# width=20 forces wrapping; compact=True packs as many items as fit on each line.
narrow_printer = pprint.PrettyPrinter(width=20, compact=True)
narrow_printer.pprint(results)

['spam', 'eggs', 'lumberjack', 'knights', 'ni']
['spam', 'eggs',
 'lumberjack',
 'knights', 'ni']


## Restart the kernel after installing packages

In [5]:
from IPython.display import display_html


def restartkernel():
    """Ask the notebook front end to restart the kernel.

    Emits a raw HTML ``<script>`` snippet whose JavaScript calls
    ``Jupyter.notebook.kernel.restart()`` in the browser.

    NOTE(review): this relies on the front end exposing a ``Jupyter``
    JavaScript global -- presumably only the classic Notebook UI does;
    confirm before relying on it elsewhere.
    """
    restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
    # raw=True: the string is sent as HTML markup instead of being formatted as a Python object.
    display_html(restart_script, raw=True)


# Trigger the restart as soon as this cell runs.
restartkernel()

## Setting the OpenAI API key if it is not present in an environment variable

In [9]:
!pip install -q -U openai

In [12]:
import openai
import os
import getpass

# Configuring the OpenAI API key.
# Prompt only when OPENAI_API_KEY is missing *or empty*; an empty variable is
# as unusable as an absent one.
if not os.environ.get("OPENAI_API_KEY"):
    # getpass keeps the key out of the notebook source and the cell output.
    openai.api_key = getpass.getpass("OpenAI API Key:")

# Listing the models is an authenticated call, so it raises if the key is not valid.
# openai>=1.0 removed `openai.Model`; the install cell upgrades to the latest
# release, so support both API generations.
if hasattr(openai, "OpenAI"):
    available_models = openai.models.list().data  # openai >= 1.0
else:
    available_models = openai.Model.list()["data"]  # openai < 1.0

assert len(available_models) > 0, "The OpenAI API returned no models for this key"

OpenAI API Key: ········


## Testing if you have a CUDA device set up with PyTorch

In [16]:
import torch

In [17]:
# Device selection adapted from the PyTorch docs; also covers Mac GPU (MPS) training.
# Preference order: CUDA GPU, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")


Using cuda device


## To have some 'useful' raw data for RAG stuff, I extracted my own bookmarks

In [19]:
import json
import os

def extract_chrome_bookmarks(
    path_to_bookmarks="/mnt/c/Users/nissa/AppData/Local/Google/Chrome/User Data/Default/Bookmarks",
):
    """Read a Chrome ``Bookmarks`` JSON file and return its bookmarks as a flat list.

    Args:
        path_to_bookmarks: Path to the Chrome bookmarks file. The default is the
            original author's Windows profile as seen from WSL; pass your own
            path for another user or operating system.

    Returns:
        A list of ``(name, url)`` tuples taken from the ``bookmark_bar`` root
        followed by the ``other`` root, in file order with folders flattened.
        An empty list (after printing a hint) when the file does not exist.
    """
    # isfile (not exists): a directory at this path could not be opened as JSON anyway.
    if not os.path.isfile(path_to_bookmarks):
        print("The bookmarks file was not found at the specified location. Ensure Chrome is installed and you have bookmarks saved.")
        return []

    # Load the JSON data
    with open(path_to_bookmarks, 'r', encoding='utf-8') as file:
        bookmarks_data = json.load(file)

    def extract_from_node(node):
        """Return the (name, url) pairs found in ``node`` and all of its descendants."""
        if 'url' in node:
            # A node with a 'url' key is a bookmark; tolerate one without a name.
            return [(node.get('name', ''), node['url'])]
        # Otherwise treat it as a folder and walk its children, if it has any.
        urls = []
        for child in node.get('children', []):
            urls.extend(extract_from_node(child))
        return urls

    # Walk the same two roots as before, but skip a root that is absent
    # instead of raising KeyError.
    roots = bookmarks_data.get('roots', {})
    bookmarks = []
    for root_name in ('bookmark_bar', 'other'):
        root = roots.get(root_name)
        if isinstance(root, dict):
            bookmarks.extend(extract_from_node(root))

    return bookmarks

# Collect the (name, url) tuples returned by extract_chrome_bookmarks().
bookmarks_list = extract_chrome_bookmarks()
# Uncomment the loop below to check that your bookmarks were imported as you expect:
# for title, url in bookmarks_list:
#     print(f"{title}: {url}")

## Importing data from Kaggle

> Using [Jeremy Howard's notebook](https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners/) as a learning sample starting point for my modified settings

In [1]:
!pip install kaggle -q

In [2]:
# for working with paths in Python, Jeremy Howard recommends using `pathlib.Path`
from pathlib import Path

# Location of the Kaggle API credentials file inside the user's home directory.
# If you don't have a kaggle.json yet, read about it at https://www.kaggle.com/docs/api
cred_path = Path('~/.kaggle/kaggle.json').expanduser()

In [3]:
# Kaggle competition to download; change this to match your competition's name.
competition_name='us-patent-phrase-to-phrase-matching'

In [4]:
# Folder under data/ named after the competition; the downloaded archive is extracted into it below.
path = Path(f'data/{competition_name}') 

In [5]:
import subprocess
import os

def download_competition_data(competition_name, data_dir='data'):
    """Download a Kaggle competition's data archive with the Kaggle CLI.

    Args:
        competition_name: Competition slug passed to ``kaggle competitions download -c``.
        data_dir: Folder the archive is downloaded into (created if missing).
            Defaults to ``'data'``.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` command exits with a
            non-zero status (for example when the competition rules have not
            been accepted), instead of failing silently.
        FileNotFoundError: If the ``kaggle`` executable is not on PATH.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(data_dir, exist_ok=True)

    # Pass an argument list with no shell: the competition name is handed to the
    # CLI verbatim, so spaces or shell metacharacters in it cannot alter the command.
    cmd = ["kaggle", "competitions", "download", "-c", str(competition_name), "-p", str(data_dir)]
    subprocess.run(cmd, check=True)

In [6]:
import os
# KAGGLE_KERNEL_RUN_TYPE is read to check whether this notebook is running on Kaggle.
# iskaggle is '' (falsy) when the variable is not set.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [7]:
# Note: if this fails, make sure to visit the competition page and accept the rules so you can download the data
# Only download when running outside Kaggle and the data folder does not exist yet.
if not iskaggle and not path.exists():
    import zipfile
    # NOTE(review): kaggle is not referenced below; presumably imported so a missing
    # package or credentials file fails here, before the download -- confirm.
    import kaggle

    # Downloads the archive for `competition_name` (set in an earlier cell) into data/.
    download_competition_data(competition_name)

    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [8]:
!ls {path}

sample_submission.csv  test.csv  train.csv
