In [22]:
# Minimal Chat Completions example: replay a short conversation and ask a
# follow-up question.
from openai import OpenAI
client = OpenAI()

# The messages list carries the whole conversation so far: a system
# instruction, one earlier user/assistant exchange, and the new user question.
# The final question ("Where was it played?") only makes sense together with
# the earlier turns, which is why they are sent along with it.
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)

# Print the text of the first choice in the response.
print(response.choices[0].message.content)

XXX
The 2020 World Series was played at Globe Life Field in Arlington, Texas. This was due to the COVID-19 pandemic, which led to a neutral-site format for the series.


In [37]:
import csv
import os
import re
from urllib.parse import urlparse

from openai import OpenAI

# Initialize the OpenAI client used by get_resource_description.
# NOTE(review): no API key is passed here, so credentials are presumably
# picked up from the environment — confirm before running.
client = OpenAI()


def extract_url_parts(url):
    """Break a URL into word-like pieces suitable for use as prompt hints.

    The host name is split into a subdomain and a two-label domain, and every
    word-like run found in the path, query string and fragment is collected.

    Only the host name is taken from the authority section of the URL, so a
    port number or embedded ``user:password@`` credentials never appear in the
    result.

    Args:
        url: The URL to decompose. ``None`` and the empty string are accepted
            and yield no keywords.

    Returns:
        The string ``' url keywords: '`` followed by the non-empty pieces
        joined with ``', '``. When no pieces are found only the prefix is
        returned.
    """
    parsed_url = urlparse(url or '')

    # hostname (unlike netloc) excludes userinfo and port, and is lower-cased.
    host = parsed_url.hostname or ''
    labels = host.split('.')
    if len(labels) > 2:
        # Heuristic: the last two labels are the domain, the rest the subdomain.
        # Multi-part public suffixes (e.g. "example.co.uk") are therefore split
        # as subdomain "example" / domain "co.uk".
        subdomain = '.'.join(labels[:-2])
        domain = '.'.join(labels[-2:])
    else:
        subdomain = ''
        domain = host

    # Word-like runs from the path, the query string and the fragment.
    word_pattern = r'\b\w+\b'
    words_in_path = re.findall(word_pattern, parsed_url.path.strip('/'))
    query_words = re.findall(word_pattern, parsed_url.query)
    fragment_words = re.findall(word_pattern, parsed_url.fragment)

    # Combine everything, dropping empty entries (e.g. a missing subdomain).
    candidates = [subdomain, domain] + words_in_path + query_words + fragment_words
    parts_list = [part for part in candidates if part]
    return ' url keywords: ' + ', '.join(parts_list)


def classify_media_type(url):
    """Guess the kind of resource a URL points to from its host and path.

    File types are recognised by the extension at the end of the URL path and
    sites by the host name (the site itself or any of its subdomains). Looking
    only at those components avoids false positives from substrings elsewhere
    in the URL, e.g. ``www.aviation.com`` being reported as a video because it
    contains ``.avi``. Extensions that appear only in the query string are not
    considered.

    Args:
        url: The URL to classify; matching is case-insensitive. ``None`` and
            the empty string are accepted and classified as ``'Other'``.

    Returns:
        One of ``'PDF'``, ``'YouTube Video'``, ``'Image'``, ``'Video'``,
        ``'Wikipedia Article'``, ``'Slide Presentation'``, ``'Web Page'`` or
        ``'Other'``. The checks are applied in that order and the first match
        wins.
    """
    # Lowercase the URL to make checking easier
    url = (url or '').lower()

    parsed = urlparse(url)
    if not parsed.scheme and not parsed.netloc:
        # Scheme-less input such as "youtube.com/watch?v=..." is parsed with the
        # host inside the path; re-parse it as a network-path reference so the
        # host is recognised.
        try:
            parsed = urlparse('//' + url)
        except ValueError:
            pass  # keep the first parse if the text is not a valid authority
    host = parsed.hostname or ''
    path = parsed.path

    def on_site(site):
        # True for the site itself and for any of its subdomains.
        return host == site or host.endswith('.' + site)

    # Check for common media types
    if path.endswith('.pdf'):
        return 'PDF'
    if on_site('youtube.com') or on_site('youtu.be'):
        return 'YouTube Video'
    if path.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return 'Image'
    if path.endswith(('.mp4', '.mov', '.wmv', '.avi')):
        return 'Video'
    if on_site('wikipedia.org'):
        return 'Wikipedia Article'
    if on_site('slideshare.net'):
        return 'Slide Presentation'
    if path.endswith(('.html', '.htm', '/')):
        return 'Web Page'
    return 'Other'

def get_resource_description(title, url, model="gpt-4o"):
    """Ask the chat model for a description and keyword list for a resource.

    The URL itself is not placed in the prompt; only the keywords derived from
    it by ``extract_url_parts`` are, as extra hints about the resource.

    Args:
        title: Title of the resource, interpolated into the prompt.
        url: URL of the resource, used only to derive hint keywords.
        model: Name of the chat model to query. Defaults to ``"gpt-4o"``.

    Returns:
        The message content of the first choice returned by the API. The
        prompt asks for a paragraph introduced by ``**Description**`` followed
        by a comma-separated list introduced by ``**Keywords:**``; the reply
        is returned as-is, without checking that it follows that layout.
    """
    # Extract parts of the URL
    url_parts = extract_url_parts(url)

    # Create the prompt with the URL part
    prompt = f"""
    You are compiling a list of resources for college faculty and staff on incorporating AI in the classroom or in their workplace.  Write a paragraph describing this resource related to AI, and why it may be helpful to their work. The title of the resource is: {title}. Begin the description with **Description**.  Do not put the url in the description, but here are parts of the url that may provide helpful information about the resource: {url_parts}.  Also, provide several keywords users can use to quickly see what it is and what it offers.
     Begin the list of keywords with **Keywords:**.  Make it a comma-separated list and format items using title case, where the first letter of each word is capitalized. """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in AI and education resources."},
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content


def process_tsv(input_file_path, output_file_path):
    """Copy a TSV file, adding a generated description and media type to each row.

    Every input row is written to the output with two extra columns,
    ``'Description'`` (from ``get_resource_description``) and ``'Media Type'``
    (from ``classify_media_type``). If the input already has a column with
    either name, that column is overwritten rather than duplicated.

    Args:
        input_file_path: Path of the tab-separated input file. Its header row
            must contain ``'Title'`` and ``'URL'`` columns.
        output_file_path: Path of the tab-separated file to write. An existing
            file at this path is overwritten.

    Raises:
        KeyError: If the input header lacks a ``'Title'`` or ``'URL'`` column
            (this includes a completely empty input file). The check runs
            before the output file is opened, so nothing is written or
            truncated in that case.
    """
    required_columns = ('Title', 'URL')
    added_columns = ('Description', 'Media Type')

    with open(input_file_path, mode='r', newline='', encoding='utf-8') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        # fieldnames is None when the file has no header row at all.
        input_columns = list(reader.fieldnames or [])

        missing = [name for name in required_columns if name not in input_columns]
        if missing:
            raise KeyError(
                f"{input_file_path}: missing required column(s): {', '.join(missing)}"
            )

        # Append only the columns the input does not already have, so the
        # output header never contains the same name twice.
        fieldnames = input_columns + [
            name for name in added_columns if name not in input_columns
        ]

        with open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()

            for row in reader:
                # DictReader fills the cells of a short row with None;
                # pass empty strings on instead.
                title = row['Title'] or ''
                url = row['URL'] or ''
                row['Description'] = get_resource_description(title, url)
                row['Media Type'] = classify_media_type(url)
                writer.writerow(row)

# Replace 'in_file.tsv' with the path to your input TSV file and 'out_file.tsv' with the desired output path.
# The input needs 'Title' and 'URL' columns; each data row triggers one chat-completion request.
process_tsv('in_file.tsv', 'out_file.tsv')
