In [None]:
!pip install openai==0.28.1
!pip install fpdf==1.7.2

Collecting openai==0.28.1
  Downloading openai-0.28.1-py3-none-any.whl.metadata (11 kB)
Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/77.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-0.28.1
Collecting fpdf==1.7.2
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-

In [None]:
import pandas as pd
import openai
import time
from fpdf import FPDF

openai.api_key = ''

## Load the dataset

In [None]:
df = pd.read_csv('')

In [None]:
df.sample(5)

Unnamed: 0,Freight ID,Origin,Destination,Carrier,Freight Rate (USD),Departure Time,Transit Time (Hours),Available Seats,Freight Type
20,F021,Philadelphia,Miami,Quick Logistics,920,07:30,38,5,Refrigerated
6,F007,Boston,Nashville,GoLogistics,800,11:10,24,9,Standard
29,F030,New York,San Jose,Quick Logistics,1000,09:00,50,3,Standard
1,F002,Miami,Chicago,Speedy Ship,700,09:30,36,8,Standard
10,F011,Dallas,Miami,Speedy Ship,890,07:00,40,6,Refrigerated


## Understanding the Dataset

In [None]:
# Get Column Names and Data Types
column_info = df.dtypes
print(column_info, "\n")

Freight ID              object
Origin                  object
Destination             object
Carrier                 object
Freight Rate (USD)       int64
Departure Time          object
Transit Time (Hours)     int64
Available Seats          int64
Freight Type            object
dtype: object 



In [None]:
# Identify Categorical and Numerical Features
categorical_features = df.select_dtypes(include=['object']).columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

Categorical Features: Index(['Freight ID', 'Origin', 'Destination', 'Carrier', 'Departure Time',
       'Freight Type'],
      dtype='object')
Numerical Features: Index(['Freight Rate (USD)', 'Transit Time (Hours)', 'Available Seats'], dtype='object')


In [None]:
# Get Unique Values for Categorical Features with Limitation
def get_unique_values(col, max_display=10):
    uniques = df[col].unique().tolist() # get unique values and turn them into lists
    if len(uniques) > max_display: # if there are more than 10 unique values
        return uniques[:max_display] + ["..."]
    return uniques

unique_values = {col: get_unique_values(col) for col in categorical_features}

print("Unique Values for Categorical Features:")
for col, uniques in unique_values.items():
    print(f" - {col}: {uniques}")
print()

Unique Values for Categorical Features:
 - Freight ID: ['F001', 'F002', 'F003', 'F004', 'F005', 'F006', 'F007', 'F008', 'F009', 'F010', '...']
 - Origin: ['Los Angeles', 'Miami', 'Seattle', 'San Diego', 'Denver', 'Atlanta', 'Boston', 'Detroit', 'Phoenix', 'Portland', '...']
 - Destination: ['New York', 'Chicago', 'Dallas', 'Houston', 'Philadelphia', 'Orlando', 'Nashville', 'Kansas City', 'San Francisco', 'Las Vegas', '...']
 - Carrier: ['Global Freight', 'Speedy Ship', 'Quick Logistics', 'CargoLink', 'FastTrack', 'MoveIT', 'GoLogistics', 'SkyCargo', 'CargoPlus']
 - Departure Time: ['08:00', '09:30', '10:45', '13:20', '07:15', '06:00', '11:10', '15:30', '14:45', '05:30', '...']
 - Freight Type: ['Standard', 'Express', 'Refrigerated']



In [None]:
# Get Descriptive Statistics for Numerical Features
numerical_stats = df[numerical_features].describe()
print("Descriptive Statistics for Numerical Features:\n", numerical_stats, "\n")

Descriptive Statistics for Numerical Features:
        Freight Rate (USD)  Transit Time (Hours)  Available Seats
count            30.00000             30.000000         30.00000
mean            767.00000             28.400000          6.80000
std             136.38561             15.388307          2.79655
min             480.00000              4.000000          3.00000
25%             655.00000             13.000000          5.00000
50%             795.00000             31.000000          6.00000
75%             875.00000             41.500000          8.00000
max            1000.00000             50.000000         15.00000 



## Generating the Template

In [14]:
def generate_template(df,
                      column_info,
                      categorical_features,
                      numerical_features,
                      unique_values,
                      numerical_stats):

    # Construct a summary of the dataframe's structure
    column_summary = "Column Names and Data Types:\n"
    for col, dtype in column_info.items():
        column_summary += f" - {col}: {dtype}\n"

    # Unique values for categorical features
    unique_values_str = "Unique Values for Categorical Features:\n"
    for col, uniques in unique_values.items():
        unique_values_str += f" - {col}: {uniques}\n"

    # Descriptive statistics for numerical features
    numerical_stats_str = "Descriptive Statistics for Numerical Features:\n"
    for col in numerical_features:
        numerical_stats_str += f" - {col}:\n"
        for stat_name, value in numerical_stats[col].items():
            numerical_stats_str += f"   {stat_name}: {value}\n"

    # Define the system prompt
    system_prompt = """
    You are an intelligent assistant that creates descriptive templates for transforming dataframe rows into coherent paragraphs.
    Analyze the provided dataframe structure and generate a template sentence that includes placeholders for each column.
    Ensure the template is contextually relevant and maintains grammatical correctness."""

    # Define the user prompt
    user_prompt = f"""
    Analyze the following dataframe structure and create a descriptive template with placeholders for each column.

    <column_summary>
    {column_summary}
    </column_summary>

    <unique_values>
    {unique_values_str}
    </unique_values>

    <numerical_stats>
    {numerical_stats_str}
    </numerical_stats>

    Use the exact column names from the column_summary in generating the variable names in the template,
    as they will be populated with the actual values in the dataset.

    Example Template about a Spotify dataset:
    "{{artist}} gained {{streams}} streams in the song '{{song}}' that was a hit in {{date}}."

    Output only the template without any explanation or introduction.
    The template's variables will be dynamically replaced so make sure they're formatted properly."""

    retries = 3
    for attempt in range(retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                temperature=0.3,
                max_tokens=1024,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
            )
            template = response['choices'][0]['message']['content'].strip()
            return template
        except Exception as e:
            print(f"Error generating template (Attempt {attempt + 1}/{retries}): {e}")
            time.sleep(2)  # Wait before retrying

    return None

In [15]:
# Generate the template
template = generate_template(df,
                             column_info,
                             categorical_features,
                             numerical_features,
                             unique_values,
                             numerical_stats)
print(template)

"The freight shipment with ID '{Freight ID}' is scheduled to depart from '{Origin}' to '{Destination}' via '{Carrier}' at '{Departure Time}'. The freight rate is set at ${Freight Rate (USD)} and it is expected to take approximately {Transit Time (Hours)} hours. Currently, there are {Available Seats} available for this '{Freight Type}' shipment."


## Populating the Template with Actual Values

In [25]:
def populate_template(template, row):

    # Convert row to dictionary and replace NaN with 'N/A'
    row_dict = row.to_dict()
    for key, value in row_dict.items():
        if pd.isna(value):
            row_dict[key] = 'N/A'

    # Generate the populated template per row
    paragraph = template.format(**row_dict)

    return paragraph

In [26]:

# Apply the populate_template function to each row in the dataframe
df['paragraph'] = df.apply(lambda row: populate_template(template, row), axis=1)

In [27]:
df['paragraph'][0]

'"The freight shipment with ID \'F001\' is scheduled to depart from \'Los Angeles\' to \'New York\' via \'Global Freight\' at \'08:00\'. The freight rate is set at $950 and it is expected to take approximately 48 hours. Currently, there are 5 available for this \'Standard\' shipment."'

## Saving the Populated Templates (Natural Language Texts)

In [19]:
def save_content_to_txt(paragraphs, txt_filename):
    try:
        with open(txt_filename, 'w', encoding='utf-8') as file:

            for para in paragraphs:
                file.write(para + '\n\n')  # Double newline for separation

        print(f"All content has been saved as '{txt_filename}'.\n")
    except Exception as e:
        print(f"Error saving content to text file: {e}")

In [20]:
def save_content_to_pdf(paragraphs, pdf_filename):
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)

        # Add each paragraph
        for para in paragraphs:
            pdf.multi_cell(0, 10, para)
            pdf.ln()  # Add a line break between paragraphs

        pdf.output(pdf_filename)
        print(f"All content has been saved as '{pdf_filename}'.\n")
    except Exception as e:
        print(f"Error saving content to PDF: {e}")


In [21]:
# Save the paragraphs to both text and PDF files
save_content_to_txt(df['paragraph'].to_list(),
                    txt_filename='nlg.txt')

#save_content_to_pdf(df['paragraph'].to_list(),
#                    pdf_filename='nlg.pdf')



All content has been saved as 'nlg.txt'.

