In [5]:
import sys
import os
from openai import OpenAI

# Go to the parent of the parent directory
grandparent_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(grandparent_dir)

# Now you can import your config
from config import api_key

client = OpenAI(api_key=api_key)

In [100]:
import openai
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.document_loaders import CSVLoader

import pandas as pd
from tabulate import tabulate
import os

In [20]:
import os

os.environ["OPENAI_API_KEY"] = api_key

In [21]:
loader = CSVLoader(file_path="dataset.csv")
data = loader.load()

In [31]:
data[0].page_content.split("\n")

['User ID: 1',
 'Age: 56.0',
 'Gender: Female',
 'Country: USA',
 'Purchase Amount: 331.79',
 'Purchase Date: 2021-11-21',
 'Product Category: Sports']

In [30]:
def parse_page_content(content):
    fields = content.split("\n")
    return {field.split(': ')[0]: field.split(": ")[1] for field in fields if field}

In [35]:
parsed_data = [parse_page_content(document.page_content) for document in data]

In [38]:
df = pd.DataFrame(parsed_data)
df.head()

Unnamed: 0,User ID,Age,Gender,Country,Purchase Amount,Purchase Date,Product Category
0,1,56.0,Female,USA,331.79,2021-11-21,Sports
1,2,69.0,Male,Australia,335.72,2022-03-05,Home & Kitchen
2,3,46.0,,Germany,493.18,,Books
3,4,32.0,Male,Germany,80.97,2023-06-08,Sports
4,5,60.0,Female,UK,21.75,,Beauty


In [82]:
df = df[df["Age"]!=""]
average_age = df["Age"].astype(float).mean()

# most popular product category
popular_category = df["Product Category"].value_counts().idxmax()

# gender distribution 
gender_distribution = df["Gender"].value_counts()
print(tabulate(gender_distribution.items(), headers=["Gender","Count"], tablefmt="grid"))

+----------+---------+
| Gender   |   Count |
| Other    |    4296 |
+----------+---------+
| Female   |    4293 |
+----------+---------+
| Male     |    4227 |
+----------+---------+
|          |     684 |
+----------+---------+


In [109]:
chat_template = ChatPromptTemplate.from_messages(
        [SystemMessage(content=("You are and expert data analysis assistent")),
         HumanMessagePromptTemplate.from_template(
             """
             I have a dataset of customer purchaes with the following characteristics:
             - Average age of customers {average_age}
             - Gender distribution: {gender_distribution}

             Based on this information, can you provide insights into the potential marketing strategies and product recommendation
             """
         )
        ])

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
output = llm.invoke(chat_template.format_messages(average_age=average_age, gender_distribution=gender_distribution))
print(output.content)

Based on the dataset characteristics you've provided, here are some insights and potential marketing strategies and product recommendations:

### Customer Demographics Insights

1. **Average Age**: The average age of customers is approximately 43.4 years. This suggests that your target audience is likely to be middle-aged adults. Marketing strategies should consider the preferences and needs of this age group.

2. **Gender Distribution**: The gender distribution is relatively balanced, with a slight skew towards "Other" and "Female" customers. This indicates that marketing strategies should be inclusive and cater to diverse gender identities.

### Marketing Strategies

1. **Targeted Advertising**:
   - **Age-Specific Campaigns**: Create campaigns that resonate with middle-aged adults. Highlight products that cater to their lifestyle, such as health and wellness products, financial planning services, or travel experiences.
   - **Gender-Inclusive Messaging**: Develop marketing materials

In [117]:
import openai
import pandas as pd
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SequentialChain
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown
import os

In [118]:
os.environ["OPENAI_API_KEY"] = api_key

In [127]:
try:
    data = pd.read_csv("dataset.csv")
except Exception as e:
    print(f"Error loading CSV file: {e}")
    raise # Raise for furhter handling

In [131]:
df = df[:100]
df.head()


Unnamed: 0,User ID,Age,Gender,Country,Purchase Amount,Purchase Date,Product Category
0,1,56.0,Female,USA,331.79,2021-11-21,Sports
1,2,69.0,Male,Australia,335.72,2022-03-05,Home & Kitchen
2,3,46.0,,Germany,493.18,,Books
3,4,32.0,Male,Germany,80.97,2023-06-08,Sports
4,5,60.0,Female,UK,21.75,,Beauty


In [149]:
from langchain.schema.output_parser import StrOutputParser

# Create an OpenAI chat LLM
llm = ChatOpenAI(model="gpt-4o-mini", api_key=api_key)

analysis_template = """
Analyze the following data and extract key insight
Data:
{content_data}

Key insight:
"""

analysis_prompt_template = PromptTemplate(input_variables=["content_data"], template=analysis_template)
lmm_chain = analysis_prompt_template | llm | StrOutputParser()

response = lmm_chain.invoke({"content_data": data})
print(response)

Based on the provided data from 15,000 user transactions, here are several key insights:

1. **Demographics**:
   - The dataset includes users of various ages, ranging from as young as 21 to 69 years old. 
   - There is a diversity in gender representation with Female, Male, and Other categories. However, a notable number of users do not have specified gender, which could indicate a gap in data collection.
   - The geographic distribution indicates users from multiple countries including the USA, UK, Canada, Germany, and Australia, which may highlight different regional market behaviors.

2. **Purchase Behavior**:
   - The **average Purchase Amount** can be determined for the different cohorts, which will help in understanding spending patterns. However, note that some entries may have missing values in the 'Purchase Amount' or 'Purchase Date' columns that could skew results.
   - NA values in 'Purchase Amount' (specifically for Users 3, 4, and 5) and 'Purchase Date' indicate opportuni

In [153]:
summary_template = """
Generate a report based on these key insights

Key Insights: {insights} 

Summary: report"""
summary_prompt_template = PromptTemplate(
    input_variables=['insights'], 
    template=summary_template)

seq_chain = ({"insights": analysis_prompt_template | llm | StrOutputParser()}
             | summary_prompt_template
             | llm
             | StrOutputParser())

print(seq_chain.invoke({"content_data": data}))

### User Purchase Data Analysis Report

#### Executive Summary
This report presents key insights derived from a dataset containing 15,000 records of user purchases. The analysis highlights demographic trends, purchasing behavior, product preferences, and potential areas for strategic improvements in marketing and inventory management. 

#### Key Insights

1. **Age Distribution**:
   - The dataset showcases a wide age range among users, from individuals in their 20s to seniors in their 60s and above. The majority of users are categorized as young adults to seniors, indicating a diverse customer base that could be targeted with varied marketing strategies tailored to each age group.

2. **Gender Representation**:
   - The dataset includes users identifying as female, male, and 'Other'. Analyzing purchasing behavior by gender could uncover significant trends in product preference and spending, which may inform tailored marketing campaigns aimed at increasing engagement across different ge