This notebook is heavily inspired by [link](https://www.youtube.com/watch?v=fCh7PKR5WqU&ab_channel=Rabbitmetrics)

Link to the data: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/

The idea is the following: let's assume that I am launching a new product. From a marketing perspective, one of the best strategies could be:
- find similar products on Amazon
- filter the reviews and pick the ones with 4+ stars
- use the wording that people used in those reviews to describe your own product

In this way I am using the words and tone of those reviews in order to speak with customers in their own language.

In [None]:
import os
import json
import gzip
import pandas as pd
import seaborn as sns

In [None]:
from dotenv import load_dotenv, find_dotenv
# Find a .env file and load its variables into the process environment so
# that later cells can read settings such as API keys through os.getenv.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [None]:
# Extract data from files
def load_json_lines(path):
    """Read a gzip-compressed JSON-lines file into a list of parsed records.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the gzip file; each non-blank line holds one JSON document.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Text mode with an explicit encoding keeps decoding independent of the
    # platform default.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            # A stray blank line (e.g. at the end of the file) is skipped
            # rather than handed to json.loads, which would raise on it.
            if line:
                records.append(json.loads(line))
    return records


# Reviews and product metadata live in two separate files of the same format.
data = load_json_lines('./data/AMAZON_FASHION.json.gz')
metadata = load_json_lines('./data/meta_AMAZON_FASHION.json.gz')

In [None]:
# Reviews: build the frame and keep only the rows that have review text.
df = pd.DataFrame(data).dropna(subset=['reviewText'])
display(df.head(3))

# Product metadata: one row per record read from the metadata file.
df_meta = pd.DataFrame(metadata)
display(df_meta.head(3))

In [None]:
# Character-length extremes of the review text: shortest first, then longest.
review_lengths = df['reviewText'].str.len()
print(review_lengths.min())
print(review_lengths.max())

In [None]:
# Distribution of review lengths. Most reviews are shorter than 500
# characters, so 500 is the cut-off used when the reviews are truncated.
string_lengths = df['reviewText'].str.len()
sns.histplot(data=string_lengths, kde=False)

In [None]:
max_review_len = 500  # default cut-off, in characters


def truncate_review(text, max_len=None):
    """Return ``text`` shortened to at most ``max_len`` characters.

    Parameters
    ----------
    text : str
        The review text to shorten.
    max_len : int, optional
        Maximum number of characters to keep. When omitted, the current value
        of the module-level ``max_review_len`` is used, so changing that
        constant and re-running callers is enough to change the cut-off.

    Returns
    -------
    str
        The first ``max_len`` characters of ``text``; the whole string when it
        is already short enough.
    """
    if max_len is None:
        # Resolve the default at call time, not at definition time.
        max_len = max_review_len
    return text[:max_len]

# Truncate column-wise: Series.map calls the helper once per review text,
# which avoids building a Series object for every row as
# DataFrame.apply(axis=1) does.
df['truncated_reviews'] = df['reviewText'].map(truncate_review)
# Lengths after truncation, to confirm the cut-off took effect.
sns.histplot(df['truncated_reviews'].str.len())

In [None]:
# Count non-null entries per product (asin) and sort ascending by the number
# of ratings, so the products with the most reviews end up last.
counts_per_asin = df.groupby('asin').count()
counts_per_asin.sort_values(by='overall')

In [None]:
# Narrow the analysis down to the reviews of a single product.
selected_asin = 'B000KPIHQ4'
is_selected = df['asin'] == selected_asin
# .copy() detaches the slice from the full frame before it is used further.
df = df.loc[is_selected].copy()
print(df.shape)
display(df.head(3))

In [None]:
# Texts to embed: the truncated review bodies of the selected product.
vec_texts = df['truncated_reviews'].tolist()
# One metadata dict per review, carrying its star rating. Note that this
# rebinds `metadata`, which previously held the raw product-metadata records.
metadata = [{'rating': rating} for rating in df['overall'].tolist()]

# Show the first text and its metadata as a quick sanity check.
print(vec_texts[0])
print(metadata[0])

In [None]:
import pinecone
from langchain.vectorstores import Pinecone

# Read the Pinecone credentials from environment variables (e.g. supplied by
# the .env file loaded earlier) and initialise the client with them.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_environment = os.getenv('PINECONE_ENV')
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

In [None]:
# create the Pinecone index
# Vector dimension the index is created with.
# NOTE(review): this has to match the output size of the embedding model used
# to fill the index — confirm for the HuggingFace embedder.
emb_length = 768
index_name = 'zapier'

# Creating an index whose name is already taken fails, so only create it when
# it is missing; this keeps the cell re-runnable.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name,
                          metric='euclidean',
                          dimension=emb_length)
index = pinecone.Index(index_name)

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Two embedding back-ends are instantiated here. In the cells visible in this
# notebook only hugg_face_emb is passed on to the vector store; open_ai_emb is
# not referenced again.
# NOTE(review): whichever embedder is used must produce vectors whose size
# equals emb_length from the index-creation cell — confirm before switching.
open_ai_emb = OpenAIEmbeddings(model_name="ada")
hugg_face_emb = HuggingFaceEmbeddings()

In [None]:
# Embed the review texts with the HuggingFace embedder and store them in the
# 'zapier' index, attaching the per-review rating metadata.
vec_store = Pinecone.from_texts(
    vec_texts,
    hugg_face_emb,
    metadatas=metadata,
    index_name='zapier',
)

In [None]:
query = "The Powerstep Pinnacle Shoe Insoles are fantastic"

# Retrieve up to 100 reviews closest to the query. The notebook's stated
# strategy is to reuse wording from reviews with 4+ stars, so filter on
# rating >= 4 instead of rating == 5.0, which dropped every 4-star review.
reviews = vec_store.similarity_search(
    query=query,
    k=100,
    filter={"rating": {"$gte": 4.0}},
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain

# Chat model shared by the summary chain and the ad-copy chain.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.2,
)

In [None]:
# Prompt asking for a short summary; the {text} placeholder is the single
# input variable of the template.
promt_template = """
Write a summary of the reviews:

{text}

The summary should be about ten lines long and tailored to Millenials
"""
PROMT = PromptTemplate(
    input_variables=["text"],
    template=promt_template,
)

# Build a summarize chain of type "stuff" around the custom prompt and run it
# on the retrieved reviews.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMT)
summary = chain.run(reviews)

In [None]:
# Show the summary produced by the summarize chain.
print(summary)

In [None]:
## FACEBOOK AD
# Prompt asking for ad copy with explicit length limits; the {text}
# placeholder is the single input variable of the template.
promt_template_fb = """
Write the copy for a facebook ad based on the reviews:

{text}

As far as text goes, you can have up to 40 characters in your headline, 
125 characters in your primary text, and 30 characters in your description
It should be tailored to GenZ
"""
# PROMT and chain are rebound here, replacing the objects of the same name
# created for the summary.
PROMT = PromptTemplate(
    input_variables=["text"],
    template=promt_template_fb,
)

# Same chain type as for the summary, this time with the ad prompt.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMT)
fb_ad = chain.run(reviews)


In [None]:
# Display the generated ad copy as the cell output.
fb_ad

In [None]:
# load tools needed for connecting LangChain and Zapier

from langchain.agents.agent_toolkits import ZapierToolkit
from langchain.utilities.zapier import ZapierNLAWrapper
from langchain.agents import initialize_agent

## SETTING UP ZAPIER & MAILGUN ACCOUNTS IS QUITE A HASSLE, BUT THE IDEA OF TAILORED EMAILS IS RATHER CLEAR BY NOW
## FOR MORE INFO: https://github.com/rabbitmetrics/voice-of-customer/blob/main/notebooks/voice-of-customer.ipynb

#zapier = ZapierNLAWrapper()
#toolkit = ZapierToolkit.from_zapier_nla_wrapper(zapier)

In [None]:
# Clean up: delete the Pinecone index used by this notebook.
index_name = "zapier"
# Only delete when the index is present, so that re-running this cell after a
# successful clean-up does not raise.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)