### Data analysis with an OpenAI LLM

In [1]:
!pip install openai > /dev/null
!pip install pandas scikit-learn > /dev/null

In [None]:
from google.colab import files

# Keep the returned mapping in a variable: as a bare last expression it would be
# echoed into the cell output, including the uploaded file contents.
uploaded = files.upload()

In [9]:
import os
import argparse
import pandas as pd

In [4]:
# Obtain the key without hard-coding it: reuse an already-defined variable,
# otherwise fall back to the environment, otherwise prompt (input is not echoed).
from getpass import getpass

if 'openai_api_key' not in globals():
    openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = openai_api_key

In [5]:
''' https://platform.openai.com/docs/models for a list of models '''
from openai import OpenAI
import os

# Module-level client that invoke_llm uses for every request; it is created
# with the key held in the openai_api_key variable.
client = OpenAI(api_key=openai_api_key)

def set_environment():
  """Export API/ID-style global variables to the process environment.

  Every module-level name containing 'API' or 'ID' whose value is a string is
  copied into ``os.environ`` under the same name. Values that are not strings
  (modules, classes, numbers, ...) are skipped, because ``os.environ`` only
  accepts strings and would raise TypeError otherwise.
  """
  # Iterate over a snapshot so the loop never sees the namespace change size.
  for key, value in list(globals().items()):
    if ('API' in key or 'ID' in key) and isinstance(value, str):
      os.environ[key] = value
set_environment()

In [11]:
def create_prompt(text):
  """Build the sentiment-classification prompt for one piece of text.

  The prompt has three lines: the text itself, the question, and the answer
  cue listing the two allowed labels.
  """
  prompt_lines = [
      f'Text:{text}',
      'Is the review sentiment positive or negative?',
      'Answer ("Positive" or "Negative"):',
  ]
  return '\n'.join(prompt_lines)

def invoke_llm(prompt):
  """Send ``prompt`` as a single user message to gpt-4o and return the reply.

  Uses the module-level ``client`` and returns the message content of the
  first choice in the response.
  """
  user_message = {'content': prompt, 'role': 'user'}
  completion = client.chat.completions.create(
      messages=[user_message], model='gpt-4o')
  first_choice = completion.choices[0]
  return first_choice.message.content

def classify(text):
  """Return the model's answer to the sentiment prompt built for ``text``."""
  return invoke_llm(create_prompt(text))

In [12]:
# Load the engagement data from the working directory into df.
df = pd.read_csv('engagements.csv')

In [13]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,timestamp,media_id,media_caption,comment_text
0,2025-03-01 00:13:57.153000+00:00,1090986906404998,"Soft skin, soft life 🩷🌸🫧 get your hands on thi...",I bet this is good
1,2025-03-01 00:23:06.879000+00:00,17950254656929862,Why use one scrub when you can use them all at...,i know this smells so good
2,2025-03-01 00:04:05.094000+00:00,1090109319826090,Morning routine with Tree Hut 🍊🫧 Now available...,Love it
3,2025-03-01 00:41:59.467000+00:00,1098364052333950,Why use one scrub when you can use them all at...,Please carry these in Canada! I miss them so m...
4,2025-03-01 02:21:29.715000+00:00,1083943630442659,Vanilla Serum-Infused Hand Wash: A sweet escap...,I love it ..✌️


In [14]:
# Classify the captions of the first 100 rows only (classify is applied once
# per row). Rows past the first 100 receive NaN through index alignment.
df['class_media_caption'] = df['media_caption'].head(100).apply(classify)

In [15]:
# Classify the comments of the first 100 rows only (classify is applied once
# per row). Rows past the first 100 receive NaN through index alignment.
df['class_text'] = df['comment_text'].head(100).apply(classify)

In [16]:
# Count how often each distinct caption label was returned.
statistics = df['class_media_caption'].value_counts()
print(statistics)
# Saves the whole frame (every column currently in df), not just the caption labels.
df.to_csv('class_media_caption.csv')

class_media_caption
Positive                                                                                                                                                                                                                                                                                                                 99
The text provided does not explicitly express a positive or negative sentiment about the product. It is simply an invitation to shop for the Tangerine Pre-Shave Foaming Polish without any evaluative language. Therefore, no definitive sentiment (positive or negative) can be inferred from the given text alone.     1
Name: count, dtype: int64


In [17]:
# Count how often each distinct comment label was returned.
statistics = df['class_text'].value_counts()
print(statistics)
# Saves the whole frame (every column currently in df), not just the comment labels.
df.to_csv('class_text.csv')

class_text
Positive                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            47
Negative                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [None]:
# Show the first lines of the saved caption-classification CSV.
!head class_media_caption.csv

In [None]:
# Show the first lines of the saved comment-classification CSV.
!head class_text.csv

### Clustering

In [24]:
!pip install scikit-learn numpy > /dev/null

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Work on a copy of the first 100 rows so the columns added here do not touch df.
# Caption and comment are joined into one string per row; missing values become ''.
df_subset = df.head(100).copy()
df_subset['combined_text'] = df_subset['media_caption'].fillna('') + ' ' + df_subset['comment_text'].fillna('')

# Turn each combined text into a TF-IDF vector, dropping English stop words
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_subset['combined_text'])

# Perform KMeans clustering; random_state is fixed so repeated runs agree
num_clusters = 5  # You can adjust the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
# fit_predict returns one cluster label per row of X
df_subset['cluster'] = kmeans.fit_predict(X)

# Display the first few rows with cluster labels
display(df_subset[['combined_text', 'cluster']].head())

In [None]:
if __name__ == '__main__':
    # Script entry point: classify captions and comments of the CSV given on
    # the command line and write the labelled frame to disk.

    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='Path to the input file')
    args = parser.parse_args()

    df = pd.read_csv(args.path)

    # Only the first 100 rows are classified; the remaining rows stay NaN.
    df['class_media_caption'] = df['media_caption'].head(100).apply(classify)
    df.to_csv('class_media_caption.csv')
    print(df)

    # The comment labels must be computed before they are summarised: a freshly
    # loaded CSV has no 'class_text' column, so reading it would raise KeyError.
    df['class_text'] = df['comment_text'].head(100).apply(classify)
    statistics = df['class_text'].value_counts()
    print(statistics)
    df.to_csv('class_text.csv')

### Sentiment pipeline using langchain

In [None]:
import argparse
import pandas as pd

In [28]:
!pip install langchain_openai > /dev/null
!pip install langchain_core > /dev/null

In [29]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.runnables.passthrough import RunnablePassthrough

In [30]:
def create_chain():
    """Creates chain for text classification.

    The chain maps its input to the ``text`` prompt variable, fills the
    sentiment prompt, calls gpt-4o (temperature 0, max_tokens 1) and runs the
    result through ``StrOutputParser``.
    """
    template = (
        '{text}\n'
        'Is the sentiment positive or negative?\n'
        'Answer ("Positive"/"Negative")\n'
    )
    steps = [
        ChatPromptTemplate.from_template(template),
        ChatOpenAI(model='gpt-4o', temperature=0, max_tokens=1),
        StrOutputParser(),
    ]
    # Pipe the stages together left to right, starting from the input mapping.
    chain = {'text': RunnablePassthrough()}
    for step in steps:
        chain = chain | step
    return chain


In [None]:
df = pd.read_csv('engagements.csv')
# Preview only the first rows. A trailing comma after df would wrap the frame
# in a one-element tuple and print the whole thing as plain text.
df.head()

In [None]:
chain = create_chain()

# Trial run on the first 10 comments only. The slice must be applied to the
# Series; slicing the column name string would look up a non-existent column.
sample_comments = df['comment_text'][:10]
class_text_result = chain.batch(list(sample_comments))
# Align the labels with the rows they came from; unclassified rows stay NaN.
df['class_text'] = pd.Series(class_text_result, index=sample_comments.index)
df.to_csv('class_text_result.csv')

In [None]:
''' process with batch_size '''
import time
chain = create_chain()
batch_size = 100  # Adjust batch size as needed
# Ceiling division: a final partial batch still counts as one batch.
num_batches = (len(df) + batch_size - 1) // batch_size
all_results = []

for i in range(num_batches):
    start_index = i * batch_size
    # Clamp the end so the last batch stops at the end of the frame.
    end_index = min((i + 1) * batch_size, len(df))
    comment_batch = list(df['comment_text'][start_index:end_index])
    batch_results = chain.batch(comment_batch)
    all_results.extend(batch_results)
    time.sleep(1) # Add a delay between batches

# The batches cover rows 0..len(df)-1 in order, so all_results has one entry
# per row (assuming chain.batch returns one result per input — TODO confirm).
df['class_text'] = all_results

In [None]:
if __name__ == '__main__':
    # Script entry point: classify every comment of the CSV given on the
    # command line and write the labelled frame to class_text_result.csv.

    parser = argparse.ArgumentParser()
    parser.add_argument('file_path', type=str, help='Path to input .csv file')
    args = parser.parse_args()

    df = pd.read_csv(args.file_path)
    chain = create_chain()

    # All comments are submitted in a single chain.batch call (no chunking here).
    class_text_result = chain.batch(list(df['comment_text']))
    df['class_text'] = class_text_result
    df.to_csv('class_text_result.csv')

### Sentiment pipeline with Vertex

In [44]:
!pip install google-cloud-aiplatform > /dev/null

In [45]:
from google.colab import auth
# Run Colab's user authentication flow before Vertex AI is initialised.
auth.authenticate_user()

In [46]:
import vertexai

# Project and region used for the Vertex AI session started below.
PROJECT_ID = 'llm-test-428715'  # Replace with your Google Cloud project ID
LOCATION = 'us-central1'  # Replace with your desired Vertex AI location
vertexai.init(project=PROJECT_ID, location=LOCATION)
# Echo the settings so the cell output records which project/region was used.
print(f"Vertex AI initialized for project '{PROJECT_ID}' in location '{LOCATION}'.")

Vertex AI initialized for project 'llm-test-428715' in location 'us-central1'.


In [None]:
from vertexai.language_models import TextGenerationModel
# Choose a model suitable for text classification/sentiment analysis
# 'text-bison@001' is a good general-purpose text model
# NOTE(review): confirm this model version is still served in the chosen
# project/region before relying on it.
# `model` is the module-level handle that get_vertex_sentiment calls predict() on.
model = TextGenerationModel.from_pretrained("text-bison@001")

In [None]:
import pandas as pd
# Re-read the CSV: df is replaced, so columns added by earlier cells are gone.
df = pd.read_csv('engagements.csv')
# include='all' summarises every column, not only the numeric ones.
display(df.describe(include='all'))

In [None]:
# The column only exists once the Vertex classification cell has run, so guard
# against a KeyError on a fresh run. Assign the result back instead of using
# in-place fillna on a column selection, which may act on a temporary copy.
if 'vertex_sentiment' in df.columns:
    df['vertex_sentiment'] = df['vertex_sentiment'].fillna('Unknown')

In [None]:
# 1. Define a prompt template string
sentiment_prompt_template = """
Analyze the sentiment of the following text and classify it as either "Positive" or "Negative".

Text: {text}
Sentiment:
"""

def get_vertex_sentiment(text):
    """Classify ``text`` with the module-level Vertex ``model``.

    Returns:
        None for a missing value (``pd.isna``), "Positive" or "Negative" when
        the stripped model reply is exactly one of those labels, "Unknown" for
        any other reply, and "Error" when the prediction raises.
    """
    if pd.isna(text):
        return None  # Handle missing values
    try:
        prompt = sentiment_prompt_template.format(text=text)
        response = model.predict(prompt)
        sentiment = response.text.strip()
        if sentiment not in ["Positive", "Negative"]:
            return "Unknown"  # Or re-try, or log an error
        return sentiment
    except Exception as e:
        # str() keeps the handler itself from failing when text is not a
        # string (e.g. a number read from the CSV cannot be sliced).
        print(f"Error processing text: {str(text)[:50]}... Error: {e}")
        return "Error"  # Handle potential errors during prediction

# Classify the first 100 comments; rows beyond that are left NaN by index alignment.
df['vertex_sentiment'] = df['comment_text'].head(100).apply(get_vertex_sentiment)
# Assign the filled column back: fillna(..., inplace=True) on a column selection
# may operate on a temporary copy and leave df unchanged.
df['vertex_sentiment'] = df['vertex_sentiment'].fillna('Unknown')
display(df[['comment_text', 'vertex_sentiment']].head())