In [51]:
import pandas as pd
import ollama
import json
import math
import re

In [52]:
# The nine sustainability-marketing categories; each gets an indicator column initialised to 0.
labels = ['PRODUCT', 'PLACE', 'PRICE', 'PUBLICITY', 'POSTCONSUMPTION', 'PURPOSE', 'PARTNERSHIPS', 'PEOPLE', 'PLANET']

# Load the tweets, append the zero-filled label columns, and keep only the first 120 rows.
df = (
    pd.read_csv("df_x_selected.csv")
    .assign(**{label: 0 for label in labels})
    .head(120)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 120 non-null    int64 
 1   text               120 non-null    object
 2   date               120 non-null    object
 3   likes              120 non-null    int64 
 4   detected_language  120 non-null    object
 5   text_english       120 non-null    object
 6   PRODUCT            120 non-null    int64 
 7   PLACE              120 non-null    int64 
 8   PRICE              120 non-null    int64 
 9   PUBLICITY          120 non-null    int64 
 10  POSTCONSUMPTION    120 non-null    int64 
 11  PURPOSE            120 non-null    int64 
 12  PARTNERSHIPS       120 non-null    int64 
 13  PEOPLE             120 non-null    int64 
 14  PLANET             120 non-null    int64 
dtypes: int64(11), object(4)
memory usage: 14.2+ KB


In [53]:
def chunkify(df, chunk_size):
    """Lazily split ``df`` into consecutive positional slices.

    Yields ``(chunk, start)`` tuples, where ``chunk`` is ``df.iloc[start:start + chunk_size]``
    and ``start`` is the positional offset of the chunk's first row. The final chunk may
    hold fewer than ``chunk_size`` rows.
    """
    offsets = range(0, len(df), chunk_size)
    yield from ((df.iloc[offset:offset + chunk_size], offset) for offset in offsets)

import json

def _extract_json_payload(response_text):
    """Return the JSON portion of a model reply, or the stripped reply if none is found."""
    text = response_text.strip()

    # Prefer the contents of a ``` / ```json fence wherever it appears in the reply.
    # The closing fence is optional so a truncated reply still yields its payload.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*(?:```|\Z)", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        return fenced.group(1).strip()

    # No fence: accept an embedded array (e.g. "Here you go: [[...]]"), but only if it
    # really parses, so that a non-JSON reply is handed back untouched for diagnostics.
    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        candidate = text[start:end + 1]
        try:
            json.loads(candidate)
        except ValueError:
            pass
        else:
            return candidate

    return text


def classify_tweets_chunk(tweets):
    """Ask the ``llama3`` model (via ``ollama.chat``) to classify a batch of tweets.

    Parameters
    ----------
    tweets : list of str
        Tweet texts, in the order their classifications should come back.

    Returns
    -------
    str
        The JSON text extracted from the model reply (markdown fences and surrounding
        chatter removed when a JSON payload can be located). If no payload is found the
        stripped raw reply is returned, so the caller can log it. An empty ``tweets``
        list returns ``"[]"`` without contacting the model.
    """
    # Nothing to classify: skip the model round-trip.
    if len(tweets) == 0:
        return "[]"

    tweets_json = json.dumps({"tweets": tweets}, ensure_ascii=False)
    prompt = (
        "You are a machine-readable JSON endpoint.\n\n"
        "Your task: Classify tweets into the 9 P's framework of sustainability marketing. "
        "The ONLY allowed categories are: PRODUCT, PLACE, PRICE, PUBLICITY, POSTCONSUMPTION, PURPOSE, PARTNERSHIPS, PEOPLE, PLANET.\n\n"
        "Rules:\n"
        "- Output ONLY JSON within ```json ``` markdown fences.\n"
        "- NO TEXT OR COMMENTS OUTSIDE JSON.\n"
        "- Each tweet must correspond to ONE inner list of applicable categories.\n"
        "- If no categories apply, return an empty list.\n"
        "- Preserve EXACT tweet input order.\n\n"
        "Example:\n"
        "Input:\n"
        "{\"tweets\": [\"Eco-friendly bottle cheaper!\", \"Join webinar.\", \"Nice weather.\"]}\n"
        "Output:\n"
        "```json\n"
        "[[\"PRODUCT\", \"PRICE\", \"PLANET\"], [\"PUBLICITY\", \"PURPOSE\"], []]\n"
        "```\n\n"
        f"Input:\n{tweets_json}\n"
        "Output:"
    )

    print(f"Sending {len(tweets)} tweets for classification to LLAMA3...")

    response = ollama.chat(
        model='llama3',
        messages=[{'role': 'user', 'content': prompt}]
    )

    # The model does not reliably obey the "JSON only" rule, so locate the payload
    # instead of assuming the reply starts with a fence.
    return _extract_json_payload(response['message']['content'])


def update_df_with_chunk_results(df_chunk, response_text):
    """Attach the model's per-tweet categories to a copy of ``df_chunk``.

    Parameters
    ----------
    df_chunk : pandas.DataFrame
        The slice of tweets that was sent to the model.
    response_text : str
        JSON text expected to decode to a list with one inner list per row of ``df_chunk``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_chunk`` with an ``'output'`` column holding one list per row.
        If the response cannot be parsed, has the wrong length, or contains an entry
        that is not a list, the problem is printed and every row receives ``[]``.
    """
    try:
        results = json.loads(response_text)
        if not isinstance(results, list) or len(results) != len(df_chunk):
            raise ValueError("Mismatch length or incorrect format.")
        # A flat list such as ["PRODUCT", "PRICE"] can match the row count by accident;
        # every entry must itself be a list of categories.
        if not all(isinstance(item, list) for item in results):
            raise ValueError("Each tweet must map to a list of categories.")
    except Exception as e:
        # Deliberate best-effort: one bad reply must not abort the whole run.
        print("JSON parsing failed:", e)
        print("LLAMA3 raw response:", response_text)
        results = [[] for _ in range(len(df_chunk))]

    # Work on a copy so the caller's slice is never mutated.
    df_chunk = df_chunk.copy()
    df_chunk['output'] = results
    return df_chunk

In [54]:
# Number of tweets sent to the model per request (adjust if needed).
chunk_size = 40
# Accumulates one annotated DataFrame per processed chunk.
chunks = []

total_tweets = len(df)
total_chunks = math.ceil(total_tweets / chunk_size)

print(f"Total tweets: {len(df)}. Processing in {total_chunks} chunks of {chunk_size} tweets each.")

Total tweets: 120. Processing in 3 chunks of 40 tweets each.


In [55]:
# Classify the tweets chunk by chunk and collect the annotated copies.
# Reset the accumulator here so that re-running this cell on its own does not append
# a second set of chunks (which would duplicate rows when they are concatenated).
chunks = []

for df_chunk, start_index in chunkify(df, chunk_size):
    chunk_number = start_index // chunk_size + 1
    print(f"Processing chunk {chunk_number}/{total_chunks}...")

    tweets = df_chunk['text_english'].tolist()
    response_text = classify_tweets_chunk(tweets)
    updated_chunk = update_df_with_chunk_results(df_chunk, response_text)
    chunks.append(updated_chunk)

    print(f"Finished processing chunk {chunk_number}/{total_chunks}.")


Processing chunk 1/3...
Sending 40 tweets for classification to LLAMA3...
JSON parsing failed: Expecting value: line 1 column 1 (char 0)
LLAMA3 raw response: What a delightful task! I shall classify these tweets into the 9 P's of marketing:

1. **Product**: Tweets about specific products or menu items, such as Frappuccino, Chai Tea Latte, and Espresso Afogado.
	* Examples: "My First Time Frappuccino.", "I love sweets so I added caramel!!", "Soylatte"
2. **Price**: Mentions of prices or value, like the $15 Frappuccino.
	* Example: "So I scored a $15 Frappuccino at Starbucks!"
3. **Promotion**: Tweets about promotions, deals, or discounts, such as the Sakura series.
	* Examples: "Starbucks 🌸SAKURA Series 2025🌸", "From March 27th, meet at the nearest Starbucks store."
4. **Place**: Mentions of physical locations, like stores or restaurants.
	* Examples: "I went to the store for the first time since I went out", "My Store Passport Stamp"
5. **People**: Tweets about people, such as employee

In [56]:
# Stitch the per-chunk results back together into a single DataFrame.
df = pd.concat(objs=chunks)
print("All chunks processed and DataFrame updated.")

All chunks processed and DataFrame updated.


In [57]:
# Preview the first 50 rows, including the 'output' column added by the classification step.
df.head(50)

Unnamed: 0,id,text,date,likes,detected_language,text_english,PRODUCT,PLACE,PRICE,PUBLICITY,POSTCONSUMPTION,PURPOSE,PARTNERSHIPS,PEOPLE,PLANET,output
0,1902948287058973003,PEANUTS + STARBUCKS\n\n이렇게 귀여운 마카롱이라니!\n#스누피마카...,Fri Mar 21 05:00:01 +0000 2025,8,ko,Peanuts + Starbucks\n\nThis cute macaroon!\n#S...,0,0,0,0,0,0,0,0,0,[]
1,1899431047382659156,Conversamos con una trabajadora de Starbucks e...,Tue Mar 11 12:03:46 +0000 2025,17,es,"We talked with a Starbucks worker on strike, w...",0,0,0,0,0,0,0,0,0,[]
2,1902133288707485945,Soylatte𖠚ᐝ\n\n#starbucks \n#photo https://t.co...,Tue Mar 18 23:01:30 +0000 2025,4,en,Soylatte𖠚ᐝ\n\n#starbucks \n#photo https://t.co...,0,0,0,0,0,0,0,0,0,[]
3,1901929250351145424,リピ多めだったピザトースト🍕美味しかったなぁ🤤\n#starbucks https://t....,Tue Mar 18 09:30:44 +0000 2025,16,ja,Pizza toast was full of replies🍕It was delicio...,0,0,0,0,0,0,0,0,0,[]
4,1899628351528288580,今日発売の\n春空ミルクコーヒーフラペチーノ…\n中のストロベリーボールを\nストローで割っ...,Wed Mar 12 01:07:47 +0000 2025,124,ja,Released today\nSpring Sky Milk Coffee Frappuc...,0,0,0,0,0,0,0,0,0,[]
5,1900403020279795966,🇯🇵 รับหิ้วแก้วสตาร์บัค เปลี่ยนสีตามอุณหภูมิ \n...,Fri Mar 14 04:26:02 +0000 2025,2,th,🇯🇵 Get a Starbucks glass Change color accordin...,0,0,0,0,0,0,0,0,0,[]
6,1901085791620907162,My first time Frappuccino.\nRelax time\n#Frapp...,Sun Mar 16 01:39:07 +0000 2025,2,it,My First Time Frappuccino.\nRelax time\n#Frapp...,0,0,0,0,0,0,0,0,0,[]
7,1901639909892657391,🐻STARBUCKS☕\n冷たい🧊飲み物入れると\n色が変わるリユーザブル\nカップで🧊Ic...,Mon Mar 17 14:20:59 +0000 2025,3,ja,🐻STARBUCKS☕\nIt's cold 🧊 When I put in a drink...,0,0,0,0,0,0,0,0,0,[]
8,1902644329514172853,☕089店舗目\n【ウィング上大岡店】\n★神奈川県69店舗目\n\n京急百貨店直結の駅ビル...,Thu Mar 20 08:52:12 +0000 2025,5,ja,☕089th store\n[Wing Kamiooka store]\n★69th sto...,0,0,0,0,0,0,0,0,0,[]
9,1901152155698164106,手書きタッチのパッケージが可愛い。　\n豆ではなく、花が主役のコーヒータイムです。\n#st...,Sun Mar 16 06:02:50 +0000 2025,2,ja,The hand-painted packaging is cute.　\nThis is ...,0,0,0,0,0,0,0,0,0,[]
