**Step 1. Extracting data from JSON**

In [1]:
import json
import csv
import os
from typing import Dict, List, Any

def extract_info(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    owner = data.get('owner', {})
    user = f"{owner.get('full_name', '')} (@{owner.get('username', '')})"
    location = data.get('location', {})
    location_name = location.get('name', '') if location else 'N/A'
    
    likes = data.get('edge_media_preview_like', {}).get('count', 0)
    comments_data = data.get('edge_media_to_parent_comment', {})
    comments_count = comments_data.get('count', 0)
    comments_list = ' | '.join(comment['node']['text'] for comment in comments_data.get('edges', []))
    
    tagged_brands = ', '.join(user['node']['user']['full_name'] for user in data.get('edge_media_to_tagged_user', {}).get('edges', []))
    
    captions = data.get('edge_media_to_caption', {}).get('edges', [])
    return [{
        'User': user,
        'Location': location_name,
        'Caption Text': caption['node']['text'],
        'Tags': ' '.join(tag for tag in caption['node']['text'].split() if tag.startswith('#')),
        'Likes': likes,
        'Comments Count': comments_count,
        'Comments List': comments_list,
        'Tagged Brands': tagged_brands
    } for caption in captions]

def process_files(input_path: str) -> List[Dict[str, Any]]:
    data_list = []
    for file_name in os.listdir(input_path):
        if file_name.endswith('.info'):
            with open(os.path.join(input_path, file_name), 'r') as file:
                json_data = json.load(file)
            data_list.extend(extract_info(json_data))
    return data_list

def write_csv(output_csv: str, data_list: List[Dict[str, Any]]) -> None:
    fieldnames = ['User', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count', 'Comments List', 'Tagged Brands']
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)

def main():
    input_path = "/Users/nurgul/Documents/Projects/Dissertation Code/data/info"
    output_csv = "/Users/nurgul/Documents/Projects/Dissertation Code/data/1. extracted_data.csv"
    
    data_list = process_files(input_path)
    write_csv(output_csv, data_list)
    print("Data extraction and CSV file creation completed successfully.")

if __name__ == "__main__":
    main()

Data extraction and CSV file creation completed successfully.


**Step 2. Preprocessing** 

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from typing import Union, Callable

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Define file locations
INPUT_FILE = "/Users/nurgul/Documents/Projects/Dissertation Code/data/1. extracted_data.csv"
OUTPUT_FILE = "/Users/nurgul/Documents/Projects/Dissertation Code/data/2. preprocessing_data.csv"
DESIRED_COLUMN_ORDER = ['User', 'Location', 'Cleaned Captions', 'Likes', 'Comments count', 'Comments list', 'Tagged brands', 'Tags']

# Compile regex patterns
SPECIAL_CHAR_PATTERN = re.compile(r'[^A-Za-z0-9\s|]')
URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
USERNAME_PATTERN = re.compile(r'\((@[\w\._]+)\)')

# Get stop words
STOP_WORDS = set(stopwords.words('english'))

def clean_text(text: Union[str, float]) -> str:
    if not isinstance(text, str):
        return ""
    # Remove URLs
    text = URL_PATTERN.sub('', text)
    # Remove special characters (except '|') and convert to lowercase
    text = SPECIAL_CHAR_PATTERN.sub('', text.lower())
    # Tokenize and remove stop words
    return ' '.join(word for word in word_tokenize(text) if word not in STOP_WORDS)

def clean_username(username: str) -> str:
    match = USERNAME_PATTERN.search(username.strip().lower())
    return match.group(1).replace('@', '').strip() if match else username

def process_column(df: pd.DataFrame, old_col: str, new_col: str, process_func: Callable) -> pd.DataFrame:
    if old_col in df.columns:
        df[new_col] = df[old_col].apply(process_func)
        print(f"'{old_col}' column processed and renamed to '{new_col}'")
    else:
        print(f"'{old_col}' column not found in DataFrame")
    return df

def reorder_columns(df: pd.DataFrame, desired_order: list) -> pd.DataFrame:
    existing_columns = [col for col in desired_order if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in existing_columns]
    return df[existing_columns + remaining_columns]

def main():
    # Load the CSV data
    data = pd.read_csv(INPUT_FILE)
    
    # Clean column names
    data.columns = data.columns.str.strip()
    
    # Clean the 'Caption Text' and 'Comments List' columns
    data = process_column(data, 'Caption Text', 'Cleaned Captions', clean_text)
    data = process_column(data, 'Comments List', 'Cleaned Comments', clean_text)
    
    # Clean the 'User' column
    data = process_column(data, 'User', 'User', clean_username)
    
    # Handle missing values
    data.dropna(subset=['Cleaned Captions'], inplace=True)
    data.fillna('', inplace=True)
    
    # Drop unnecessary columns
    data.drop(columns=['Caption Text', 'Comments List'], inplace=True)
    
    # Reorder columns
    data = reorder_columns(data, DESIRED_COLUMN_ORDER)
    
    # Save the cleaned data to a new CSV file
    data.to_csv(OUTPUT_FILE, index=False)
    print("Data preprocessing completed successfully.")
    print("Final columns:", data.columns)

if __name__ == "__main__":
    main()

'Caption Text' column processed and renamed to 'Cleaned Captions'
'Comments List' column processed and renamed to 'Cleaned Comments'
'User' column processed and renamed to 'User'
Data preprocessing completed successfully.
Final columns: Index(['User', 'Location', 'Cleaned Captions', 'Likes', 'Tags',
       'Comments Count', 'Tagged Brands', 'Cleaned Comments'],
      dtype='object')


**Step 3. Merging cleaned data with categories and number of followers.**

In [3]:
import pandas as pd

# Define file locations
FOLLOWERS_FILE = "/Users/nurgul/Documents/Projects/Dissertation Code/data/Number of followers for each influencer.csv"
CLEANED_DATA_FILE = "/Users/nurgul/Documents/Projects/Dissertation Code/data/2. preprocessing_data.csv"
OUTPUT_FILE = "/Users/nurgul/Documents/Projects/Dissertation Code/data/3. integrate_category_and_followers.csv"

def load_and_clean_df(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()
    return df

def merge_dataframes(cleaned_data_df: pd.DataFrame, followers_df: pd.DataFrame) -> pd.DataFrame:
    cleaned_data_df['User'] = cleaned_data_df['User'].str.strip().str.lower()
    cleaned_data_df['User'] = cleaned_data_df['User'].str.replace('@', '').str.strip()
    
    # Strip leading/trailing spaces in usernames
    followers_df['Username'] = followers_df['Username'].str.strip()
    cleaned_data_df['User'] = cleaned_data_df['User'].str.strip()

    # Convert usernames to lower case to avoid case sensitivity issues
    followers_df['Username'] = followers_df['Username'].str.lower()
    cleaned_data_df['User'] = cleaned_data_df['User'].str.lower()
    
    print(followers_df.head(2))
    print(cleaned_data_df.head(2))

    # Merge the two dataframes on the username column, including the Category column
    merged_df = pd.merge(cleaned_data_df, followers_df[['Username', 'Followers', 'Category']], left_on='User', right_on='Username', how='left')

    # Check if the merge was successful by printing some rows with followers and category
    print("Sample rows from merged_df with followers and category:")
    print(merged_df[~merged_df['Followers'].isna()].head())
    return merged_df 

def main():
    # Load and clean dataframes
    followers_df = load_and_clean_df(FOLLOWERS_FILE)
    cleaned_data_df = load_and_clean_df(CLEANED_DATA_FILE)

    # Merge dataframes
    merged_df = merge_dataframes(cleaned_data_df, followers_df)

    # Save merged dataframe
    merged_df.to_csv(OUTPUT_FILE, index=False)
    print("Followers and Category added to the cleaned Instagram data successfully.")

if __name__ == "__main__":
    main()


            Username Category  Followers  Followees  Posts
0        makeupbynvs   beauty       1432       1089    363
1  jaquelinevandoski   beauty     137600        548    569
           User Location                                   Cleaned Captions  \
0   lelien_tomo      NaN  danielwellington 75 75729 cuff 50off offtomo20...   
1  veverkakokos      NaN  dneska tohle pocasi zase bylo co vic chtit sun...   

   Likes                                               Tags  Comments Count  \
0    321  #danielwellington #ダニエルウェリントン #myclassicdw #サマ...              16   
1     33  #sunnyday #photography #nature #pond #energy #...               0   

       Tagged Brands        Cleaned Comments  
0  Daniel Wellington  | | | tomotomo | | w |  
1                NaN                     NaN  
Sample rows from merged_df with followers and category:
            User     Location  \
0    lelien_tomo          NaN   
1   veverkakokos          NaN   
2     alicekings          NaN   
3  nicolenic1973  

**Step 4. Removing non-english words**

In [4]:
import sys
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import pandas as pd

# Diagnostic information
print(f"Python version: {sys.version}")
print(f"spaCy version: {spacy.__version__}")
print(f"Path to spaCy: {spacy.__file__}")

print("\nInstalled models:")
for model in spacy.util.get_installed_models():
    print(f" - {model}")

print("\nAttempting to load 'en_core_web_sm':")
try:
    nlp = spacy.load('en_core_web_sm')
    print(" - Model loaded successfully")
except Exception as e:
    print(f" - Error loading model: {e}")
    print("Attempting to download 'en_core_web_sm'...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

# Define a factory function to create the LanguageDetector if it does not already exist
if not Language.has_factory("language_detector"):
    @Language.factory("language_detector")
    def create_language_detector(nlp, name):
        return LanguageDetector()

# Add the LanguageDetector component to the pipeline if it's not already there
if "language_detector" not in nlp.pipe_names:
    nlp.add_pipe("language_detector", last=True)

# Read the CSV file
df = pd.read_csv('/Users/nurgul/Documents/Projects/Dissertation Code/data/3. integrate_category_and_followers.csv')

# Function to detect language
def detect_language(text):
    if isinstance(text, str):  # Check if the input is a string
        doc = nlp(text)
        return doc._.language['language']
    else:
        return 'unknown'  # Handle non-string inputs

# Apply the function to the DataFrame and create a new column
df['Language'] = df['Cleaned Comments'].apply(detect_language)

# Filter the DataFrame to keep only English rows
df_english = df[df['Language'] == 'en']

# Save the filtered DataFrame back to the file
df_english.to_csv('/Users/nurgul/Documents/Projects/Dissertation Code/data/4. english_captions.csv', index=False)

print(df_english)

# Print success message
print("Non-English captions and comments removed successfully.")

Python version: 3.12.2 (v3.12.2:6abddd9f6a, Feb  6 2024, 17:02:06) [Clang 13.0.0 (clang-1300.0.29.30)]
spaCy version: 3.7.5
Path to spaCy: /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/spacy/__init__.py

Installed models:
 - en_core_web_sm

Attempting to load 'en_core_web_sm':
 - Model loaded successfully
                          User     Location  \
2                   alicekings          NaN   
4                      aleyahs  White Point   
6                   merakilane          NaN   
7                 zanabfarooq_          NaN   
8             lightscamerabake          NaN   
...                        ...          ...   
39540  caroliinaneerotofficial  Dover, Kent   
39541       practicalbydefault          NaN   
39543                 power953          NaN   
39544                    q1043          NaN   
39545                prekpages          NaN   

                                        Cleaned Captions  Likes  \
2                          

**Step 5. Calculating number of influencers per each category.**

In [5]:
import pandas as pd
from pathlib import Path

# Define file locations
INPUT_FILE = Path("/Users/nurgul/Documents/Projects/Dissertation Code/data/4. english_captions.csv")
OUTPUT_FILE = Path("/Users/nurgul/Documents/Projects/Dissertation Code/data/5. influencers_per_category.csv")

def load_data(file_path: Path) -> pd.DataFrame:
    return pd.read_csv(file_path)

def calculate_influencers_per_category(df: pd.DataFrame) -> pd.DataFrame:
    result = df.groupby('Category')['Username'].nunique().reset_index()
    result.columns = ['Category', 'Number of Influencers']
    return result

def save_data(df: pd.DataFrame, file_path: Path) -> None:
    df.to_csv(file_path, index=False)
    print(f"Data saved successfully to {file_path}")

def main():
    # Load data
    merged_df = load_data(INPUT_FILE)

    # Calculate influencers per category
    influencers_per_category = calculate_influencers_per_category(merged_df)

    # Print results
    print("Number of influencers in each category:")
    print(influencers_per_category)

    # Save results
    save_data(influencers_per_category, OUTPUT_FILE)

if __name__ == "__main__":
    main()


Number of influencers in each category:
   Category  Number of Influencers
0    beauty                    637
1    family                   1824
2   fashion                   5338
3    fasion                      1
4   fitness                    547
5      food                   1730
6  interior                    566
7     other                   2103
8       pet                    304
9    travel                   1881
Data saved successfully to /Users/nurgul/Documents/Projects/Dissertation Code/data/5. influencers_per_category.csv



**Step 6. Remaining only necessary categories.**

In [6]:
import pandas as pd
from pathlib import Path
from typing import List

# Define file locations
INPUT_FILE = Path("/Users/nurgul/Documents/Projects/Dissertation Code/data/4. english_captions.csv")
OUTPUT_FILE = Path("/Users/nurgul/Documents/Projects/Dissertation Code/data/6. cleaned_data.csv")
CATEGORIES_TO_KEEP = ['beauty', 'family', 'fashion', 'fitness', 'food', 'travel']

def load_data(file_path: Path) -> pd.DataFrame:
    return pd.read_csv(file_path)

def filter_categories(df: pd.DataFrame, categories: List[str]) -> pd.DataFrame:
    return df[df['Category'].isin(categories)]

def save_data(df: pd.DataFrame, file_path: Path) -> None:
    df.to_csv(file_path, index=False)
    print(f"Filtered data saved successfully to {file_path}")

def main():
    # Load data
    merged_df = load_data(INPUT_FILE)

    # Filter categories
    filtered_df = filter_categories(merged_df, CATEGORIES_TO_KEEP)

    # Save filtered data
    save_data(filtered_df, OUTPUT_FILE)

if __name__ == "__main__":
    main()


Filtered data saved successfully to /Users/nurgul/Documents/Projects/Dissertation Code/data/6. cleaned_data.csv



**Step 7. Sentiment and keywords analysis - positive only**

In [9]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
from typing import Dict, List, Callable

# Define file locations and names
DATA_DIR = Path("/Users/nurgul/Documents/Projects/Dissertation Code/data")
INPUT_FILE = "6. cleaned_data.csv"
POSITIVE_OUTPUT_FILE = "7. sentiment_analysis.csv"
FINAL_OUTPUT_FILE = "7.1 keywords_analysis.csv"

CATEGORY_KEYWORDS = {
    'beauty': ['makeup', 'skincare', 'beauty', 'cosmetics', 'hair', 'nails', 'facial', 'lipstick', 'eyeliner', 'mascara',
               'foundation', 'concealer', 'blush', 'bronzer', 'highlighter', 'eyeshadow', 'serum', 'moisturizer', 'cleanser', 'toner'],
    'fashion': ['style', 'outfit', 'fashion', 'clothes', 'accessories', 'dress', 'shoes', 'handbag', 'jewelry', 'trend',
                'designer', 'runway', 'vintage', 'streetwear', 'couture', 'boutique', 'sustainable', 'chic', 'glamour', 'wardrobe'],
    'family': ['family', 'kids', 'parenting', 'children', 'home', 'baby', 'mom', 'dad', 'sibling', 'grandparent',
               'adoption', 'education', 'teenager', 'toddler', 'childcare', 'family-planning', 'homeschooling', 'co-parenting', 'stepfamily', 'foster'],
    'fitness': ['workout', 'gym', 'fitness', 'exercise', 'health', 'muscle', 'training', 'cardio', 'strength', 'yoga',
                'pilates', 'crossfit', 'weightlifting', 'running', 'cycling', 'meditation', 'nutrition', 'flexibility', 'HIIT', 'bodyweight'],
    'food': ['recipe', 'cooking', 'food', 'meal', 'restaurant', 'cuisine', 'diet', 'nutrition', 'chef', 'baking',
             'vegan', 'organic', 'gluten-free', 'barbecue', 'farm-to-table', 'meal-prep', 'slow-cooker', 'fusion', 'foodie', 'paleo'],
    'travel': ['travel', 'vacation', 'trip', 'adventure', 'destination', 'tourism', 'hotel', 'flight', 'explore', 'sightseeing',
               'backpacking', 'ecotourism', 'cruise', 'resort', 'cultural-exchange', 'roadtrip', 'staycation', 'digital-nomad', 'budget-travel', 'luxury-travel']
}

ALL_KEYWORDS = [word for words in CATEGORY_KEYWORDS.values() for word in words]


def setup_nltk():
    nltk.download('vader_lexicon', quiet=True)

def load_data(file_path: Path) -> pd.DataFrame:
    return pd.read_csv(file_path)

def analyze_sentiment(text: str) -> Dict[str, float or str]:
    if pd.isna(text):
        return {'score': 0, 'category': 'Neutral'}
    sentiment_score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    category = 'Positive' if sentiment_score > 0.05 else 'Negative' if sentiment_score < -0.05 else 'Neutral'
    return {'score': sentiment_score, 'category': category}

def apply_sentiment_analysis(df: pd.DataFrame) -> pd.DataFrame:
    sentiment_results = df['Cleaned Comments'].apply(analyze_sentiment)
    df['Sentiment Score'] = sentiment_results.apply(lambda x: x['score'])
    df['Sentiment Category'] = sentiment_results.apply(lambda x: x['category'])
    return df

def categorize_post(text: str) -> List[str]:
    text = str(text).lower()
    return [category for category, keywords in CATEGORY_KEYWORDS.items() if any(keyword in text for keyword in keywords)]

def get_top_keywords_predefined(text: str, keywords: List[str], top_n: int = 5) -> List[str]:
    text = str(text).lower()
    return [word for word in keywords if word in text][:top_n]

def get_top_keywords_tfidf(tfidf_vector, feature_names: List[str], top_n: int = 5) -> List[str]:
    sorted_items = sorted(zip(tfidf_vector.tocoo().col, tfidf_vector.tocoo().data), key=lambda x: (x[1], x[0]), reverse=True)
    return [feature_names[idx] for idx, _ in sorted_items[:top_n]]

def apply_keyword_analysis(df: pd.DataFrame, method: str = 'predefined') -> pd.DataFrame:
    df['categories'] = df['Cleaned Captions'].apply(categorize_post)
    df = df[df['categories'].apply(len) > 0]
    
    if method == 'predefined':
        df['top_keywords'] = df['Cleaned Captions'].apply(lambda x: get_top_keywords_predefined(x, ALL_KEYWORDS))
    else:
        tfidf = TfidfVectorizer(max_features=1000)
        tfidf_matrix = tfidf.fit_transform(df['Cleaned Captions'].fillna(''))
        feature_names = tfidf.get_feature_names_out()
        df['top_keywords'] = [get_top_keywords_tfidf(tfidf_matrix[i], feature_names) for i in range(tfidf_matrix.shape[0])]
    
    return df

def print_results(df: pd.DataFrame):
    print("Category Distribution:")
    print(df['categories'].explode().value_counts(normalize=True))
    print("\nSample Top Keywords:")
    print(df['top_keywords'].head())

def save_data(df: pd.DataFrame, file_path: Path):
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")

def main():
    setup_nltk()
    
    df = load_data(DATA_DIR / INPUT_FILE)
    df_sentiment = apply_sentiment_analysis(df)
    save_data(df_sentiment, DATA_DIR / POSITIVE_OUTPUT_FILE)
    
    df_categorized = apply_keyword_analysis(df_sentiment)
    print_results(df_categorized)
    save_data(df_categorized, DATA_DIR / FINAL_OUTPUT_FILE)

if __name__ == "__main__":
    main()


Data saved to /Users/nurgul/Documents/Projects/Dissertation Code/data/7. sentiment_analysis.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['top_keywords'] = df['Cleaned Captions'].apply(lambda x: get_top_keywords_predefined(x, ALL_KEYWORDS))


Category Distribution:
categories
fashion    0.263747
family     0.224729
travel     0.149235
food       0.139556
beauty     0.124070
fitness    0.098663
Name: proportion, dtype: float64

Sample Top Keywords:
1    [vacation, trip, adventure, destination, explore]
4                                             [recipe]
6                                   [fashion, couture]
8                                            [explore]
9                          [dress, health, food, chef]
Name: top_keywords, dtype: object
Data saved to /Users/nurgul/Documents/Projects/Dissertation Code/data/7.1 keywords_analysis.csv


**STEP 8. SEMANTIC ANALYSIS**

In [12]:
%pip install sentence-transformers
import csv
import chromadb
from chromadb.utils import embedding_functions
import re

# Function to clean and convert number strings
def clean_number(value):
    if isinstance(value, str):
        # Remove any non-digit characters (except decimal point)
        clean_value = re.sub(r'[^\d.]', '', value)
        try:
            return int(float(clean_value))
        except ValueError:
            return 0
    elif isinstance(value, (int, float)):
        return int(value)
    else:
        return 0

# Define file locations
file_path = '/Users/nurgul/Documents/Projects/Dissertation Code/data/7. sentiment_analysis.csv'

# Initialize ChromaDB client
client = chromadb.Client()

# Get or create a collection
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(
    name="sentiment_analysis", 
    embedding_function=sentence_transformer_ef
)

# Function to delete documents in batches
def delete_in_batches(collection, batch_size=5000):
    while True:
        results = collection.get(limit=batch_size)
        if not results['ids']:
            break
        collection.delete(ids=results['ids'])
        print(f"Deleted batch of {len(results['ids'])} documents.")

# Check if the collection already has data
if collection.count() > 0:
    user_input = input("The collection already contains data. Do you want to clear it and repopulate? (yes/no): ")
    if user_input.lower() == 'yes':
        print("Clearing existing data...")
        delete_in_batches(collection)
        print("Collection cleared. Repopulating with new data.")
    else:
        print("Keeping existing data. Script will now exit.")
        exit()

# Load data
with open(file_path, 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    
    documents = []
    metadatas = []
    ids = []
    
    for i, row in enumerate(csv_reader):
        # Combine Cleaned Caption and Cleaned Comments for document content
        document_content = f"Captions: {row['Cleaned Captions']} | Comments: {row['Cleaned Comments']}"
        documents.append(document_content)
        
        metadatas.append({
            "user": row['User'],
            "location": row['Location'],
            "tags": row['Tags'],
            "likes": clean_number(row['Likes']),
            "comments_count": clean_number(row['Comments Count']),
            "tagged_brands": row['Tagged Brands'],
            "username": row['Username'],
            "followers": clean_number(row['Followers']),
            "category": row['Category'],
            "language": row['Language'],
            "sentiment_score": float(row['Sentiment Score']) if row['Sentiment Score'] else 0.0,
            "sentiment_category": row['Sentiment Category'],
            "cleaned_captions": row['Cleaned Captions'],
            "cleaned_comments": row['Cleaned Comments']
        })
        ids.append(str(i))

# Add data to the vector database in batches
batch_size = 500  
total_added = 0

for i in range(0, len(documents), batch_size):
    batch_documents = documents[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]
    batch_ids = ids[i:i+batch_size]

    try:
        collection.add(
            documents=batch_documents,
            metadatas=batch_metadatas,
            ids=batch_ids
        )
        total_added += len(batch_documents)
        print(f"Added batch of {len(batch_documents)} documents. Total added: {total_added}")
    except Exception as e:
        print(f"Error adding batch: {e}")

print(f"Finished adding {total_added} documents to the collection.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Added batch of 500 documents. Total added: 500
Added batch of 500 documents. Total added: 1000
Added batch of 500 documents. Total added: 1500
Added batch of 500 documents. Total added: 2000
Added batch of 500 documents. Total added: 2500
Added batch of 500 documents. Total added: 3000
Added batch of 500 documents. Total added: 3500
Added batch of 500 documents. Total added: 4000
Added batch of 500 documents. Total added: 4500
Added batch of 500 documents. Total added: 5000
Added batch of 500 documents. Total added: 5500
Added batch of 500 documents. Total added: 6000
Added batch of 500 documents. Total added: 6500
Added batch of 500 documents. Total added: 7000
Added batch 

**Step 9. Query Setup and Result Formatting for Sentiment Analysis Database**

In [13]:
import chromadb
from chromadb.utils import embedding_functions
# Initialize ChromaDB client
client = chromadb.Client()
# Get the existing collection
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_collection(name="sentiment_analysis", embedding_function=sentence_transformer_ef)
def print_results(query, results):
    print(f"\nQuery: '{query}'")
    print(f"Number of results: {len(results['documents'][0])}")
    for doc, metadata, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):
        print("-" * 50)
        print(f"Distance: {distance}")
        print(f"Caption: {metadata['cleaned_captions']}")
        print(f"Comments: {metadata['cleaned_comments']}")
        print(f"User: {metadata['user']}")
        print(f"Location: {metadata['location']}")
        print(f"Tags: {metadata['tags']}")
        print(f"Category: {metadata['category']}")
        print(f"Sentiment Category: {metadata['sentiment_category']}")
        print(f"Sentiment Score: {metadata['sentiment_score']}")
        print(f"Likes: {metadata['likes']}")
        print(f"Followers: {metadata['followers']}")

**Step 10. Executing and Displaying Targeted Query for Travel and Food Content**

In [14]:
# Query for travel influencers showcasing diverse cuisines
query = "I'm looking for travel influencers who shares diverse cuisines from around the world with high engagement rate."
results = collection.query(
    query_texts=[query],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print_results(query, results)


Query: 'I'm looking for travel influencers who shares diverse cuisines from around the world with high engagement rate.'
Number of results: 5
--------------------------------------------------
Distance: 1.1496665477752686
Caption: soul food sessions mission increase diversity culinary industry partnering cocacolaconsolidated bring popup dinners titled table set multiple cities tour wrapping thedewberrycharleston october 16th chef greg collier contributing featured dish new orleans grillades green strawberry relish paired cocacola happy give try ingredients harris teeter wont last time find recipe buy tickets via link bio swiping stories soulfoodsessionsclt soulfoodsessions yourseatiswaiting thetableisset sponsored
Comments: looks great sals delicious | wow looks delicious great initiative | looks awesome | excellent idea looks | fun cool idea | ill check | oh looks yummy | thats serious plate | earrings | looks fun ive seeing strawberry relish lately looks delicious | looks fantastic 

In [15]:
# Additional queries
queries = [
    "Popular food bloggers with kids"
]

for query in queries:
    results = collection.query(
        query_texts=[query],
        n_results=3,
        include=['documents', 'distances', 'metadatas']
    )
    print_results(query, results)




Query: 'Popular food bloggers with kids'
Number of results: 3
--------------------------------------------------
Distance: 0.838355541229248
Caption: kids away parents play first stop breakfast whenthekidsareaway grownups bloggermom momlife momblog mommyblogger mommylife motherhood boymom toddlermom motherhoodthroughinstagram littleandbrave eternalmotherhood lovelysquares mytinymoments thehappynow honestmotherhood joyfulmamas ohheymama momlifestyle dailyparenting pursuepretty lifestyleblogger foodblogger azblogger recipeforasweetlife
Comments: dont forget | cute check channel likeim poking fun better half might enjoy | cute sandals | gim leopard | enjoy | cute shoes | best one far
User: recipeforasweetlife
Location: 
Tags: #whenthekidsareaway #grownups #bloggermom #momlife #momblog #mommyblogger #mommylife #motherhood #boymom #toddlermom #motherhoodthroughinstagram #littleandbrave #eternalmotherhood #lovelysquares #mytinymoments #thehappynow #honestmotherhood #joyfulmamas #ohheymama #

**Step 11. Exporting ChromaDB Collection**

In [17]:
import chromadb
from chromadb.utils import embedding_functions
import json

def export_collection():
    # Initialize ChromaDB client
    client = chromadb.Client()

    # Get the existing collection
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    collection = client.get_collection(name="sentiment_analysis", embedding_function=sentence_transformer_ef)

    # Get all items from the collection
    results = collection.get(include=['embeddings', 'documents', 'metadatas'])

    # Prepare the data for export
    export_data = {
        'ids': results['ids'],
        'embeddings': results['embeddings'],
        'documents': results['documents'],
        'metadatas': results['metadatas']
    }

    # Export to a JSON file
    with open('sentiment_analysis_collection_export.json', 'w') as f:
        json.dump(export_data, f)

    print("Collection exported successfully to 'sentiment_analysis_collection_export.json'")

# Call the function to export the collection
export_collection()



Collection exported successfully to 'sentiment_analysis_collection_export.json'
