In [1]:
import json
import csv
import os
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from langdetect import detect, LangDetectException
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# File paths
BASE_PATH = os.path.expanduser("~/Documents/Projects/Dissertation Code/data")
RAW_DATA_PATH = os.path.expanduser("~/Library/CloudStorage/OneDrive-UniversityofEastLondon/DS7010_Dissertation/Data/info")
FOLLOWERS_FILE = os.path.join(BASE_PATH, "Number of followers for each influencer.csv")
INTERIM_DATA_FILE = os.path.join(BASE_PATH, "instagram_data.csv")
CLEANED_DATA_FILE = os.path.join(BASE_PATH, "1.Cleaned_instagram_data.csv")
MERGED_DATA_FILE = os.path.join(BASE_PATH, "2.Merged_instagram_data.csv")
FILTERED_DATA_FILE = os.path.join(BASE_PATH, "3.Filtered_instagram_data.csv")
FINAL_DATA_FILE = os.path.join(BASE_PATH, "4.Final_instagram_data.csv")

os.makedirs(BASE_PATH, exist_ok=True)

def clean_text(text):
    if isinstance(text, float):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def extract_info(data):
    owner = data.get('owner', {})
    user = f"{owner.get('full_name', '')} (@{owner.get('username', '')})"
    location = data.get('location', {})
    location_name = location.get('name', '') if location else 'N/A'
    captions = data.get('edge_media_to_caption', {}).get('edges', [])
    likes = data.get('edge_media_preview_like', {}).get('count', 0)
    comments_data = data.get('edge_media_to_parent_comment', {})
    comments_count = comments_data.get('count', 0)
    comments = comments_data.get('edges', [])
    comments_list = ' | '.join([comment['node']['text'] for comment in comments])
    tagged_users = data.get('edge_media_to_tagged_user', {}).get('edges', [])
    tagged_brands = ', '.join([user['node']['user']['full_name'] for user in tagged_users])
    
    info_list = []
    for caption in captions:
        caption_text = caption['node']['text']
        tags = ' '.join([tag for tag in caption_text.split() if tag.startswith('#')])
        
        info = {
            'User': user,
            'Location': location_name,
            'Caption Text': caption_text,
            'Tags': tags,
            'Likes': likes,
            'Comments Count': comments_count,
            'Comments List': comments_list,
            'Tagged Brands': tagged_brands
        }
        info_list.append(info)
    
    return info_list

def process_raw_data():
    data_list = []
    for file_name in os.listdir(RAW_DATA_PATH):
        if file_name.endswith('.info'):
            file_path = os.path.join(RAW_DATA_PATH, file_name)
            with open(file_path, 'r') as file:
                json_data = json.load(file)
            info_list = extract_info(json_data)
            data_list.extend(info_list)

    with open(INTERIM_DATA_FILE, 'w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['User', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count', 'Comments List', 'Tagged Brands']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for data in data_list:
            writer.writerow(data)

def clean_data():
    data = pd.read_csv(INTERIM_DATA_FILE)
    data['Cleaned Caption'] = data['Caption Text'].apply(clean_text)
    data['Cleaned Comments'] = data['Comments List'].apply(clean_text)
    data.dropna(subset=['Cleaned Caption'], inplace=True)
    data.fillna('', inplace=True)
    data.to_csv(CLEANED_DATA_FILE, index=False)

def merge_with_followers():
    try:
        followers_df = pd.read_csv(FOLLOWERS_FILE)
        cleaned_data_df = pd.read_csv(CLEANED_DATA_FILE)
        
        followers_df['Username'] = followers_df['Username'].str.strip().str.lower()
        cleaned_data_df['User'] = cleaned_data_df['User'].str.extract(r'\(@([^)]+)\)')[0].str.strip().str.lower()
        
        merged_df = pd.merge(cleaned_data_df, followers_df[['Username', 'Followers', 'Category']], 
                             left_on='User', right_on='Username', how='left')
        
        if merged_df.empty:
            return False
        
        merged_df.to_csv(MERGED_DATA_FILE, index=False)
        return True
    except Exception as e:
        return False

def filter_categories():
    try:
        merged_df = pd.read_csv(MERGED_DATA_FILE)
        
        categories_to_remain = ['beauty', 'family', 'fashion', 'fitness', 'food', 'travel']
        filtered_df = merged_df[merged_df['Category'].isin(categories_to_remain)]
        
        if filtered_df.empty:
            return False
        
        filtered_df.to_csv(FILTERED_DATA_FILE, index=False)
        return True
    except Exception as e:
        return False

def perform_sentiment_analysis():
    df = pd.read_csv(FILTERED_DATA_FILE)
    sia = SentimentIntensityAnalyzer()
    
    def analyze_sentiment(text):
        if pd.isna(text):
            return {'score': 0, 'category': 'Neutral'}
        sentiment_score = sia.polarity_scores(text)['compound']
        category = 'Positive' if sentiment_score > 0.05 else 'Negative' if sentiment_score < -0.05 else 'Neutral'
        return {'score': sentiment_score, 'category': category}
    
    sentiment_results = df['Cleaned Comments'].apply(analyze_sentiment)
    df['Sentiment Score'] = sentiment_results.apply(lambda x: x['score'])
    df['Sentiment Category'] = sentiment_results.apply(lambda x: x['category'])
    
    df_positive = df[df['Sentiment Category'] == 'Positive']
    return df_positive

def categorize_and_extract_keywords(df):
    category_keywords = {
        'beauty': ['makeup', 'skincare', 'beauty', 'cosmetics', 'hair', 'nails', 'facial', 'lipstick', 'eyeliner', 'mascara'],
        'fashion': ['style', 'outfit', 'fashion', 'clothes', 'accessories', 'dress', 'shoes', 'handbag', 'jewelry', 'trend'],
        'family': ['family', 'kids', 'parenting', 'children', 'home', 'baby', 'mom', 'dad', 'sibling', 'grandparent'],
        'fitness': ['workout', 'gym', 'fitness', 'exercise', 'health', 'muscle', 'training', 'cardio', 'strength', 'yoga'],
        'food': ['recipe', 'cooking', 'food', 'meal', 'restaurant', 'cuisine', 'diet', 'nutrition', 'chef', 'baking'],
        'travel': ['travel', 'vacation', 'trip', 'adventure', 'destination', 'tourism', 'hotel', 'flight', 'explore', 'sightseeing']
    }
    all_keywords = [word for words in category_keywords.values() for word in words]

    def categorize_post(text):
        text = str(text).lower()
        return [category for category, keywords in category_keywords.items() if any(keyword in text for keyword in keywords)]

    def get_top_keywords(text, keywords, top_n=5):
        text = str(text).lower()
        found_keywords = [word for word in keywords if word in text]
        return found_keywords[:top_n]

    df['categories'] = df['Cleaned Caption'].apply(categorize_post)
    df = df[df['categories'].apply(len) > 0]
    df['top_keywords'] = df['Cleaned Caption'].apply(lambda x: get_top_keywords(x, all_keywords))
    
    return df

def main():
    try:
        if not os.path.exists(INTERIM_DATA_FILE):
            process_raw_data()

        clean_data()

        if not merge_with_followers():
            print("Merging with followers data failed. Stopping execution.")
            return

        if not filter_categories():
            print("Filtering categories failed. Stopping execution.")
            return

        df_positive = perform_sentiment_analysis()
        final_df = categorize_and_extract_keywords(df_positive)
        final_df.to_csv(FINAL_DATA_FILE, index=False)
        print(f"Analysis complete. Final results saved to '{FINAL_DATA_FILE}'")
    except FileNotFoundError as e:
        print(f"File not found: {str(e)}")
        print("Please make sure all required input files exist and the paths are correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Merging with followers data failed. Stopping execution.


In [5]:
import json
import csv
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

nltk.download('stopwords')
nltk.download('vader_lexicon', quiet=True)

# File paths
BASE_PATH = os.path.expanduser("~/Documents/Projects/Dissertation Code/data")
RAW_DATA_PATH = os.path.expanduser("~/Library/CloudStorage/OneDrive-UniversityofEastLondon/DS7010_Dissertation/Data/info")
FOLLOWERS_FILE = os.path.join(BASE_PATH, "Number of followers for each influencer.csv")
INTERIM_DATA_FILE = os.path.join(BASE_PATH, "instagram_data.csv")
CLEANED_DATA_FILE = os.path.join(BASE_PATH, "1.Cleaned_instagram_data.csv")
MERGED_DATA_FILE = os.path.join(BASE_PATH, "2.Merged_instagram_data.csv")
FILTERED_DATA_FILE = os.path.join(BASE_PATH, "3.Filtered_instagram_data.csv")
FINAL_DATA_FILE = os.path.join(BASE_PATH, "4.Final_instagram_data.csv")

def extract_info(data):
    info_list = []
    owner = data.get('owner', {})
    user = f"{owner.get('full_name', '')} (@{owner.get('username', '')})"
    location = data.get('location') or {}
    location_name = location.get('name', 'N/A')
    captions = data.get('edge_media_to_caption', {}).get('edges', [])
    likes = data.get('edge_media_preview_like', {}).get('count', 0)
    comments_data = data.get('edge_media_to_parent_comment', {})
    comments_count = comments_data.get('count', 0)
    comments_list = ' | '.join(comment['node']['text'] for comment in comments_data.get('edges', []))
    tagged_brands = ', '.join(user['node']['user']['full_name'] for user in data.get('edge_media_to_tagged_user', {}).get('edges', []))

    for caption in captions:
        caption_text = caption['node']['text']
        tags = ' '.join(tag for tag in caption_text.split() if tag.startswith('#'))
        info_list.append({
            'Username': owner.get('username', '').lower().strip(),
            'Location': location_name,
            'Caption Text': caption_text,
            'Tags': tags,
            'Likes': likes,
            'Comments Count': comments_count,
            'Comments List': comments_list,
            'Tagged Brands': tagged_brands
        })

    return info_list

def process_json_files(input_dir, output_csv):
    data_list = []
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.info'):
            with open(os.path.join(input_dir, file_name), 'r') as file:
                json_data = json.load(file)
            data_list.extend(extract_info(json_data))

    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['Username', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count', 'Comments List', 'Tagged Brands']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for data in data_list:
            writer.writerow(data)

    print("Data extraction and CSV file creation completed successfully.")
    print(f"Extracted data sample: {data_list[:5]}")

def clean_text(text):
    if isinstance(text, float):
        return ""
    text = re.sub(r'[^A-Za-z0-9\s]', '', text).lower()
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)

def clean_instagram_data(input_csv, output_csv):
    data = pd.read_csv(input_csv)
    data['Cleaned Caption'] = data['Caption Text'].apply(clean_text)
    data['Cleaned Comments'] = data['Comments List'].apply(clean_text)
    data.dropna(subset=['Cleaned Caption'], inplace=True)
    data.fillna('', inplace=True)
    data.to_csv(output_csv, index=False)
    print("Data cleaning completed successfully.")
    print(f"Cleaned data sample: {data.head()}")

def merge_followers_data(cleaned_csv, followers_csv, output_csv):
    cleaned_data_df = pd.read_csv(cleaned_csv)
    followers_df = pd.read_csv(followers_csv)
    
    # Clean column names
    cleaned_data_df.columns = cleaned_data_df.columns.str.strip()
    followers_df.columns = followers_df.columns.str.strip()
    
    # Ensure 'Username' column exists in both dataframes
    if 'Username' not in cleaned_data_df.columns or 'Username' not in followers_df.columns:
        print("Error: 'Username' column is missing in one of the data files.")
        return
    
    # Normalize 'Username' columns
    cleaned_data_df['Username'] = cleaned_data_df['Username'].str.strip().str.lower()
    followers_df['Username'] = followers_df['Username'].str.strip().str.lower()
    
    # Merge DataFrames
    merged_df = pd.merge(cleaned_data_df, followers_df[['Username', 'Followers', 'Category']], on='Username', how='left')
    
    # Print sample of merged data
    print("Merged DataFrame sample:")
    print(merged_df.head())
    
    merged_df.to_csv(output_csv, index=False)
    print("Followers and Category added to the cleaned Instagram data successfully.")

def filter_non_english_rows(input_csv, output_csv, text_column):
    df = pd.read_csv(input_csv)
    if text_column in df.columns:
        initial_count = len(df)
        df = df[df[text_column].apply(lambda x: len(str(x).strip()) > 0 and is_english(x))]
        final_count = len(df)
        print(f"Removed {initial_count - final_count} non-English rows.")
    df.to_csv(output_csv, index=False)
    print("Non-English rows were removed successfully.")
    print(f"Filtered data sample: {df.head()}")

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def calculate_influencers_per_category(merged_csv, output_csv):
    merged_df = pd.read_csv(merged_csv)
    
    influencers_per_category = merged_df.groupby('Category')['Username'].nunique().reset_index()
    influencers_per_category.columns = ['Category', 'Number of Influencers']
    
    print("Influencers per category sample:")
    print(influencers_per_category.head())
    
    influencers_per_category.to_csv(output_csv, index=False)
    print("Number of influencers in each category calculated and saved successfully.")

def filter_categories(input_csv, output_csv, categories):
    df = pd.read_csv(input_csv)
    
    print("Categories before filtering:", df['Category'].unique())
    filtered_df = df[df['Category'].isin(categories)]
    
    print("Filtered DataFrame sample:")
    print(filtered_df.head())
    
    filtered_df.to_csv(output_csv, index=False)
    print("Filtered data saved successfully.")

def analyze_sentiment(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    sia = SentimentIntensityAnalyzer()
    
    print("DataFrame sample before sentiment analysis:")
    print(df.head())
    
    df['Sentiment Score'] = df['Cleaned Comments'].apply(lambda x: sia.polarity_scores(x)['compound'] if isinstance(x, str) else 0)
    df['Sentiment Category'] = df['Sentiment Score'].apply(lambda x: 'Positive' if x > 0.05 else 'Negative' if x < -0.05 else 'Neutral')
    df_positive = df[df['Sentiment Category'] == 'Positive']
    
    print("Positive sentiment DataFrame sample:")
    print(df_positive.head())
    
    df_positive.to_csv(output_csv, index=False)
    print("Sentiment analysis complete. Positive comments saved.")

def categorize_instagram_data(input_csv, output_csv, method='predefined'):
    df = pd.read_csv(input_csv)
    
    print("DataFrame sample before categorization:")
    print(df.head())
    
    category_keywords = {
        'beauty': ['makeup', 'skincare', 'beauty', 'cosmetics', 'hair', 'nails', 'facial', 'lipstick', 'eyeliner', 'mascara'],
        'fashion': ['style', 'outfit', 'fashion', 'clothes', 'accessories', 'dress', 'shoes', 'handbag', 'jewelry', 'trend'],
        'family': ['family', 'kids', 'parenting', 'children', 'home', 'baby', 'mom', 'dad', 'sibling', 'grandparent'],
        'fitness': ['workout', 'gym', 'fitness', 'exercise', 'health', 'muscle', 'training', 'cardio', 'strength', 'yoga'],
        'food': ['recipe', 'cooking', 'food', 'meal', 'restaurant', 'cuisine', 'diet', 'nutrition', 'chef', 'baking'],
        'travel': ['travel', 'vacation', 'trip', 'adventure', 'destination', 'tourism', 'hotel', 'flight', 'explore', 'sightseeing']
    }
    all_keywords = [word for words in category_keywords.values() for word in words]

    def categorize_post(text):
        text = str(text).lower()
        categories = [category for category, keywords in category_keywords.items() if any(keyword in text for keyword in keywords)]
        return categories

    df['categories'] = df['Cleaned Caption'].apply(categorize_post)
    df = df[df['categories'].apply(len) > 0]

    def get_top_keywords_predefined(text, keywords, top_n=5):
        text = str(text).lower()
        found_keywords = [word for word in keywords if word in text]
        return found_keywords[:top_n]

    if method == 'predefined':
        df['top_keywords'] = df['Cleaned Caption'].apply(lambda x: get_top_keywords_predefined(x, all_keywords))
    else:
        tfidf = TfidfVectorizer(max_features=1000)
        tfidf_matrix = tfidf.fit_transform(df['Cleaned Caption'].fillna(''))
        feature_names = tfidf.get_feature_names_out()
        df['top_keywords'] = [get_top_keywords_tfidf(tfidf_matrix[i], feature_names) for i in range(tfidf_matrix.shape[0])]

    print("Categorized DataFrame sample:")
    print(df.head())
    
    df.to_csv(output_csv, index=False)
    print(f"Keyword analysis complete. Results saved to '{output_csv}'")

if __name__ == "__main__":
    # process_json_files(RAW_DATA_PATH, INTERIM_DATA_FILE)
    # clean_instagram_data(INTERIM_DATA_FILE, CLEANED_DATA_FILE)
    # filter_non_english_rows(CLEANED_DATA_FILE, CLEANED_DATA_FILE, 'Caption Text')
    merge_followers_data(CLEANED_DATA_FILE, FOLLOWERS_FILE, MERGED_DATA_FILE)
    calculate_influencers_per_category(MERGED_DATA_FILE, os.path.join(BASE_PATH, "influencers_per_category.csv"))
    filter_categories(MERGED_DATA_FILE, FILTERED_DATA_FILE, ['beauty', 'family', 'fashion', 'fitness', 'food', 'travel'])
    analyze_sentiment(FILTERED_DATA_FILE, os.path.join(BASE_PATH, "instagram_comments_positive_only.csv"))
    categorize_instagram_data(os.path.join(BASE_PATH, "instagram_comments_positive_only.csv"), FINAL_DATA_FILE, method='predefined')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nurgul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Merged DataFrame sample:
                                   Username     Location  \
0                       tomo (@lelien_tomo)          NaN   
1           aneta sebestova (@veverkakokos)          NaN   
2                 alice kings (@alicekings)          NaN   
3               ☆ nicole ☆ (@nicolenic1973)       Ostsee   
4  aleyah solomon | photographer (@aleyahs)  White Point   

                                        Caption Text  \
0  大好きだったばあちゃんのミシン。\n高校生の時はよく学校帰りにばあちゃんの家に寄って、夕飯食...   
1  Dneska tohle pocasi zase bylo 😍 co vic chtit.\...   
2                                              Quack   
3  🌅 Im trüben November brauche ich "Stimmungsauf...   
4  Island attire inspired by Alex Colville.\n.\n....   

                                                Tags  Likes  Comments Count  \
0  #danielwellington #ダニエルウェリントン #myclassicdw #サマ...  321.0            16.0   
1  #sunnyday #photography #nature #pond #energy #...   33.0             0.0   
2                                       