- **Objective**: Extract data from JSON files containing Instagram post metadata.
- **Process**:
    - Load JSON files into a Python script.
    - Parse the JSON structure to extract relevant fields such as captions, user tags, hashtags, timestamps, sponsorship status, number of likes, and comments.
    - Store the extracted data in a structured format, such as a Pandas DataFrame, for further processing.

In [3]:
import json
import csv
import os

# Function to extract details from the JSON data
def extract_info(data):
    info_list = []
    
    # Extracting user information
    owner = data.get('owner', {})
    user = f"{owner.get('full_name', '')} (@{owner.get('username', '')})"
    
    # Extracting location information
    location = data.get('location', {})
    location_name = location.get('name', '') if location else 'N/A'
    
    # Extracting caption highlights and tags
    captions = data.get('edge_media_to_caption', {}).get('edges', [])
    
    # Extracting engagement details
    likes = data.get('edge_media_preview_like', {}).get('count', 0)
    comments_data = data.get('edge_media_to_parent_comment', {})
    comments_count = comments_data.get('count', 0)
    comments = comments_data.get('edges', [])
    comments_list = ' | '.join([comment['node']['text'] for comment in comments])
    
    # Extracting tagged brands
    tagged_users = data.get('edge_media_to_tagged_user', {}).get('edges', [])
    tagged_brands = ', '.join([user['node']['user']['full_name'] for user in tagged_users])
    
    for caption in captions:
        caption_text = caption['node']['text']
        tags = ' '.join([tag for tag in caption_text.split() if tag.startswith('#')])
        
        info = {
            'User': user,
            'Location': location_name,
            'Caption Text': caption_text,
            'Tags': tags,
            'Likes': likes,
            'Comments Count': comments_count,
            'Comments List': comments_list,
            'Tagged Brands': tagged_brands
        }
        
        info_list.append(info)
    
    return info_list

# Define the path where the JSON files are stored
input_path = "/Users/nurgul/Library/CloudStorage/OneDrive-UniversityofEastLondon/DS7010_Dissertation/Data/info"

# Define the output CSV file path
output_csv = "/Users/nurgul/Documents/Projects/Dissertation Code/data/instagram_data.csv"

# List to store all the extracted information
data_list = []

# Process each file in the input directory
for file_name in os.listdir(input_path):
    if file_name.endswith('.info'):
        file_path = os.path.join(input_path, file_name)
        
        # Read the JSON data from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)
        
        # Extract information from the JSON data
        info_list = extract_info(json_data)
        
        # Append the extracted information to the data list
        data_list.extend(info_list)

# Write the extracted information to the CSV file
with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['User', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count', 'Comments List', 'Tagged Brands']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    
    # Write the header
    writer.writeheader()
    
    # Write the data rows
    for data in data_list:
        writer.writerow(data)

print("Data extraction and CSV file creation completed successfully.")

Data extraction and CSV file creation completed successfully.


- **Objective**: Clean and preprocess the extracted data to ensure consistency and quality.
- **Process**:
    - **Text Normalization**: Remove special characters, emojis, and stopwords from captions and comments using libraries like NLTK and SpaCy.
    - **Metadata Cleaning**: Handle missing data by imputing or removing incomplete records.
    - **Column Renaming**: Standardize column names by stripping leading and trailing spaces and converting them to a consistent case.

In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords

# Ensure you have downloaded the stopwords
import nltk
nltk.download('stopwords')

def clean_text(text):
    if isinstance(text, float):
        return ""
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Load the CSV data
data = pd.read_csv('/Users/nurgul/Documents/Projects/Dissertation Code/data/instagram_data.csv')

# Clean the 'Caption Text' and 'Comments List' columns
data['Cleaned Caption'] = data['Caption Text'].apply(clean_text)
data['Cleaned Comments'] = data['Comments List'].apply(clean_text)

# Handle missing values
data.dropna(subset=['Cleaned Caption'], inplace=True)
data.fillna('', inplace=True)

# Save the cleaned data to a new CSV file
cleaned_output_csv = '/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data.csv'
data.to_csv(cleaned_output_csv, index=False)

print("Data cleaning completed successfully.")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nurgul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data cleaning completed successfully.


- **Objective**: Integrate the number of followers for each influencer into the cleaned Instagram data.
- **Process**:
    - Load the followers data from a CSV file.
    - Clean the followers data by stripping spaces and converting usernames to lowercase ***(file name: Cleaning the data step 2; cleaned_instagram_data_cleaned.csv)*.**
    - Merge the cleaned followers data with the Instagram data based on the username, ensuring the correct alignment of data.
    - Save the merged data to a new CSV file ***(file name: Merging cleaned datas.py; cleaned_instagram_data_final.csv)***.

In [17]:
import pandas as pd
import re

# Define the file path for cleaned_instagram_data
cleaned_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data.csv"

# Read the CSV file
cleaned_data_df = pd.read_csv(cleaned_data_file)

# Print the column names to debug
print("Original Columns in cleaned_data_df:", cleaned_data_df.columns)

# Ensure column names are stripped of extra spaces
cleaned_data_df.columns = cleaned_data_df.columns.str.strip()

# Rename columns to remove leading/trailing spaces
cleaned_data_df.rename(columns=lambda x: x.strip(), inplace=True)

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove emoji and other special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
    return text

# Clean and rename 'User' column
if 'User' in cleaned_data_df.columns:
    cleaned_data_df['Cleaned User'] = cleaned_data_df['User'].str.strip().str.lower()
    cleaned_data_df['Cleaned User'] = cleaned_data_df['Cleaned User'].str.extract(r'\((@[\w\._]+)\)')[0]
    cleaned_data_df['Cleaned User'] = cleaned_data_df['Cleaned User'].str.replace('@', '').str.strip()
    cleaned_data_df = cleaned_data_df.drop('User', axis=1)
    print("'User' column cleaned, renamed to 'Cleaned User', and original 'User' column removed")
else:
    print("'User' column not found in cleaned_data_df")

# Clean 'captions' and add a new 'Cleaned Captions' column
if 'captions' in cleaned_data_df.columns:
    cleaned_data_df['Cleaned Captions'] = cleaned_data_df['captions'].apply(clean_text)
    cleaned_data_df = cleaned_data_df.drop('captions', axis=1)
    print("Original 'captions' column deleted and replaced with 'Cleaned Captions'")
else:
    print("'captions' column not found in cleaned_data_df")

# Drop Cleaned Comments and Caption Text columns
cleaned_data_df = cleaned_data_df.drop(['Cleaned Comments', 'Caption Text'], axis=1)

# Define the desired column order
desired_order = ['Cleaned User', 'Location', 'Cleaned Captions', 'Likes', 'Comments count', 'Comments list', 'Tagged brands', 'Tags']

# Create a new list with only the columns that exist in the dataframe, in the specified order
new_order = [col for col in desired_order if col in cleaned_data_df.columns]

# Add any remaining columns that weren't in the desired_order list
remaining_columns = [col for col in cleaned_data_df.columns if col not in new_order]
final_order = new_order + remaining_columns

# Reorder the columns
cleaned_data_df = cleaned_data_df[final_order]

# Save the cleaned dataframe to a new CSV file
cleaned_data_df.to_csv("/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_cleaned.csv", index=False)
print("Cleaned Instagram data saved successfully.")

# Print the final column names
print("Final columns in cleaned_data_df:", cleaned_data_df.columns)

Original Columns in cleaned_data_df: Index(['User', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count',
       'Comments List', 'Tagged Brands', 'Cleaned Caption',
       'Cleaned Comments'],
      dtype='object')
'User' column cleaned, renamed to 'Cleaned User', and original 'User' column removed
'captions' column not found in cleaned_data_df
Cleaned Instagram data saved successfully.
Final columns in cleaned_data_df: Index(['Cleaned User', 'Location', 'Likes', 'Tags', 'Comments Count',
       'Comments List', 'Tagged Brands', 'Cleaned Caption'],
      dtype='object')


In [None]:
import pandas as pd

# Define the file paths
followers_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/Number of followers for each influencer.csv"
cleaned_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_cleaned.csv"

# Read the CSV files
followers_df = pd.read_csv(followers_file)
cleaned_data_df = pd.read_csv(cleaned_data_file)

# Ensure column names are stripped of extra spaces
followers_df.columns = followers_df.columns.str.strip()
cleaned_data_df.columns = cleaned_data_df.columns.str.strip()

# Strip leading/trailing spaces in usernames
followers_df['Username'] = followers_df['Username'].str.strip()
cleaned_data_df['User'] = cleaned_data_df['User'].str.strip()

# Convert usernames to lower case to avoid case sensitivity issues
followers_df['Username'] = followers_df['Username'].str.lower()
cleaned_data_df['User'] = cleaned_data_df['User'].str.lower()

# Merge the two dataframes on the username column, including the Category column
merged_df = pd.merge(cleaned_data_df, followers_df[['Username', 'Followers', 'Category']], left_on='User', right_on='Username', how='left')

# Check if the merge was successful by printing some rows with followers and category
print("Sample rows from merged_df with followers and category:")
print(merged_df[~merged_df['Followers'].isna()].head())

# Save the merged dataframe to a new CSV file
merged_df.to_csv("/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_final.csv", index=False)

print("Followers and Category added to the cleaned Instagram data successfully.")


**Step 4. Removing non-english captions**

In [10]:
import pandas as pd
from langdetect import detect, LangDetectException

# Define the file paths
followers_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/Number of followers for each influencer.csv"
cleaned_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_cleaned.csv"

# Read the CSV files
followers_df = pd.read_csv(followers_file)
cleaned_data_df = pd.read_csv(cleaned_data_file)

# Ensure column names are stripped of extra spaces
followers_df.columns = followers_df.columns.str.strip()
cleaned_data_df.columns = cleaned_data_df.columns.str.strip()

# Strip leading/trailing spaces in usernames
followers_df['Username'] = followers_df['Username'].str.strip()
cleaned_data_df['User'] = cleaned_data_df['User'].str.strip()

# Convert usernames to lower case to avoid case sensitivity issues
followers_df['Username'] = followers_df['Username'].str.lower()
cleaned_data_df['User'] = cleaned_data_df['User'].str.lower()

# Print column names to verify the correct text column
print("Column names in cleaned_data_df:", cleaned_data_df.columns)

# Function to detect language and filter out non-English rows
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Verify the column name that contains the text
text_column = 'Caption Text'  # Update this to the correct column name after inspecting the columns

if text_column in cleaned_data_df.columns:
    # Count the number of non-English rows
    initial_count = len(cleaned_data_df)
    cleaned_data_df = cleaned_data_df[cleaned_data_df[text_column].apply(is_english)]
    final_count = len(cleaned_data_df)
    non_english_count = initial_count - final_count
    print(f"Removed {non_english_count} non-English rows.")
else:
    print(f"Column '{text_column}' not found in the DataFrame.")

# Merge the two dataframes on the username column, including the Category column
merged_df = pd.merge(cleaned_data_df, followers_df[['Username', 'Followers', 'Category']], left_on='User', right_on='Username', how='left')

# Check if the merge was successful by printing some rows with followers and category
print("Sample rows from merged_df with followers and category:")
print(merged_df[~merged_df['Followers'].isna()].head())

# Save the merged dataframe to a new CSV file
merged_df.to_csv("/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_final.csv", index=False)

print("Non-english rows were removed successfully.")


Column names in cleaned_data_df: Index(['User', 'Location', 'Caption Text', 'Tags', 'Likes', 'Comments Count',
       'Comments List', 'Tagged Brands', 'Cleaned Caption',
       'Cleaned Comments'],
      dtype='object')
Removed 9230 non-English rows.
Sample rows from merged_df with followers and category:
               User     Location  \
0           aleyahs  White Point   
1         pxl.house          NaN   
2        merakilane          NaN   
3      zanabfarooq_          NaN   
4  lightscamerabake          NaN   

                                        Caption Text  \
0  Island attire inspired by Alex Colville.\n.\n....   
1  Photograph for @reebokwomen's fall issue of RA...   
2  One of my Friday afternoon rituals is to sit d...   
3                     Tis the season to be jolly 🎄🎁.   
4  {NEW} on the #blog today! Peanut Butter Bars 💕...   

                                                Tags  Likes  Comments Count  \
0  #capebretonisland #whitepoint #newhaven #visit...   78.0

**Step 5. Calculating number of influencers per each category**

In [14]:
import pandas as pd

# Define the file path for the merged data
merged_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_final.csv"

# Read the merged CSV file
merged_df = pd.read_csv(merged_data_file)

# Calculate the number of influencers in each category
influencers_per_category = merged_df.groupby('Category')['Username'].nunique().reset_index()
influencers_per_category.columns = ['Category', 'Number of Influencers']

# Print the number of influencers in each category
print("Number of influencers in each category:")
print(influencers_per_category)

# Save the influencers per category to a new CSV file
influencers_per_category.to_csv("/Users/nurgul/Documents/Projects/Dissertation Code/data/influencers_per_category.csv", index=False)

print("Number of influencers in each category calculated and saved successfully.")


Number of influencers in each category:
   Category  Number of Influencers
0    beauty                    771
1    family                   2153
2   fashion                   6231
3    fasion                      1
4   fitness                    616
5      food                   2038
6  interior                    646
7     other                   3050
8       pet                    349
9    travel                   2303
Number of influencers in each category calculated and saved successfully.


**Step 6. Remain only necessary categories**

In [17]:
import pandas as pd

# Define the file path for the merged data
merged_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_final.csv"

# Read the merged CSV file
merged_df = pd.read_csv(merged_data_file)

# Define the categories to remain
categories_to_remain = ['beauty', 'family', 'fashion', 'fitness', 'food', 'travel']

# Filter the DataFrame to include only the specified categories
filtered_df = merged_df[merged_df['Category'].isin(categories_to_remain)]

# Save the filtered DataFrame to a new CSV file
filtered_data_file = "/Users/nurgul/Documents/Projects/Dissertation Code/data/cleaned_instagram_data_filtered.csv"
filtered_df.to_csv(filtered_data_file, index=False)

print("Filtered data saved successfully")


Filtered data saved successfully


**Step 7. Sentiment analysis** - positive only

In [4]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import os

# Download the NLTK sentiment analysis model
nltk.download('vader_lexicon', quiet=True)

# Set the file path and name
file_path = "/Users/nurgul/Documents/Projects/Dissertation Code/data"
file_name = "cleaned_instagram_data_filtered.csv"
full_path = os.path.join(file_path, file_name)

# Read the CSV file
df = pd.read_csv(full_path)

# Initialize the NLTK sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to perform sentiment analysis on a text
def analyze_sentiment(text):
    if pd.isna(text):
        return {'score': 0, 'category': 'Neutral'}  # Return neutral sentiment for empty/NaN values
    sentiment_score = sia.polarity_scores(text)['compound']
    
    if sentiment_score > 0.05:
        category = 'Positive'
    elif sentiment_score < -0.05:
        category = 'Negative'
    else:
        category = 'Neutral'
    
    return {'score': sentiment_score, 'category': category}

# Perform sentiment analysis on the 'Cleaned Comments' column
sentiment_results = df['Cleaned Comments'].apply(analyze_sentiment)

# Add new columns for Sentiment Score and Sentiment Category
df['Sentiment Score'] = sentiment_results.apply(lambda x: x['score'])
df['Sentiment Category'] = sentiment_results.apply(lambda x: x['category'])

# Filter to keep only positive rows
df_positive = df[df['Sentiment Category'] == 'Positive']

# Display the first few rows of the updated dataset (positive only)
print(df_positive[['Cleaned Comments', 'Sentiment Score', 'Sentiment Category']].head())

# Calculate and display overall sentiment statistics for positive comments
sentiment_stats = df_positive['Sentiment Category'].value_counts(normalize=True) * 100
print("\nPositive Sentiment Distribution:")
print(sentiment_stats)

# Save the updated dataset (positive only) to a new CSV file
output_file = os.path.join(file_path, "instagram_comments_positive_only.csv")
df_positive.to_csv(output_file, index=False)
print(f"\nSentiment analysis complete. Positive comments saved to '{output_file}'")

                                    Cleaned Comments  Sentiment Score  \
0  perfect great shot love good one loooove paint...           0.9565   
2  merakilane lifeisgood behappy thatsdarling lif...           0.9694   
3  omg love whole christmasy vibes fun seeing dec...           0.9970   
4  lightscamerabake foodporn foodofinsta instafoo...           0.9769   
5  haha caption though law handsome super model m...           0.9719   

  Sentiment Category  
0           Positive  
2           Positive  
3           Positive  
4           Positive  
5           Positive  

Positive Sentiment Distribution:
Sentiment Category
Positive    100.0
Name: proportion, dtype: float64

Sentiment analysis complete. Positive comments saved to '/Users/nurgul/Documents/Projects/Dissertation Code/data/instagram_comments_positive_only.csv'


**Step 8. Keywords analysis**

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Set the file path
file_path = "/Users/nurgul/Documents/Projects/Dissertation Code/data"
input_file = "instagram_comments_positive_only.csv"
output_file = "categorized_instagram_data.csv"

# Load the data
full_input_path = os.path.join(file_path, input_file)
df = pd.read_csv(full_input_path)

# Define category-specific keywords
category_keywords = {
    'beauty': ['makeup', 'skincare', 'beauty', 'cosmetics', 'hair', 'nails', 'facial', 'lipstick', 'eyeliner', 'mascara'],
    'fashion': ['style', 'outfit', 'fashion', 'clothes', 'accessories', 'dress', 'shoes', 'handbag', 'jewelry', 'trend'],
    'family': ['family', 'kids', 'parenting', 'children', 'home', 'baby', 'mom', 'dad', 'sibling', 'grandparent'],
    'fitness': ['workout', 'gym', 'fitness', 'exercise', 'health', 'muscle', 'training', 'cardio', 'strength', 'yoga'],
    'food': ['recipe', 'cooking', 'food', 'meal', 'restaurant', 'cuisine', 'diet', 'nutrition', 'chef', 'baking'],
    'travel': ['travel', 'vacation', 'trip', 'adventure', 'destination', 'tourism', 'hotel', 'flight', 'explore', 'sightseeing']
}

# Flatten the list of keywords
all_keywords = [word for words in category_keywords.values() for word in words]

# Function to categorize post based on keywords
def categorize_post(text):
    text = str(text).lower()  # Convert to string and lowercase
    categories = []
    for category, keywords in category_keywords.items():
        if any(keyword in text for keyword in keywords):
            categories.append(category)
    return categories

# Apply categorization to "Cleaned Caption"
df['categories'] = df['Cleaned Caption'].apply(categorize_post)

# Remove rows with empty categories (previously uncategorized)
df = df[df['categories'].apply(len) > 0]

# Function to get top keywords using predefined list
def get_top_keywords_predefined(text, keywords, top_n=5):
    text = str(text).lower()
    found_keywords = [word for word in keywords if word in text]
    return found_keywords[:top_n]

# Function to get top keywords using TF-IDF
def get_top_keywords_tfidf(tfidf_vector, feature_names, top_n=5):
    coo_matrix = tfidf_vector.tocoo()
    tuples = zip(coo_matrix.col, coo_matrix.data)
    sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
    sorted_items = sorted_items[:top_n]
    return [feature_names[idx] for idx, _ in sorted_items]

# Choose method: 'predefined' or 'tfidf'
method = 'predefined'  # Change this to 'tfidf' if you want to use TF-IDF

if method == 'predefined':
    df['top_keywords'] = df['Cleaned Caption'].apply(lambda x: get_top_keywords_predefined(x, all_keywords))
else:  # TF-IDF method
    tfidf = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['Cleaned Caption'].fillna(''))
    feature_names = tfidf.get_feature_names_out()
    df['top_keywords'] = [get_top_keywords_tfidf(tfidf_matrix[i], feature_names) for i in range(tfidf_matrix.shape[0])]

# Calculate category distribution
category_distribution = df['categories'].explode().value_counts(normalize=True)
print("Category Distribution:")
print(category_distribution)

# Print top keywords for the first few rows
print("\nSample Top Keywords:")
print(df['top_keywords'].head())

# Save the results
output_path = os.path.join(file_path, output_file)
df.to_csv(output_path, index=False)
print(f"\nKeyword analysis complete. Results saved to '{output_path}'")

Category Distribution:
categories
fashion    0.258260
family     0.231107
travel     0.155353
food       0.133486
beauty     0.130902
fitness    0.090893
Name: proportion, dtype: float64

Sample Top Keywords:
0    [vacation, trip, adventure, destination, explore]
3                                             [recipe]
5                                            [fashion]
7                                            [explore]
8                          [dress, health, food, chef]
Name: top_keywords, dtype: object

Keyword analysis complete. Results saved to '/Users/nurgul/Documents/Projects/Dissertation Code/data/categorized_instagram_data.csv'
