In [None]:
%pip install matplotlib
%pip install pandas 
%pip install numpy
%pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [None]:
# Candidate post categories the synthetic dataset samples from
categories = ['Food', 'Travel', 'Fashion', 'Fitness', 'Music', 'Culture', 'Family', 'Health',
                'Technology', 'Art', 'Finance', 'Politics', 'Sports', 'Pets', 'Environment']

# Fixed seed so that "Restart Kernel -> Run All" regenerates the exact same
# dataset (and therefore the same plots and rankings) on every run.
# Dedicated generator objects are used so no global random state is touched.
SEED = 42
category_rng = random.Random(SEED)
likes_rng = np.random.default_rng(SEED)

# Generate random data: one entry per consecutive calendar day from 2021-01-01
n = 1090  # Number of entries
data = {
    'Date': pd.date_range('2021-01-01', periods=n),
    'Category': [category_rng.choice(categories) for _ in range(n)],
    # Upper bound is exclusive, so likes fall in 0..9999
    'Likes': likes_rng.integers(0, 10000, size=n),
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame (pandas truncates the printout for long frames)
print(df)

In [None]:
# Quick structural overview of the generated data.

# First rows
print("DataFrame Head:")
print(df.head())

# Column dtypes / non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
print("\nDataFrame Information:")
df.info()

# Summary statistics of the numeric / datetime columns
print("\nDataFrame Description:")
print(df.describe())

# Number of posts per category
print("\nCount of each 'Category' element:")
print(df['Category'].value_counts())

In [None]:
# Cleaning pipeline, expressed as a single chain:
#   1. drop rows containing nulls
#   2. drop exact duplicate rows
#   3. make sure 'Date' is datetime and 'Likes' is integer
df = (
    df.dropna()
    .drop_duplicates()
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Likes=lambda frame: frame['Likes'].astype(int),
    )
)

# Show the result of the cleaning step
print("Cleaned DataFrame:")
print(df)

In [None]:
# --- Overall distribution of likes: histogram with a KDE overlay ---
hist_fig, hist_ax = plt.subplots(figsize=(12, 8))
sns.histplot(df['Likes'], bins=30, kde=True, color='skyblue', edgecolor='black', ax=hist_ax)
hist_ax.set_title('Histogram of Likes', fontsize=16)
hist_ax.set_xlabel('Likes', fontsize=14)
hist_ax.set_ylabel('Frequency', fontsize=14)
hist_ax.tick_params(axis='both', labelsize=12)
hist_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# --- Spread of likes within each category: one box per category ---
box_fig, box_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='Category', y='Likes', data=df, ax=box_ax)
box_ax.set_title('Boxplot of Likes by Category', fontsize=16)
box_ax.set_xlabel('Category', fontsize=14)
box_ax.set_ylabel('Likes', fontsize=14)
box_ax.tick_params(axis='x', labelrotation=45, labelsize=12)  # tilt category names
box_ax.tick_params(axis='y', labelsize=12)
box_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# --- Summary statistics ---
# Overall mean of the 'Likes' column, rounded to the nearest integer
overall_mean_likes = round(df['Likes'].mean())
print("Mean of 'Likes' category:", overall_mean_likes)

# Rounded mean likes per category, highest first
mean_likes_by_category = df.groupby('Category')['Likes'].mean().round()
print("\nMean of 'Likes' for each Category:")
print(mean_likes_by_category.sort_values(ascending=False))

In [None]:

# Small-multiples view: one likes histogram (with KDE) per category.

# Set style
sns.set_style("whitegrid")

# Create FacetGrid: one panel per category, wrapped into rows of 5
g = sns.FacetGrid(df, col='Category', col_wrap=5, height=5, aspect=1)
g.map(sns.histplot, 'Likes', kde=True, color='skyblue', edgecolor='black')

# Set titles and labels
g.set_titles("{col_name}")
g.set_axis_labels("Likes", "Frequency")

# Tick label size must be set on every facet: plt.xticks()/plt.yticks()
# only act on the *current* axes, i.e. a single panel of the grid.
for facet_ax in g.axes.flat:
    facet_ax.tick_params(axis='both', labelsize=10)

# Leave head-room above the panels and add the main title on the grid's
# own figure rather than on whatever figure pyplot considers current.
g.figure.subplots_adjust(top=0.9)
g.figure.suptitle("Distribution of Likes by Category", fontsize=16)

# Show plot
plt.show()

In [None]:
# One "likes over time" line chart per category.
# Requires 'df' to have 'Date', 'Category', and 'Likes' columns.

# Distinct categories in order of first appearance.
# NOTE: this rebinds the module-level name 'categories' to a NumPy array.
categories = df['Category'].unique()

for category in categories:
    # Rows belonging to the current category only
    category_data = df[df['Category'] == category]

    # Fresh figure/axes per category so the charts do not overlap
    fig, ax = plt.subplots(figsize=(12, 5))

    # Plot the data
    sns.lineplot(x='Date', y='Likes', data=category_data, color='steelblue',
                 linewidth=2.5, alpha=0.9, ax=ax)

    # Title and labels
    ax.set_title(f'Likes Over Time for {category}', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Likes', fontsize=12)

    # Adjust ticks and grid
    ax.tick_params(axis='x', labelrotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=10)
    ax.grid(True)

    # (The former "data source note" drew an empty string, i.e. nothing,
    # so that call has been dropped.)

    # Show plot, then release the figure so the loop does not accumulate
    # open figures in memory.
    fig.tight_layout()
    plt.show()
    plt.close(fig)

In [None]:
import calendar

# ---------------------------------------------------------------------------
# Part 1: monthly trend chart for the 5 strongest categories in 2023
# ---------------------------------------------------------------------------

# Filter the DataFrame for the year 2023
df_2023 = df[df['Date'].dt.year == 2023]

# Mean likes per (month, category); rows = month number, columns = category.
# A category with no posts in a given month shows up as NaN.
category_likes_monthly = df_2023.groupby([df_2023['Date'].dt.month, 'Category'])['Likes'].mean().unstack()

# Categories to plot: the 5 with the highest average of their monthly means.
# Kept under its own name so it is not confused with the all-time
# 'top_5_categories' Series computed in part 2.
top_5_by_monthly_mean = category_likes_monthly.mean().nlargest(5).index

# Filter category likes for those 5 categories
category_likes_top_5 = category_likes_monthly[top_5_by_monthly_mean]

# One distinct colour per plotted category
custom_palette = sns.color_palette("husl", len(top_5_by_monthly_mean))

# Set seaborn style
sns.set_style("whitegrid")

# Create a new figure
plt.figure(figsize=(12, 6))

# One line per category across the months
for i, category in enumerate(top_5_by_monthly_mean):
    sns.lineplot(x=category_likes_top_5.index, y=category_likes_top_5[category],
                 label=category, color=custom_palette[i], linewidth=2.5)

# Title and labels
plt.title('Likes Over Time for Top 5 Categories (2023)', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Month', fontsize=14, fontweight='bold')
plt.ylabel('Likes', fontsize=14, fontweight='bold')

# Month numbers 1..12 on the x axis, labelled with month names
plt.xticks(range(1, 13), calendar.month_name[1:], rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

# Legend placed just outside the right edge of the axes, with a bold title.
# (The former "data source note" drew a single blank, i.e. nothing visible,
# so that call has been dropped.)
legend = plt.legend(title='Category', title_fontsize=13, fontsize=12,
                    loc='upper left', bbox_to_anchor=(1.02, 1))
legend.get_title().set_fontweight('bold')

# Show plot
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Part 2: printed summary of the 5 categories with the most total likes
# NOTE(review): this ranking uses total likes over the whole dataset, whereas
# the chart above ranks by 2023 monthly means - confirm that is intended.
# ---------------------------------------------------------------------------

# Total likes per category, highest first
category_total_likes = df.groupby('Category')['Likes'].sum().sort_values(ascending=False)

# Get the top 5 categories (Series: category -> total likes)
top_5_categories = category_total_likes.head(5)

# Get the most liked category and its total likes
most_liked_category = top_5_categories.idxmax()
most_likes = top_5_categories.max()

# Print the most liked category and its total likes (\033[1m ... \033[0m = bold)
print(f"\033[1mMost liked category among top 5:\033[0m {most_liked_category}")
print(f"\033[1mTotal likes for {most_liked_category}:\033[0m {most_likes}\n")

# Popularity = number of posts per category, highest first
category_popularity = df['Category'].value_counts().sort_values(ascending=False)

# Popularity restricted to the top 5 categories
top_5_popularity = category_popularity.loc[top_5_categories.index]

# Get the most popular category among the top 5
most_popular_category = top_5_popularity.idxmax()
most_popularity = top_5_popularity.max()

# Print the most popular category and its popularity.
# The label names the category the number belongs to; it previously read
# "Popularity for top 5" although only a single category's count is shown.
print(f"\033[1mMost popular category among top 5:\033[0m {most_popular_category}")
print(f"\033[1mPopularity for {most_popular_category}:\033[0m {most_popularity}\n")

# Print total likes and popularity for the remaining top 5 categories
for category, likes in top_5_categories.items():
    if category != most_liked_category:
        popularity = top_5_popularity[category]
        print(f"{category}: Total likes - {likes}, Popularity - {popularity}")


In [None]:
import calendar

# ---------------------------------------------------------------------------
# Part 1: monthly trend chart for the 5 weakest categories in 2023
# ---------------------------------------------------------------------------

# Filter the DataFrame for the year 2023
df_2023 = df[df['Date'].dt.year == 2023]

# Mean likes per (month, category); rows = month number, columns = category.
# A category with no posts in a given month shows up as NaN.
category_likes_monthly = df_2023.groupby([df_2023['Date'].dt.month, 'Category'])['Likes'].mean().unstack()

# Categories to plot: the 5 with the lowest average of their monthly means.
# Kept under its own name so it is not confused with the all-time
# 'lowest_5_categories' Series computed in part 2.
lowest_5_by_monthly_mean = category_likes_monthly.mean().nsmallest(5).index

# Filter category likes for those 5 categories
category_likes_lowest_5 = category_likes_monthly[lowest_5_by_monthly_mean]

# One distinct colour per plotted category
custom_palette = sns.color_palette("husl", len(lowest_5_by_monthly_mean))

# Set seaborn style
sns.set_style("whitegrid")

# Create a new figure
plt.figure(figsize=(12, 6))

# One line per category across the months
for i, category in enumerate(lowest_5_by_monthly_mean):
    sns.lineplot(x=category_likes_lowest_5.index, y=category_likes_lowest_5[category],
                 label=category, color=custom_palette[i], linewidth=2.5)

# Title and labels
plt.title('Likes Over Time for Lowest 5 Categories (2023)', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Month', fontsize=14, fontweight='bold')
plt.ylabel('Likes', fontsize=14, fontweight='bold')

# Month numbers 1..12 on the x axis, labelled with month names
plt.xticks(range(1, 13), calendar.month_name[1:], rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

# Legend placed just outside the right edge of the axes, with a bold title.
# (The former "data source note" drew a single blank, i.e. nothing visible,
# so that call has been dropped.)
legend = plt.legend(title='Category', title_fontsize=13, fontsize=12,
                    loc='upper left', bbox_to_anchor=(1.02, 1))
legend.get_title().set_fontweight('bold')

# Show plot
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Part 2: printed summary of the 5 categories with the fewest total likes
# NOTE(review): this ranking uses total likes over the whole dataset, whereas
# the chart above ranks by 2023 monthly means - confirm that is intended.
# ---------------------------------------------------------------------------

# Total likes per category, lowest first
category_total_likes = df.groupby('Category')['Likes'].sum().sort_values()

# Get the lowest 5 categories (Series: category -> total likes)
lowest_5_categories = category_total_likes.head(5)

# Get the least liked category and its total likes
least_liked_category = lowest_5_categories.idxmin()
least_likes = lowest_5_categories.min()

# Print the least liked category and its total likes (\033[1m ... \033[0m = bold)
print(f"\033[1mLeast liked category among lowest 5:\033[0m {least_liked_category}")
print(f"\033[1mTotal likes for {least_liked_category}:\033[0m {least_likes}\n")

# Popularity = number of posts per category, lowest first
category_popularity = df['Category'].value_counts().sort_values()

# Popularity restricted to the lowest 5 categories
lowest_5_popularity = category_popularity.loc[lowest_5_categories.index]

# Get the least popular category among the lowest 5
least_popular_category = lowest_5_popularity.idxmin()
least_popularity = lowest_5_popularity.min()

# Print the least popular category and its popularity.
# The label names the category the number belongs to; it previously read
# "Popularity for lowest 5" although only a single category's count is shown.
print(f"\033[1mLeast popular category among lowest 5:\033[0m {least_popular_category}")
print(f"\033[1mPopularity for {least_popular_category}:\033[0m {least_popularity}\n")

# Print total likes and popularity for the remaining lowest 5 categories
for category, likes in lowest_5_categories.items():
    if category != least_liked_category:
        popularity = lowest_5_popularity[category]
        print(f"{category}: Total likes - {likes}, Popularity - {popularity}")



In [None]:
# Per-category summary table: mean likes, total likes and post count.

# Group once and reuse the grouped 'Likes' column for every aggregate
likes_by_category = df.groupby('Category')['Likes']

# Calculate mean likes for each category
category_mean_likes = likes_by_category.mean()

# Calculate total likes for each category
category_total_likes = likes_by_category.sum()

# Popularity = number of posts in each category
category_popularity = df['Category'].value_counts()

# Create a DataFrame to store the metrics (Series are aligned on category)
category_metrics = pd.DataFrame({
    'Mean Likes': category_mean_likes,
    'Total Likes': category_total_likes,
    'Popularity': category_popularity
})

# Rank by popularity (descending); ties fall back to total likes, then mean
# likes. A single multi-key sort states this ordering explicitly - chaining
# separate sort_values() calls leaves only the last key as the primary order.
category_metrics = category_metrics.sort_values(
    by=['Popularity', 'Total Likes', 'Mean Likes'], ascending=False
)

# Display the table
print(category_metrics)

In [None]:
# Extended per-category summary table: central tendency, spread, totals and
# post counts.

# Group once and reuse the grouped 'Likes' column for every aggregate
likes_by_category = df.groupby('Category')['Likes']

# Calculate mean likes for each category
category_mean_likes = likes_by_category.mean()

# Calculate median likes for each category
category_median_likes = likes_by_category.median()

# Calculate standard deviation of likes for each category
category_std_likes = likes_by_category.std()

# Calculate total likes for each category
category_total_likes = likes_by_category.sum()

# Calculate total number of posts for each category
category_total_posts = df['Category'].value_counts()

# Average likes per post = total likes / number of posts
category_avg_likes_per_post = category_total_likes / category_total_posts

# "Engagement rate" as defined in this notebook: average likes per post x 100.
# NOTE(review): no audience size (followers / impressions) enters this figure,
# so it is not a true percentage - confirm the intended definition.
category_engagement_rate = category_avg_likes_per_post * 100

# Popularity is the same per-category post count computed above
category_popularity = category_total_posts

# Create a DataFrame to store the metrics (Series are aligned on category)
category_metrics = pd.DataFrame({
    'Mean Likes': category_mean_likes,
    'Median Likes': category_median_likes,
    'S.D. Likes': category_std_likes,
    'Total Likes': category_total_likes,
    'AVG Likes per Post': category_avg_likes_per_post,
    'Engagement Rate (%)': category_engagement_rate,
    'Popularity': category_popularity
})

# Rank by popularity (descending); ties fall back to median likes, then total
# likes, then mean likes. A single multi-key sort states this ordering
# explicitly - chaining separate sort_values() calls leaves only the last key
# as the primary order.
category_metrics = category_metrics.sort_values(
    by=['Popularity', 'Median Likes', 'Total Likes', 'Mean Likes'], ascending=False
)

# Display the table
print(category_metrics)