In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [3]:
# Configure how pandas renders frames: show every column, cap printed rows
# at 100, and format floats with two decimal places.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

### DATA UNDERSTANDING AND CLEANING PROCESS FOR VIRAL SOCIAL MEDIA TRENDS DATASET

In [4]:
# Read the trends CSV into the working DataFrame `df`
dataset_path = 'Viral_Social_Media_Trends.csv'
df = pd.read_csv(dataset_path)

In [5]:
# Basic information about the dataset
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line
# after the report.
print("Dataset Information:")
df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Post_ID           5000 non-null   object
 1   Platform          5000 non-null   object
 2   Hashtag           5000 non-null   object
 3   Content_Type      5000 non-null   object
 4   Region            5000 non-null   object
 5   Views             5000 non-null   int64 
 6   Likes             5000 non-null   int64 
 7   Shares            5000 non-null   int64 
 8   Comments          5000 non-null   int64 
 9   Engagement_Level  5000 non-null   object
dtypes: int64(4), object(6)
memory usage: 390.8+ KB
None


In [6]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the
# numeric columns, printed under a heading
summary_stats = df.describe()
print("\nStatistical Summary:", summary_stats, sep="\n")


Statistical Summary:
           Views     Likes   Shares  Comments
count    5000.00   5000.00  5000.00   5000.00
mean  2494066.44 251475.03 50519.56  24888.39
std   1459489.82 144349.58 29066.36  14284.50
min      1266.00    490.00    52.00     18.00
25%   1186207.25 126892.25 25029.00  12305.25
50%   2497373.00 249443.00 50839.50  25004.00
75%   3759781.00 373970.75 75774.25  37072.75
max   4999430.00 499922.00 99978.00  49993.00


In [7]:
# Tally null entries per column and print the counts under a heading
null_counts = df.isna().sum()
print("\nMissing values in each column:", null_counts, sep="\n")


Missing values in each column:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
dtype: int64


In [8]:
# Report how many rows are exact repeats of an earlier row
duplicate_report = f"\nNumber of duplicate rows: {df.duplicated().sum()}"
print(duplicate_report)


Number of duplicate rows: 0


In [9]:
# Impute gaps in the key columns. Null counts are taken once up front; each
# fill below only touches its own column, so the counts stay valid, and every
# branch is skipped when its column has no nulls.
null_counts = df.isna().sum()

# Categorical columns: substitute the most frequent value (the mode)
if null_counts['Content_Type'] > 0:
    most_common_content = df['Content_Type'].mode()[0]
    df['Content_Type'] = df['Content_Type'].fillna(most_common_content)
    print(f"Filled missing Content_Type values with: {most_common_content}")

if null_counts['Platform'] > 0:
    most_common_platform = df['Platform'].mode()[0]
    df['Platform'] = df['Platform'].fillna(most_common_platform)
    print(f"Filled missing Platform values with: {most_common_platform}")

# Numeric engagement counts: substitute the column median
engagement_metrics = ['Views', 'Likes', 'Shares', 'Comments']
for metric in engagement_metrics:
    if null_counts[metric] == 0:
        continue  # nothing to fill in this column
    median_value = df[metric].median()
    df[metric] = df[metric].fillna(median_value)
    print(f"Filled missing {metric} values with median: {median_value}")

In [10]:
# Drop exact duplicate rows; the new shape is reported only when at least
# one duplicate was found
has_duplicates = df.duplicated().any()
if has_duplicates:
    df = df.drop_duplicates()
    print(f"Removed duplicate rows. New shape: {df.shape}")


In [11]:
# Lower-case every categorical text column so values that differ only in
# letter case compare equal
text_columns = [
    'Platform',
    'Hashtag',
    'Content_Type',
    'Region',
    'Engagement_Level',
]
df[text_columns] = df[text_columns].apply(lambda column: column.str.lower())

In [12]:
# Standardize hashtags (remove # symbol if present)
# regex=False pins a literal match: the default for `regex` changed from
# True to False in pandas 2.0, so being explicit gives the same behavior on
# every pandas version.
df['Hashtag'] = df['Hashtag'].str.replace('#', '', regex=False)

In [13]:
# Derived metric: Total_Engagement is the row-wise sum of likes, shares
# and comments
interaction_total = df['Likes'].add(df['Shares']).add(df['Comments'])
df['Total_Engagement'] = interaction_total

In [14]:
# Engagement Rate (as percentage of views)
# A post with zero views has no defined rate. Masking zero denominators as
# NaN makes the division yield NaN instead of inf for those rows; rows with
# non-zero views are computed exactly as before.
views_nonzero = df['Views'].replace(0, np.nan)
df['Engagement_Rate'] = (df['Total_Engagement'] / views_nonzero) * 100

In [15]:
# IQR outlier screening, setup: the numeric columns to examine and an empty
# dict that will map each column name to its outlier count
numeric_columns = [
    'Views',
    'Likes',
    'Shares',
    'Comments',
    'Total_Engagement',
    'Engagement_Rate',
]
outliers_summary = dict()

In [16]:
# For each numeric column, flag rows lying more than 1.5 * IQR below the first
# quartile or above the third, record the count, and print a short report.
for col in numeric_columns:
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker

    # Strict comparisons: values exactly on a bound are not outliers
    is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = df.loc[is_outlier]
    outliers_summary[col] = len(outliers)

    report_lines = [
        f"\nColumn: {col}",
        f"Number of outliers: {len(outliers)}",
        f"Percentage of outliers: {100 * len(outliers) / len(df):.2f}%",
        f"Range: [{lower_bound:.2f}, {upper_bound:.2f}]",
    ]
    print("\n".join(report_lines))


Column: Views
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-2674153.38, 7620141.62]

Column: Likes
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-243725.50, 744588.50]

Column: Shares
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-51088.88, 151892.12]

Column: Comments
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-24846.00, 74224.00]

Column: Total_Engagement
Number of outliers: 0
Percentage of outliers: 0.00%
Range: [-170381.62, 822687.38]

Column: Engagement_Rate
Number of outliers: 595
Percentage of outliers: 11.90%
Range: [-21.13, 55.88]


In [17]:
# Final dataset summary
print("\nFinal dataset shape:", df.shape)
print("\nFinal dataset information:")
# DataFrame.info() prints its own report and returns None, so it is called
# bare; print(df.info()) would add a stray "None" line after the report.
df.info()


Final dataset shape: (5000, 12)

Final dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Post_ID           5000 non-null   object 
 1   Platform          5000 non-null   object 
 2   Hashtag           5000 non-null   object 
 3   Content_Type      5000 non-null   object 
 4   Region            5000 non-null   object 
 5   Views             5000 non-null   int64  
 6   Likes             5000 non-null   int64  
 7   Shares            5000 non-null   int64  
 8   Comments          5000 non-null   int64  
 9   Engagement_Level  5000 non-null   object 
 10  Total_Engagement  5000 non-null   int64  
 11  Engagement_Rate   5000 non-null   float64
dtypes: float64(1), int64(5), object(6)
memory usage: 468.9+ KB
None


In [18]:
# Post-cleaning check: per-column null counts, printed under a heading
remaining_nulls = df.isna().sum()
print("\nRemaining missing values:", remaining_nulls, sep="\n")


Remaining missing values:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
Total_Engagement    0
Engagement_Rate     0
dtype: int64


In [56]:
# Write the cleaned frame to CSV (index column omitted) and confirm on stdout
cleaned_csv_path = 'Viral_Social_Media_Trends_Cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'Viral_Social_Media_Trends_Cleaned.csv'")

Cleaned dataset saved as 'Viral_Social_Media_Trends_Cleaned.csv'
