# 📊 Social Media Addiction Data Analysis

This notebook provides a comprehensive analysis of the social media addiction dataset, including data exploration, preprocessing, and initial insights.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
# Silence library warnings so they do not clutter the rendered cell outputs.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings; consider narrowing it by category once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6; older
# releases ship the same style sheet under the plain name 'seaborn'. Use
# whichever one is installed so this setup cell does not abort with an OSError.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Default colour palette and figure size for every plot that follows.
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Load the dataset
# The CSV path is relative to the notebook's working directory; the resulting
# frame is bound to `df`, which every later cell reads.
dataset_path = '../data/Students Social Media Addiction.csv'
df = pd.read_csv(dataset_path)

# Quick sanity check: dimensions first, then the column names.
for report_line in (
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
):
    print(report_line)

# Leave the preview as the cell's last expression so it renders as a table.
df.head()

In [None]:
# Basic information about the dataset
print("=== DATASET INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly: wrapping it in print() appends a stray "None" line.
df.info()

# Number of missing entries per column.
print("\n=== MISSING VALUES ===")
print(df.isnull().sum())

# Storage dtype of every column.
print("\n=== DATA TYPES ===")
print(df.dtypes)

In [None]:
# Statistical summary
# Part 1: descriptive statistics as produced by describe() with its defaults.
numeric_summary = df.describe()
print("=== NUMERICAL FEATURES SUMMARY ===")
print(numeric_summary)

# Part 2: frequency table for each object-dtype column. `categorical_cols`
# stays at notebook scope because later cells reuse it.
print("\n=== CATEGORICAL FEATURES SUMMARY ===")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\n{col}:")
    print(level_counts)

In [None]:
# Visualize numerical features distribution
# `numerical_cols` stays at notebook scope because the correlation cell reuses it.
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Size the grid from the actual number of numeric columns instead of a fixed
# 2x3 layout, which silently dropped every column after the sixth.
n_numeric = len(numerical_cols)
grid_cols = 3
grid_rows = max(1, (n_numeric + grid_cols - 1) // grid_cols)  # ceiling division
fig, axes = plt.subplots(
    grid_rows,
    grid_cols,
    figsize=(18, 6 * grid_rows),  # 6 inches per row, matching the original 2-row (18, 12)
    squeeze=False,                # always a 2-D array, even for a single row
)
axes = axes.ravel()

# One histogram with a KDE overlay per numeric column.
for ax, col in zip(axes, numerical_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Hide the panels of the last row that have no column to show.
for unused_ax in axes[n_numeric:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Visualize categorical features
# Only the first `max_panels` categorical columns are drawn, one per panel of
# the 2x2 grid. `categorical_cols` comes from the statistical-summary cell.
max_panels = 4
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

plotted_cols = categorical_cols[:max_panels]
for ax, col in zip(axes, plotted_cols):
    # Bar chart of how often each category level occurs.
    df[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# With fewer than four categorical columns the remaining panels would be drawn
# as empty axes; hide them instead.
for unused_ax in axes[len(plotted_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Pairwise correlations between the numeric columns selected in the
# distribution cell (`numerical_cols`).
correlation_matrix = df[numerical_cols].corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes;
# center=0 anchors the diverging colormap at zero correlation.
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix of Numerical Features')
corr_fig.tight_layout()
plt.show()

In [None]:
# Analyze relationships with target variables
# Each entry describes one scatter panel:
#   (x column, y column, x-axis label, y-axis label, panel title)
# Entries are listed in row-major order of the 2x2 grid.
scatter_panels = [
    # Conflicts vs Age
    ('Age', 'Conflicts_Over_Social_Media',
     'Age', 'Conflicts Over Social Media', 'Age vs Conflicts'),
    # Addiction Score vs Daily Usage
    ('Avg_Daily_Usage_Hours', 'Addicted_Score',
     'Average Daily Usage (Hours)', 'Addiction Score', 'Daily Usage vs Addiction Score'),
    # Sleep vs Addiction Score
    ('Sleep_Hours_Per_Night', 'Addicted_Score',
     'Sleep Hours Per Night', 'Addiction Score', 'Sleep vs Addiction Score'),
    # Mental Health vs Addiction Score
    ('Mental_Health_Score', 'Addicted_Score',
     'Mental Health Score', 'Addiction Score', 'Mental Health vs Addiction Score'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# axes.flat walks the grid row by row, matching the order of `scatter_panels`.
for panel_ax, (x_col, y_col, x_label, y_label, panel_title) in zip(axes.flat, scatter_panels):
    panel_ax.scatter(df[x_col], df[y_col], alpha=0.6)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Save cleaned data
# NOTE(review): no cleaning step is visible in this notebook before this point,
# so the file written here appears to contain the same rows as the raw CSV --
# confirm whether a cleaning stage was intended.
output_path = '../data/cleaned_data.csv'
df.to_csv(output_path, index=False)  # index=False: do not write the row index
print("Cleaned data saved to '../data/cleaned_data.csv'")

# Summary statistics
# Build the report line by line, then emit it in a single print call.
summary_lines = [
    "\n=== SUMMARY STATISTICS ===",
    f"Total samples: {len(df)}",
    f"Age range: {df['Age'].min()} - {df['Age'].max()}",
    f"Average addiction score: {df['Addicted_Score'].mean():.2f}",
    f"Average conflicts: {df['Conflicts_Over_Social_Media'].mean():.2f}",
    # mode() returns every most-frequent value; [0] takes the first of them.
    f"Most used platform: {df['Most_Used_Platform'].mode()[0]}",
]
print("\n".join(summary_lines))