In [2]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pl.Config.set_tbl_rows(20)
sns.set_style("whitegrid")

print("Libraries loaded ✓")

Libraries loaded ✓


In [3]:
df_teen = pl.read_csv("data/teen_e_safety_dataset.csv")

print(f"Teen safety dataset: {df_teen.shape[0]:,} records")
print(f"Time period: {df_teen['Timestamp'].min()} to {df_teen['Timestamp'].max()}")
print(f"\nNumber of columns: {df_teen.shape[1]}")

Teen safety dataset: 67,921 records
Time period: 2017-01-01 00:00:00 to 2024-10-01 00:00:00

Number of columns: 30


In [4]:
# Look at key risk indicators
print("=== TEEN ONLINE RISKS ===\n")

print("Malware Detection:")
print(df_teen.group_by("Malware_Detection").agg(pl.len().alias("count")))

print("\nPhishing Attempts (distribution):")
print(df_teen.group_by("Phishing_Attempts").agg(pl.len().alias("count")).sort("Phishing_Attempts"))

print("\nRisky Website Visits:")
print(df_teen.group_by("Risky_Website_Visits").agg(pl.len().alias("count")))

print("\nCyberbullying Reports:")
print(df_teen.group_by("Cyberbullying_Reports").agg(pl.len().alias("count")).sort("Cyberbullying_Reports"))

=== TEEN ONLINE RISKS ===

Malware Detection:
shape: (2, 2)
┌───────────────────┬───────┐
│ Malware_Detection ┆ count │
│ ---               ┆ ---   │
│ i64               ┆ u32   │
╞═══════════════════╪═══════╡
│ 0                 ┆ 64629 │
│ 1                 ┆ 3292  │
└───────────────────┴───────┘

Phishing Attempts (distribution):
shape: (6, 2)
┌───────────────────┬───────┐
│ Phishing_Attempts ┆ count │
│ ---               ┆ ---   │
│ i64               ┆ u32   │
╞═══════════════════╪═══════╡
│ 0                 ┆ 50251 │
│ 1                 ┆ 15116 │
│ 2                 ┆ 2290  │
│ 3                 ┆ 238   │
│ 4                 ┆ 24    │
│ 5                 ┆ 2     │
└───────────────────┴───────┘

Risky Website Visits:
shape: (2, 2)
┌──────────────────────┬───────┐
│ Risky_Website_Visits ┆ count │
│ ---                  ┆ ---   │
│ i64                  ┆ u32   │
╞══════════════════════╪═══════╡
│ 0                    ┆ 61166 │
│ 1                    ┆ 6755  │
└──────────────────────

In [5]:
print("=== TEEN PROTECTIVE BEHAVIORS ===\n")

print("VPN Usage:")
print(df_teen.group_by("VPN_Usage").agg(pl.len().alias("count")))

print("\nPassword Strength:")
print(df_teen.group_by("Password_Strength").agg(pl.len().alias("count")).sort("Password_Strength"))

print("\nParental Control Alerts:")
print(df_teen.group_by("Parental_Control_Alerts").agg(pl.len().alias("count")).sort("Parental_Control_Alerts"))

print("\nPublic Network Usage:")
print(df_teen.group_by("Public_Network_Usage").agg(pl.len().alias("count")))

=== TEEN PROTECTIVE BEHAVIORS ===

VPN Usage:
shape: (2, 2)
┌───────────┬───────┐
│ VPN_Usage ┆ count │
│ ---       ┆ ---   │
│ i64       ┆ u32   │
╞═══════════╪═══════╡
│ 0         ┆ 57811 │
│ 1         ┆ 10110 │
└───────────┴───────┘

Password Strength:
shape: (3, 2)
┌───────────────────┬───────┐
│ Password_Strength ┆ count │
│ ---               ┆ ---   │
│ str               ┆ u32   │
╞═══════════════════╪═══════╡
│ Moderate          ┆ 10298 │
│ Strong            ┆ 3523  │
│ Weak              ┆ 54100 │
└───────────────────┴───────┘

Parental Control Alerts:
shape: (2, 2)
┌─────────────────────────┬───────┐
│ Parental_Control_Alerts ┆ count │
│ ---                     ┆ ---   │
│ i64                     ┆ u32   │
╞═════════════════════════╪═══════╡
│ 0                       ┆ 61191 │
│ 1                       ┆ 6730  │
└─────────────────────────┴───────┘

Public Network Usage:
shape: (2, 2)
┌──────────────────────┬───────┐
│ Public_Network_Usage ┆ count │
│ ---                  ┆ --- 

In [6]:
print("=== RISKS BY AGE GROUP ===\n")

age_risks = df_teen.group_by("Age_Group").agg([
    pl.len().alias("total_sessions"),
    pl.col("Malware_Detection").sum().alias("malware_incidents"),
    (pl.col("Phishing_Attempts") > 0).sum().alias("phishing_sessions"),
    pl.col("Risky_Website_Visits").sum().alias("risky_visits"),
    pl.col("Cyberbullying_Reports").sum().alias("cyberbullying"),
]).sort("Age_Group")

print(age_risks)

# Calculate percentages
print("\n=== PERCENTAGE OF SESSIONS WITH RISKS ===")
for row in age_risks.iter_rows(named=True):
    age = row['Age_Group']
    total = row['total_sessions']
    print(f"\n{age}:")
    print(f"  Malware: {row['malware_incidents']/total*100:.1f}%")
    print(f"  Phishing: {row['phishing_sessions']/total*100:.1f}%")
    print(f"  Risky sites: {row['risky_visits']/total*100:.1f}%")
    print(f"  Cyberbullying: {row['cyberbullying']/total*100:.1f}%")

=== RISKS BY AGE GROUP ===

shape: (3, 6)
┌───────────┬────────────────┬───────────────────┬──────────────────┬──────────────┬───────────────┐
│ Age_Group ┆ total_sessions ┆ malware_incidents ┆ phishing_session ┆ risky_visits ┆ cyberbullying │
│ ---       ┆ ---            ┆ ---               ┆ s                ┆ ---          ┆ ---           │
│ str       ┆ u32            ┆ i64               ┆ ---              ┆ i64          ┆ i64           │
│           ┆                ┆                   ┆ u32              ┆              ┆               │
╞═══════════╪════════════════╪═══════════════════╪══════════════════╪══════════════╪═══════════════╡
│ 13-16     ┆ 47695          ┆ 2315              ┆ 12365            ┆ 4756         ┆ 931           │
│ 17-19     ┆ 13491          ┆ 678               ┆ 3584             ┆ 1302         ┆ 262           │
│ <13       ┆ 6735           ┆ 299               ┆ 1721             ┆ 697          ┆ 137           │
└───────────┴────────────────┴───────────────────

In [7]:
print("=== SOCIAL MEDIA USAGE BY AGE ===\n")

social_media = df_teen.group_by(["Age_Group", "Social_Media_Usage"]).agg(
    pl.len().alias("count")
).sort(["Age_Group", "Social_Media_Usage"])

print(social_media)

# Calculate percentages by age group
print("\n=== SOCIAL MEDIA USAGE PERCENTAGES ===")
for age in ["<13", "13-16", "17-19"]:
    age_data = df_teen.filter(pl.col("Age_Group") == age)
    total = len(age_data)
    print(f"\n{age} (n={total}):")
    for level in ["Low", "Medium", "High"]:
        count = len(age_data.filter(pl.col("Social_Media_Usage") == level))
        print(f"  {level}: {count/total*100:.1f}%")

=== SOCIAL MEDIA USAGE BY AGE ===

shape: (9, 3)
┌───────────┬────────────────────┬───────┐
│ Age_Group ┆ Social_Media_Usage ┆ count │
│ ---       ┆ ---                ┆ ---   │
│ str       ┆ str                ┆ u32   │
╞═══════════╪════════════════════╪═══════╡
│ 13-16     ┆ High               ┆ 4800  │
│ 13-16     ┆ Low                ┆ 33428 │
│ 13-16     ┆ Medium             ┆ 9467  │
│ 17-19     ┆ High               ┆ 1349  │
│ 17-19     ┆ Low                ┆ 9477  │
│ 17-19     ┆ Medium             ┆ 2665  │
│ <13       ┆ High               ┆ 699   │
│ <13       ┆ Low                ┆ 4675  │
│ <13       ┆ Medium             ┆ 1361  │
└───────────┴────────────────────┴───────┘

=== SOCIAL MEDIA USAGE PERCENTAGES ===

<13 (n=6735):
  Low: 69.4%
  Medium: 20.2%
  High: 10.4%

13-16 (n=47695):
  Low: 70.1%
  Medium: 19.8%
  High: 10.1%

17-19 (n=13491):
  Low: 70.2%
  Medium: 19.8%
  High: 10.0%
