In [None]:
pip install pandas numpy faker scikit-learn

Collecting faker
  Downloading Faker-33.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.1.0


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.preprocessing import LabelEncoder

# One seed shared by every random source used in this notebook, so that a
# fresh "Restart & Run All" regenerates the same dataset.
SEED = 28
np.random.seed(SEED)

# Initialize Faker and seed its shared generator too; seeding NumPy alone does
# not make Faker-produced values repeatable.
faker = Faker()
Faker.seed(SEED)

# Number of samples (rows) in the base dataset
num_samples = 50

# Generate synthetic data.
# Every column is drawn from the global NumPy random stream, in the order the
# keys appear below; reordering the columns would change the generated values.
def _draw_int(low, high):
    """Return `num_samples` integers drawn uniformly from [low, high)."""
    return np.random.randint(low, high, num_samples)


def _draw_choice(options, probabilities):
    """Return `num_samples` labels drawn from `options` with the given probabilities."""
    return np.random.choice(options, num_samples, p=probabilities)


_YES_NO = ["Yes", "No"]

# NOTE(review): "Scam History" and "Scam Type Encountered" are sampled
# independently of each other, so a row can report no scam history together
# with a concrete scam type (and vice versa).
data = {
    "Age": _draw_int(60, 90),
    "Gender": _draw_choice(["Male", "Female"], [0.48, 0.52]),
    "Education Level": _draw_choice(["High School", "College", "Graduate"], [0.2, 0.32, 0.48]),
    "Tech Savviness": _draw_int(1, 11),
    "Hours Spent Online Daily": np.random.uniform(0.5, 10, num_samples).round(1),
    "Primary Device Used": _draw_choice(["Smartphone", "Laptop", "Tablet"], [0.4, 0.5, 0.1]),
    "Social Media Usage": _draw_choice(_YES_NO, [0.7, 0.3]),
    "Email Awareness": _draw_int(1, 11),
    "Password Practices": _draw_int(1, 6),
    "Two-Factor Authentication (2FA)": _draw_choice(_YES_NO, [0.3, 0.7]),
    "Scam History": _draw_choice(_YES_NO, [0.35, 0.65]),
    "Scam Type Encountered": _draw_choice(
        ["Phishing", "Fake Tech Support", "Online Shopping Scam", "None"],
        [0.2, 0.05, 0.05, 0.7],
    ),
    "Awareness Programs Attended": _draw_choice(_YES_NO, [0.3, 0.7]),
    "Confidence in Identifying Scams": _draw_int(1, 11),
    "Recent Scam Attempts": _draw_int(0, 11),
}

# Convert to DataFrame (one column per key, one row per sample)
df = pd.DataFrame(data)

# Scoring system for online safety: a weighted sum of the answers.
# The terms are added in exactly the order they are listed here.
score_terms = [
    df["Tech Savviness"] * 2,  # Higher weight for tech savviness
    df["Password Practices"] * 1.5,  # Medium weight for password practices
    df["Confidence in Identifying Scams"] * 1.2,  # Confidence level
    df["Email Awareness"] * 1.0,  # Awareness of phishing
    np.where(df["Scam History"] == "Yes", -2, 2),  # Penalty for a scam history, bonus otherwise
    np.where(df["Two-Factor Authentication (2FA)"] == "Yes", 2, -1),  # Bonus for using 2FA
    np.where(df["Awareness Programs Attended"] == "Yes", 1, -0.5),  # Minor influence for awareness
    df["Hours Spent Online Daily"] * 0.8,  # Moderate weight for hours spent online
    np.select(  # Minor role for education level; anything else contributes 0
        [df["Education Level"] == "Graduate", df["Education Level"] == "College"],
        [0.5, 0.2],
        default=0,
    ),
]
df["Safety Score"] = sum(score_terms)

# Label cut-offs are taken from the score distribution itself:
#   score >= 60th percentile -> "Safe"      (roughly the top 40% of rows)
#   score <= 40th percentile -> "Not Safe"  (roughly the bottom 40% of rows)
#   anything in between      -> "Moderate"
scores = df["Safety Score"]
safe_threshold = scores.quantile(0.6)
not_safe_threshold = scores.quantile(0.4)

# Conditions are tested in order, so "Safe" wins if both were to match.
df["Online Safety"] = np.select(
    [scores >= safe_threshold, scores <= not_safe_threshold],
    ["Safe", "Not Safe"],
    default="Moderate",
)

# Write the clean dataset to CSV (row index omitted) and confirm on stdout
clean_csv_path = "elderly_online_safety.csv"
df.to_csv(clean_csv_path, index=False)

print(f"Synthetic dataset created and saved as '{clean_csv_path}'.")


# Missing values: blank out a random 10% of rows in each of these columns.
# "Safety Score" and "Online Safety" were computed before this step and are
# not recomputed afterwards.
for col in ["Tech Savviness", "Hours Spent Online Daily", "Scam Type Encountered"]:
    missing_index = df.sample(frac=0.1).index
    # An integer column cannot hold NaN. pandas 2.1+ deprecates the silent
    # upcast that assignment would otherwise trigger, so cast to float first.
    if pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].astype("float64")
    df.loc[missing_index, col] = np.nan

# Duplicates: append a random 5% of the rows to the end of the frame
duplicates = df.sample(frac=0.05)
df = pd.concat((df, duplicates)).reset_index(drop=True)

# Outliers: push a random 2% of rows 10-19 above the current column maximum
outlier_indices = df.sample(frac=0.02).index
attempts_max = df["Recent Scam Attempts"].max()
outlier_offsets = np.random.randint(10, 20, len(outlier_indices))
df.loc[outlier_indices, "Recent Scam Attempts"] = attempts_max + outlier_offsets

# Noisy Data: overwrite the device of a random 5% of rows with a uniformly
# chosen device (which may coincide with the original value)
noisy_indices = df.sample(frac=0.05).index
noisy_devices = np.random.choice(["Smartphone", "Laptop", "Tablet"], len(noisy_indices))
df.loc[noisy_indices, "Primary Device Used"] = noisy_devices

# Save the dataset with injected inconsistencies and preview the first rows
noisy_csv_path = "elderly_online_safety_with_inconsistencies.csv"
df.to_csv(noisy_csv_path, index=False)
print(f"Synthetic dataset with inconsistencies created and saved as '{noisy_csv_path}'.")
print(df.head())

Synthetic dataset created and saved as 'elderly_online_safety.csv'.
Synthetic dataset with inconsistencies created and saved as 'elderly_online_safety_with_inconsistencies.csv'.
   Age  Gender Education Level  Tech Savviness  Hours Spent Online Daily  \
0   61    Male         College             6.0                       5.2   
1   85  Female     High School             5.0                       2.7   
2   65  Female        Graduate             6.0                       9.2   
3   82  Female        Graduate             6.0                       1.1   
4   80  Female        Graduate             6.0                       NaN   

  Primary Device Used Social Media Usage  Email Awareness  Password Practices  \
0              Tablet                Yes                7                   2   
1              Tablet                Yes                7                   4   
2          Smartphone                Yes                8                   4   
3              Laptop                Yes 