In [None]:
designations:sde,sde sr,solutions enabler, consultant, solutions architect,principal architect,it ,finance, accounting,hr  
skills:java, java swing, sql, c,c#,c++,go, django, html,css, javascript, node.js, next.js,typescript,python,devops,machine learning, data analysis,dbt,snowflake,azure,powerbi,data structures

In [None]:
#!pip install faker pandas


In [2]:
pip install tqdm

Collecting tqdm
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from faker import Faker
import random
from datetime import timedelta
from tqdm import tqdm  # Import tqdm for progress bar

fake = Faker()

# Provided designations, skills, and course names
designations = [
    'SDE', 'SDE Sr', 'Solutions Enabler', 'Consultant',
    'Solutions Architect', 'Principal Architect', 'IT',
    'Finance', 'Accounting', 'HR'
]

skills = [
    'Java', 'Java Swing', 'SQL', 'C', 'C#', 'C++',
    'Go', 'Django', 'HTML', 'CSS', 'JavaScript',
    'Node.js', 'Next.js', 'TypeScript', 'Python',
    'DevOps', 'Machine Learning', 'Data Analysis',
    'DBT', 'Snowflake', 'Azure', 'PowerBI',
    'Data Structures', 'Cloud Computing', 'Cybersecurity',
    'Networking Fundamentals', 'Agile Methodologies',
    'Employee Relations', 'Performance Management',
    'Recruitment and Selection', 'HR Information Systems (HRIS)',
    'Conflict Resolution'
]

course_names = [
    "Java Programming Essentials", "Mastering Java Swing for Desktop Applications",
    "SQL for Data Management", "C Programming for Beginners",
    "Advanced C# Techniques", "C++ for Game Development",
    "Go: A Comprehensive Guide", "Building Web Applications with Django",
    "HTML & CSS: The Complete Guide", "JavaScript Fundamentals",
    "Node.js for Scalable Network Applications",
    "Next.js: Server-Side Rendering Made Easy",
    "TypeScript for Large Applications", "Python for Data Analysis",
    "DevOps Fundamentals: Continuous Integration & Delivery",
    "Machine Learning with Python", "Data Analysis Techniques for Business Insights",
    "DBT: Transforming Data in the Warehouse", "Snowflake Essentials: Data Warehousing",
    "Azure Cloud Fundamentals", "Power BI for Data Visualization",
    "Data Structures and Algorithms in Java", "Cloud Security: Best Practices for IT",
    "HR Analytics: Leveraging Data for HR Decisions",
    "IT Project Management Fundamentals",
    "Agile Methodologies for HR and IT",
    "Cybersecurity Essentials for IT Professionals",
    "Effective Communication in HR",
    "Talent Management Strategies in the Digital Age",
    "Building a Diverse Workplace Culture"
]

# Constants for the fake data generation
NUM_USERS = 300
NUM_COURSES = len(course_names)

# User distribution based on your specifications
user_distribution = {
    'SDE': 100,
    'SDE Sr': 70,
    'Solutions Enabler': 45,
    'Consultant': 40,
    'Solutions Architect': 30,
    'Principal Architect': 15,
    'IT': 55,
    'Finance': 70,
    'Accounting': 60,
    'HR': 15
}

# Generate Designations
designations_df = pd.DataFrame({'id': range(1, len(designations) + 1), 'title': designations})

# Generate Skills
skills_df = pd.DataFrame({'id': range(1, len(skills) + 1), 'name': skills})

# Generate Users
users = []
user_id = 1

for designation, count in tqdm(user_distribution.items(), desc="Generating Users", unit="user"):
    for _ in range(count):
        users.append({
            'id': user_id,
            'name': fake.name(),
            'mail': fake.unique.email(),
            'role': 'employee',  # All users are employees
            'designationId': designations_df[designations_df['title'] == designation]['id'].values[0],
            'sex': random.choice(['m', 'f']),
            'experience': random.randint(0, 20),  # Experience in years
            'joindate': fake.date_between(start_date='-8y', end_date='today'),
            'hashedpassword': fake.password()
        })
        user_id += 1

users_df = pd.DataFrame(users)

# Ensure join date is a datetime object
users_df['joindate'] = pd.to_datetime(users_df['joindate'])

# Generate Courses
courses = []
for i in tqdm(range(1, NUM_COURSES + 1), desc="Generating Courses", unit="course"):
    created_at = fake.date_time_this_decade()
    courses.append({
        'id': i,
        'title': course_names[i - 1],
        'proficiency_level': random.choice(['Beginner', 'Intermediate', 'Advanced']),
        'no_of_chapters': random.randint(5, 20),  # Random total chapters
        'duration': random.randint(2, 10),  # Duration in days
        'createdAt': created_at
    })
courses_df = pd.DataFrame(courses)

# Ensure date columns are datetime
courses_df['createdAt'] = pd.to_datetime(courses_df['createdAt'])

# Generate Initial Progress with 0 chapters completed
initial_progress = []
for user_id in tqdm(range(1, NUM_USERS + 1), desc="Generating Initial Progress", unit="user"):
    for course_id in range(1, NUM_COURSES + 1):
        initial_progress.append({
            'id': len(initial_progress) + 1,
            'courseId': course_id,
            'userId': user_id,
            'updatedAt': courses_df.loc[courses_df['id'] == course_id, 'createdAt'].values[0],  # Set to creation date
            'chapters_completed': 0,
            'percentage_completed': 0.0,
            'certificate': None  # Nullable field
        })

# Simulate Progress Updates
progress_updates = []
for user_id in tqdm(range(1, NUM_USERS + 1), desc="Simulating Progress Updates", unit="user"):
    user_join_date = users_df.loc[users_df['id'] == user_id, 'joindate'].values[0]
    completed_courses = []

    for course_id in range(1, NUM_COURSES + 1):
        if course_id in completed_courses:
            continue  # Skip if the course is already completed

        total_chapters = courses_df.loc[courses_df['id'] == course_id, 'no_of_chapters'].values[0]
        creation_date = courses_df.loc[courses_df['id'] == course_id, 'createdAt'].values[0]

        # Ensure both dates are datetime
        creation_date = pd.to_datetime(creation_date)
        user_join_date = pd.to_datetime(user_join_date)

        # Simulate a few updates over time
        for update in range(random.randint(1, 5)):  # Randomly generate 1 to 5 progress updates
            chapters_completed = random.randint(1, total_chapters)  # Randomly increase chapters completed
            chapters_completed = min(chapters_completed, total_chapters)
            percentage_completed = (chapters_completed / total_chapters) * 100

            # Random time increment for each update, ensuring it's after both dates
            update_time = max(
                creation_date + timedelta(days=update * random.randint(1, 20)),  # Ensure no more than 20 days apart
                user_join_date + timedelta(days=random.randint(0, 30))  # Adding random days after join date
            )

            progress_updates.append({
                'id': len(progress_updates) + 1,
                'courseId': course_id,
                'userId': user_id,
                'updatedAt': update_time,
                'chapters_completed': chapters_completed,
                'percentage_completed': percentage_completed,
                'certificate': fake.file_name(extension='pdf') if percentage_completed == 100 else None  # Nullable field
            })

            if percentage_completed == 100:
                completed_courses.append(course_id)  # Mark course as completed

# Combine initial progress with updates
all_progress = initial_progress + progress_updates

# Convert to DataFrame
progress_df = pd.DataFrame(all_progress)

# Generate User Skill Proficiency
user_skills = []
for user_id in tqdm(range(1, NUM_USERS + 1), desc="Generating User Skills", unit="user"):
    for skill_id in random.sample(range(1, len(skills) + 1), k=random.randint(1, 5)):
        user_skills.append({
            'id': len(user_skills) + 1,
            'userId': user_id,
            'skillId': skill_id,
            'Proficiency_level': random.choice(['Beginner', 'Intermediate', 'Advanced'])
        })
user_skills_df = pd.DataFrame(user_skills)

# Generate Course Skills
course_skills = []
for course_id in tqdm(range(1, NUM_COURSES + 1), desc="Generating Course Skills", unit="course"):
    for skill_id in random.sample(range(1, len(skills) + 1), k=random.randint(1, 5)):
        course_skills.append({
            'id': len(course_skills) + 1,
            'courseId': course_id,
            'skillId': skill_id,
        })
course_skills_df = pd.DataFrame(course_skills)

# Save to CSV files
save_path = r'C:\Users\AnanyaSarkar\Documents\project\datascienceandengg\staging\raw\\'

users_df.to_csv(f'{save_path}users.csv', index=False)
courses_df.to_csv(f'{save_path}courses.csv', index=False)
designations_df.to_csv(f'{save_path}designations.csv', index=False)
skills_df.to_csv(f'{save_path}skills.csv', index=False)
progress_df.to_csv(f'{save_path}progress.csv', index=False)
user_skills_df.to_csv(f'{save_path}user_skills.csv', index=False)
course_skills_df.to_csv(f'{save_path}course_skills.csv', index=False)

print("Data generation completed and files saved.")


Generating Users: 100%|██████████| 10/10 [00:00<00:00, 21.73user/s]
Generating Courses: 100%|██████████| 30/30 [00:00<00:00, 14988.58course/s]
Generating Initial Progress: 100%|██████████| 300/300 [00:03<00:00, 92.70user/s]
Simulating Progress Updates: 100%|██████████| 300/300 [00:08<00:00, 35.53user/s]
Generating User Skills: 100%|██████████| 300/300 [00:00<00:00, 100094.76user/s]
Generating Course Skills: 100%|██████████| 30/30 [00:00<00:00, 29966.45course/s]


Data generation completed and files saved.


In [None]:
#for progress i want to show progress updates of employees in various courses they have been enrolled in . the user will always start with 0 'chapters_completed' , and will progress. it will be a timeseries data which means it wont overwrite the previous progressupdate. i dont need 100 as the end of every course for every employee but 0for every course for every employee is necessary. and the 'percentage_completed' is calculated by 'chapters_completed'*100/'total chapters' in that course