In [5]:
import pandas as pd
from tqdm import tqdm

# Locations of the staged, already-transformed extracts read by this cell.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them to a shared config cell so the notebook runs on other machines.
COURSES_CSV = r"C:\Users\AnanyaSarkar\Documents\project\datascienceandengg\staging\transformed\transformed_courses.csv"
PROGRESS_CSV = r"C:\Users\AnanyaSarkar\Documents\project\datascienceandengg\staging\transformed\transformed_progress.csv"

# Load the transformed courses and progress data
courses_df = pd.read_csv(COURSES_CSV)
progress_df = pd.read_csv(PROGRESS_CSV)

# No empty placeholder is created for fact_course_progress_df here: the fact
# table is produced by the courses/completion merge further down this cell,
# so a pre-initialised empty DataFrame would never be read.

# Combine updated day, month, and year into a single datetime column.
# The three parts are joined as a "YYYY-M-D" string and parsed by pandas.
progress_df['updated_date'] = pd.to_datetime(
    progress_df['updated_year'].astype(str) + '-' +
    progress_df['updated_month'].astype(str) + '-' +
    progress_df['updated_day'].astype(str)
)

# Columns that identify one learner's enrolment in one course.
key_cols = ['userId', 'courseId']

# Rows where a user reached 100% of a course
completed_courses = progress_df[progress_df['percentage_completed'] == 100]

# Rows where a user was at 0% of a course; their date is treated as the start date
start_courses = (
    progress_df[progress_df['percentage_completed'] == 0][key_cols + ['updated_date']]
    .rename(columns={'updated_date': 'start_date'})
)

# Pair every completion row with every start row of the same user and course
completion_times = pd.merge(completed_courses, start_courses, on=key_cols)
completion_times['completion_time'] = (
    completion_times['updated_date'] - completion_times['start_date']
).dt.days

# Collapse to exactly one row per (user, course). A user may have several 0% or
# 100% rows, and the merge above multiplies them; without this step the summed
# days would count the same user more than once while the divisor (unique
# users) would not, inflating the average.
per_user_course = (
    completion_times
    .assign(has_certificate=completion_times['certificate'].notna())
    .groupby(key_cols, as_index=False)
    .agg(
        first_start=('start_date', 'min'),
        first_completion=('updated_date', 'min'),
        has_certificate=('has_certificate', 'any'),
    )
)
per_user_course['completion_time'] = (
    per_user_course['first_completion'] - per_user_course['first_start']
).dt.days

# Aggregate per course. The certificate count is taken from this course's own
# completion rows, so a certificate earned on a different course is not counted.
course_completion = per_user_course.groupby('courseId').agg(
    total_days_to_complete=('completion_time', 'sum'),
    total_users_completed=('userId', 'nunique'),
    median_completion_time=('completion_time', 'median'),
    total_users_with_certificate=('has_certificate', 'sum'),
).reset_index()

# Calculate the average number of days to complete each course
course_completion['average_days_to_complete'] = (
    course_completion['total_days_to_complete'] / course_completion['total_users_completed']
)

# Calculate completion rate = users who completed / users with any progress row.
# The denominator is looked up by courseId with .map so it lines up row-by-row;
# dividing by a Series indexed on courseId would align on index labels instead
# and pair each course with the wrong denominator.
enrolled_users = progress_df.groupby('courseId')['userId'].nunique()
course_completion['completion_rate'] = (
    course_completion['total_users_completed']
    / course_completion['courseId'].map(enrolled_users)
)

# Merge course details; the left join keeps every course, including those
# nobody has completed (their completion metrics come through as NaN).
# duration and proficiency_level are carried over from courses_df by this
# merge, so they do not need to be assigned again afterwards.
fact_course_progress_df = pd.merge(courses_df, course_completion, on='courseId', how='left')

# Fill NaN completion metrics for courses that no user has completed, so all
# of the metrics are treated consistently; counts are cast back to integers.
fact_course_progress_df = fact_course_progress_df.fillna({
    'total_users_completed': 0,
    'total_users_with_certificate': 0,
    'total_days_to_complete': 0,
    'average_days_to_complete': 0,
    'median_completion_time': 0,
    'completion_rate': 0,
}).astype({
    'total_users_completed': int,
    'total_users_with_certificate': int,
})

# Add per-course maxima from progress_df. Values are looked up by courseId
# (.map) rather than assigned positionally, so they stay correct when
# courses_df is not sorted by courseId or a course has no progress rows
# (such courses get NaN).
# NOTE(review): total_chapters is the highest chapters_completed seen for the
# course, not necessarily the course's chapter count - confirm this is intended.
progress_by_course = progress_df.groupby('courseId')
fact_course_progress_df['total_chapters'] = fact_course_progress_df['courseId'].map(
    progress_by_course['chapters_completed'].max()
)
fact_course_progress_df['percentage_completed'] = fact_course_progress_df['courseId'].map(
    progress_by_course['percentage_completed'].max()
)

# Add date of the last update for each course
last_update = (
    progress_by_course['updated_date']
    .max()
    .reset_index()
    .rename(columns={'updated_date': 'last_update_date'})
)
fact_course_progress_df = pd.merge(fact_course_progress_df, last_update, on='courseId', how='left')

# Plain-text preview of the first five rows of the fact table
print(fact_course_progress_df.head(5))

# Persist the fact table to the staging area as CSV, without the index column
fact_course_progress_df.to_csv(
    path_or_buf=r"C:/Users/AnanyaSarkar/Documents/project/datascienceandengg/staging/fact_course_progress.csv",
    index=False,
)


   courseId                                          title proficiency_level  \
0         1                    Java Programming Essentials          Beginner   
1         2  Mastering Java Swing for Desktop Applications          Beginner   
2         3                        SQL for Data Management          Advanced   
3         4                    C Programming for Beginners          Beginner   
4         5                         Advanced C# Techniques          Advanced   

   chapters  duration  course_age duration_category  \
0         5         2        1634             Short   
1         9        10        1487              Long   
2        10        10         150              Long   
3        15         7        1096            Medium   
4         7         5        1321            Medium   

   proficiency_level_encoded  created_day  created_month  created_year  \
0                          1           18              4          2020   
1                          1           1

In [6]:
# Write fact_course_progress_df to the staging CSV (index column omitted).
# This targets the same file that the previous cell writes as its last step.
fact_course_progress_df.to_csv(
    path_or_buf=r"C:/Users/AnanyaSarkar/Documents/project/datascienceandengg/staging/fact_course_progress.csv",
    index=False,
)