### Generating the student data using Faker

### Problem: 
##### You are tasked with building a predictive model that can estimate whether a student will successfully complete a given course based on the following factors: 
- Student Profile: Age, gender, major, academic year, and region. 
- Engagement Data: Number of logins per week, number of videos watched, time spent on platform, and quiz scores during the course. 
- Historical Data: Previous completion rates, average quiz scores across all courses, and the number of courses started but not completed. 

In [11]:
import pandas as pd
from faker import Faker
import random

# Initialize the Faker library
fake = Faker()

# Set the random seed for reproducibility
Faker.seed(42)
random.seed(42)

# Number of records to generate
num_students = 1000

# Generate Student Profile Data
student_profile_data = {
    'student_id': range(1, num_students + 1),
    'age': [random.randint(18, 27) for _ in range(num_students)],
    'gender': [random.choice(['Male', 'Female']) for _ in range(num_students)],
    'major': [random.choice(['Computer Science', 'Mechanical Engineering', 'Environmental Science',
                             'Civil Engineering', 'Electrical Engineering', 'Chemical Engineering', 'IT', 'Bio-Technology']) for _ in range(num_students)],
    'year': [random.randint(1, 4) for _ in range(num_students)],
    'region': [random.choice(['West Bengal', 'Delhi', 'Karnataka', 'Maharashtra', 'Gujarat', 'Tamil Nadu', 'Telenagana', 'Andra Pradesh', 'Kerala', 'UP', 'Rajasthan']) for _ in range(num_students)]
}

# Create DataFrame
student_profile_df = pd.DataFrame(student_profile_data)

# Generate Course Engagement Data
course_engagement_data = {
    'student_id': range(1, num_students + 1),
    'logins_per_week': [random.randint(1, 10) for _ in range(num_students)],
    'videos_watched': [random.randint(5, 20) for _ in range(num_students)],
    'time_spent': [random.randint(3, 15) for _ in range(num_students)],
    'avg_quiz_score': [random.randint(15, 100) for _ in range(num_students)]
}

# Create DataFrame
course_engagement_df = pd.DataFrame(course_engagement_data)

# Generate Historical Data
# Ensure courses_completed <= courses_started
historical_data = {
    'student_id': range(1, num_students + 1),
    'courses_started': [random.randint(1, 7) for _ in range(num_students)],
}

# Generate 'courses_completed' such that it's <= 'courses_started'
historical_data['courses_completed'] = [
    random.randint(0, started) for started in historical_data['courses_started']
]

# Generate 'avg_score_across_courses'
historical_data['avg_score_across_courses'] = [random.randint(15, 100) for _ in range(num_students)]

# Create DataFrame
historical_data_df = pd.DataFrame(historical_data)

# Display the first few rows of each DataFrame
print("Student Profile Data:")
print(student_profile_df.head())

print("\nCourse Engagement Data:")
print(course_engagement_df.head())

print("\nHistorical Data:")
print(historical_data_df.head())


Student Profile Data:
   student_id  age  gender                   major  year      region
0           1   19  Female                      IT     4   Rajasthan
1           2   18  Female          Bio-Technology     1  Telenagana
2           3   22  Female          Bio-Technology     3     Gujarat
3           4   21    Male       Civil Engineering     3  Tamil Nadu
4           5   21  Female  Electrical Engineering     2   Karnataka

Course Engagement Data:
   student_id  logins_per_week  videos_watched  time_spent  avg_quiz_score
0           1                9              10           8              94
1           2                2               6          10              98
2           3                2              14           5              71
3           4                4               5           6              59
4           5                9              14           5              98

Historical Data:
   student_id  courses_started  courses_completed  avg_score_across_cou

In [12]:
df_merge = pd.merge(student_profile_df, course_engagement_df, on = 'student_id')

df = pd.merge(df_merge, historical_data_df, on='student_id')


In [13]:
df

Unnamed: 0,student_id,age,gender,major,year,region,logins_per_week,videos_watched,time_spent,avg_quiz_score,courses_started,courses_completed,avg_score_across_courses
0,1,19,Female,IT,4,Rajasthan,9,10,8,94,4,0,50
1,2,18,Female,Bio-Technology,1,Telenagana,2,6,10,98,5,3,88
2,3,22,Female,Bio-Technology,3,Gujarat,2,14,5,71,7,6,61
3,4,21,Male,Civil Engineering,3,Tamil Nadu,4,5,6,59,7,4,60
4,5,21,Female,Electrical Engineering,2,Karnataka,9,14,5,98,7,7,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,24,Female,Chemical Engineering,2,West Bengal,5,19,13,51,6,2,94
996,997,23,Male,Environmental Science,1,Delhi,10,13,6,97,6,1,84
997,998,26,Male,Mechanical Engineering,3,Gujarat,5,18,11,73,4,2,87
998,999,22,Male,Civil Engineering,2,Delhi,1,6,12,59,7,2,52


In [15]:
df.to_csv('student_course_data.csv', index=False)