# Step 0: Create the Data

In [53]:
import pandas as pd

activity = pd.DataFrame({
    "Date": pd.date_range("2024-03-01", periods=10),
    "StudentID": [1, 1, 1, 2, 2, 2, 3, 3, 3, 3],
    "Steps": [8000, 9000, 7500, 6000, 6500, 7000, 10000, 11000, 10500, 9800],
    "StudyHours": [3, 4, 2, 2, 3, 3, 5, 6, 5, 4]
})

students = pd.DataFrame({
    "StudentID": [1, 2, 3],
    "Major": ["CS", "Math", "CS"]
})

In [54]:
activity

Unnamed: 0,Date,StudentID,Steps,StudyHours
0,2024-03-01,1,8000,3
1,2024-03-02,1,9000,4
2,2024-03-03,1,7500,2
3,2024-03-04,2,6000,2
4,2024-03-05,2,6500,3
5,2024-03-06,2,7000,3
6,2024-03-07,3,10000,5
7,2024-03-08,3,11000,6
8,2024-03-09,3,10500,5
9,2024-03-10,3,9800,4


In [55]:
students

Unnamed: 0,StudentID,Major
0,1,CS
1,2,Math
2,3,CS


# Step 1: Merge / Join (Data Integration)

In [56]:
df = pd.merge(activity, students, on="StudentID", how="left")
df

Unnamed: 0,Date,StudentID,Steps,StudyHours,Major
0,2024-03-01,1,8000,3,CS
1,2024-03-02,1,9000,4,CS
2,2024-03-03,1,7500,2,CS
3,2024-03-04,2,6000,2,Math
4,2024-03-05,2,6500,3,Math
5,2024-03-06,2,7000,3,Math
6,2024-03-07,3,10000,5,CS
7,2024-03-08,3,11000,6,CS
8,2024-03-09,3,10500,5,CS
9,2024-03-10,3,9800,4,CS


# Step 2: Time Series Preparation

In [57]:
df.set_index("Date", inplace=True)
df.sort_index(inplace=True)
df

Unnamed: 0_level_0,StudentID,Steps,StudyHours,Major
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-03-01,1,8000,3,CS
2024-03-02,1,9000,4,CS
2024-03-03,1,7500,2,CS
2024-03-04,2,6000,2,Math
2024-03-05,2,6500,3,Math
2024-03-06,2,7000,3,Math
2024-03-07,3,10000,5,CS
2024-03-08,3,11000,6,CS
2024-03-09,3,10500,5,CS
2024-03-10,3,9800,4,CS


#### Time order is important because trend analysis and rolling or expanding calculations only work correctly when data is in proper chronological order.
#### If the data is not sorted by time, future values may be used in calculations, leading to incorrect and misleading results.


# Step 3: GroupBy (Patterns by Major)

In [58]:
df.groupby("Major").mean()

Unnamed: 0_level_0,StudentID,Steps,StudyHours
Major,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CS,2.142857,9400.0,4.142857
Math,2.0,6500.0,2.666667


#### The CS major appears to be more active, as students in this group show higher average daily activity compared to other majors.

# Step 4: Pivot Table (Reporting View)

In [59]:
pivot_steps = pd.pivot_table(
    df,
    values='Steps',
    index='Date',
    columns='Major',
    aggfunc='mean'
)

print(pivot_steps)

Major            CS    Math
Date                       
2024-03-01   8000.0     NaN
2024-03-02   9000.0     NaN
2024-03-03   7500.0     NaN
2024-03-04      NaN  6000.0
2024-03-05      NaN  6500.0
2024-03-06      NaN  7000.0
2024-03-07  10000.0     NaN
2024-03-08  11000.0     NaN
2024-03-09  10500.0     NaN
2024-03-10   9800.0     NaN


# Step 5: Rolling Window (Short-Term Trends)

In [60]:
df['Steps_3Day_Avg'] = (
    df
    .groupby('StudentID')['Steps']
    .rolling(window=3)
    .mean()
    .reset_index(level=0, drop=True)
)

df

Unnamed: 0_level_0,StudentID,Steps,StudyHours,Major,Steps_3Day_Avg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-01,1,8000,3,CS,
2024-03-02,1,9000,4,CS,
2024-03-03,1,7500,2,CS,8166.666667
2024-03-04,2,6000,2,Math,
2024-03-05,2,6500,3,Math,
2024-03-06,2,7000,3,Math,6500.0
2024-03-07,3,10000,5,CS,
2024-03-08,3,11000,6,CS,
2024-03-09,3,10500,5,CS,10500.0
2024-03-10,3,9800,4,CS,10433.333333


# Step 6: Expanding Window (Long-Term Progress)

In [61]:
df['Avg_Study_So_Far'] = (
    df
    .groupby('StudentID')['Steps']
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)
)

df

Unnamed: 0_level_0,StudentID,Steps,StudyHours,Major,Steps_3Day_Avg,Avg_Study_So_Far
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-03-01,1,8000,3,CS,,8000.0
2024-03-02,1,9000,4,CS,,8500.0
2024-03-03,1,7500,2,CS,8166.666667,8166.666667
2024-03-04,2,6000,2,Math,,6000.0
2024-03-05,2,6500,3,Math,,6250.0
2024-03-06,2,7000,3,Math,6500.0,6500.0
2024-03-07,3,10000,5,CS,,10000.0
2024-03-08,3,11000,6,CS,,10500.0
2024-03-09,3,10500,5,CS,10500.0,10500.0
2024-03-10,3,9800,4,CS,10433.333333,10325.0


# Step 7: Insights & Filtering

In [74]:
df1 = df.groupby('StudentID')[['Steps', 'StudyHours']].mean()
filtered = df1[(df1['Steps'] > 8000) & (df1['StudyHours'] > 3)].sort_values(by='Steps')
filtered

Unnamed: 0_level_0,Steps,StudyHours
StudentID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,10325.0,5.0


# Step 8: Interpretation (Very Important)


More active students tend to study slightly more, showing a positive link between physical activity and academic engagement.
Over time, steps and study hours fluctuate, with peaks on certain days and gradual increases for some students.
The data helps identify student groups for targeted interventions and monitoring.
For ML models, it can be used to predict performance, detect at-risk students, and recommend personalized schedules.
Rolling and expanding averages provide useful context for analyzing behavior and time-based trends.