Before running this notebook, edit the original Excel file so that the `Duration (min:sec)` header is in the first (header) row.

In [1]:
import pandas as pd
import numpy as np

# Define a function to keep the original format as a string
def keep_original_format(x):
    """Return the cell value rendered as text.

    Passed to ``pd.read_excel`` as a column converter so the value is kept
    as a string.
    """
    as_text = str(x)
    return as_text
# Read the activity report, routing the "Duration (min:sec)" column through
# the converter so its values arrive as text.
file_path = 'student8/Activity Report Student 8.xlsx'
duration_converters = {'Duration (min:sec)': keep_original_format}
df = pd.read_excel(file_path, converters=duration_converters)

In [2]:
df.head()

Unnamed: 0,Start Time,End Time,Duration (min:sec),Grade | Section | Session | Skill,Score,At LC,Points,Outcome
0,2020-08-09 00:00:00,2020-08-09 00:00:00,01:31:00,Grade 4 | Math | P |,67.71(0.80),,--,"G4, 1.1Level 2 ---> Level 3 correct, (time: 11..."
1,17:09:53,17:11:24,,,,,,"G4, 1.2Level 2 ---> Level 3 correct, (time: 30..."
2,,,,,,,,"G4, 1.3Level 2 ---> Level 3 correct, (time: 6/..."
3,,,,,,,,"G4, 1.4Level 2 ---> Level 3 correct, (time: 16..."
4,,,,,,,,"G4, 1.5Level 2 ---> Level 3 correct, (time: 13..."


In [3]:
# Set option to display all rows
pd.set_option('display.max_rows', None)

In [4]:
df.shape

(38390, 8)

In [5]:
df.head(10)

Unnamed: 0,Start Time,End Time,Duration (min:sec),Grade | Section | Session | Skill,Score,At LC,Points,Outcome
0,2020-08-09 00:00:00,2020-08-09 00:00:00,01:31:00,Grade 4 | Math | P |,67.71(0.80),,--,"G4, 1.1Level 2 ---> Level 3 correct, (time: 11..."
1,17:09:53,17:11:24,,,,,,"G4, 1.2Level 2 ---> Level 3 correct, (time: 30..."
2,,,,,,,,"G4, 1.3Level 2 ---> Level 3 correct, (time: 6/..."
3,,,,,,,,"G4, 1.4Level 2 ---> Level 3 correct, (time: 16..."
4,,,,,,,,"G4, 1.5Level 2 ---> Level 3 correct, (time: 13..."
5,,,,,,,,"G4, 1.6Level 2 ---> Level 3 correct, (time: 8/..."
6,2020-08-09 00:00:00,2020-08-09 00:00:00,01:50:00,Grade 4 | Math | P |,68.03(0.32),,--,"G4, 1.7Level 2 ---> Level 3 correct, (time: 6/..."
7,17:11:36,17:13:26,,,,,,"G4, 1.8Level 2 ---> Level 3 correct, (time: 14..."
8,,,,,,,,"G4, 1.9Level 2 ---> Level 3 correct, (time: 21..."
9,,,,,,,,"G4, 2.1Level 2 ---> Level 3 correct, (time: 16..."


In [6]:
# Check whether any row repeats the header text 'Start Time' in the
# "Start Time" column; such rows are filtered out in a later cell.
df["Start Time"].eq('Start Time').any()

False

In [7]:
df["Duration (min:sec)"].eq('(min:sec)').any()

False

In [8]:
# Drop rows that repeat the header text. NaN cells compare as "not equal" and
# are therefore kept. The explicit copy makes df an independent frame, so the
# column assignments in later cells do not operate on a slice of the original
# (which would trigger SettingWithCopyWarning).
is_data_row = (df["Start Time"] != "Start Time") & (df['Duration (min:sec)'] != '(min:sec)')
df = df[is_data_row].copy()

### Start Time and End Time

In [9]:
# Each timestamp is split over two rows: one row holds the date
# ("YYYY-MM-DD 00:00:00") and the row below holds the clock time ("HH:MM:SS").
# Merge both into a single datetime stored on the date row; every other row
# becomes NaT.
DATE_ROW_PATTERN = r"^\d{4}-\d{2}-\d{2} 00:00:00$"
TIME_ROW_PATTERN = r"^\d{2}:\d{2}:\d{2}$"


def combine_date_and_next_row_time(column):
    """Merge a date row with the clock time found in the following row.

    Parameters
    ----------
    column : pandas.Series
        Raw "Start Time" / "End Time" column.

    Returns
    -------
    pandas.Series
        Datetime values on rows where the current cell is a date and the next
        cell is a time; NaT everywhere else.

    Raises
    ------
    Whatever ``pd.to_datetime(..., errors="raise")`` raises when a combined
    string cannot be parsed.
    """
    # Replace NaN with empty string, then cast to str so the .str accessors work.
    text = column.fillna("").astype(str)

    is_date = text.str.match(DATE_ROW_PATTERN, na=False)
    # The last row has no successor: fill_value=False keeps the shifted mask
    # strictly boolean instead of introducing a NaN.
    next_is_time = text.str.match(TIME_ROW_PATTERN, na=False).shift(-1, fill_value=False)

    date_part = text.str.split(" ").str[0]          # e.g. "2020-08-05"
    combined = date_part + " " + text.shift(-1)     # e.g. "2020-08-05 19:08:49"

    # Keep the combined text only where both conditions hold; the rest is NaN.
    return pd.to_datetime(combined.where(is_date & next_is_time), errors="raise")


for time_col in ("Start Time", "End Time"):
    df[time_col] = combine_date_and_next_row_time(df[time_col])

In [10]:
df.shape

(38390, 8)

In [11]:
# Move both timestamps three hours earlier.
# NOTE(review): presumably a time-zone correction — confirm the source offset.
three_hours = pd.Timedelta(hours=3)
for time_col in ('Start Time', 'End Time'):
    df[time_col] = df[time_col] - three_hours

In [12]:
# Drop rows where all contents are NaN
df = df.dropna(how='all').reset_index(drop=True)

In [13]:
df.shape

(37124, 8)

### Duration

In [14]:
import re

def update_duration_column(duration):
    """Rewrite a duration that was read as ``H:M:S`` as two-field text.

    The first two clock fields are kept and the third is dropped, e.g.
    ``"01:31:00"`` -> ``"01:31"``. When the value carries a day component
    (``"1 day, 2:30:00"``) the days are folded into the first field
    (``days * 24 + first``), giving ``"26:30"``.

    Parameters
    ----------
    duration : str or scalar
        Raw cell value. Missing values are returned unchanged; any other
        non-string scalar is converted with ``str`` before parsing.

    Returns
    -------
    str or the original value
        The reformatted text, or ``duration`` itself when it is missing or
        matches neither layout.
    """
    if pd.isna(duration):
        return duration  # Missing values pass through untouched

    # Tolerate non-string cells (e.g. time/timedelta objects) instead of
    # failing on the substring test below.
    text = duration if isinstance(duration, str) else str(duration)

    if 'day' in text:
        # "<d> day(s), <h>:<m>:<s>" -> fold the days into the leading field
        match = re.match(r'(\d+)\s*day[s]*,\s*(\d+):(\d+):(\d+)', text)
        if match:
            days = int(match.group(1))
            leading = int(match.group(2))
            return f"{days * 24 + leading}:{match.group(3)}"
        return duration  # Unexpected layout: leave as is

    parts = text.split(':')
    if len(parts) == 3:
        # Three fields: keep the first two, drop the last
        return f"{parts[0]}:{parts[1]}"

    return duration  # Unexpected layout: leave as is

# Normalise every duration; missing cells are passed through unchanged
df['Duration (min:sec)'] = df['Duration (min:sec)'].map(
    lambda value: value if pd.isna(value) else update_duration_column(value)
)
# Save the final df to Excel
#df.to_excel('test1.xlsx', index=False)

In [15]:
# Clean up whitespace issues
df["Grade | Section | Session | Skill"] = df["Grade | Section | Session | Skill"].str.strip()

# Extract 'Grade' (e.g., "6")
df['Grade'] = df['Grade | Section | Session | Skill'].str.extract(r'Grade\s+(\d+)')[0]

# Extract 'Session' (e.g., "L", "P", "A") and map to full names.
# The letter must fill a whole "|"-delimited field (it has to be followed by
# "|" or the end of the text); otherwise a Section name that merely starts
# with L, P or A would be mistaken for the session code.
# NOTE(review): 'AFFICIENCY' looks like a misspelling — confirm the intended
# name for session code "A" before changing it, since later steps may rely on it.
df['Session'] = df['Grade | Section | Session | Skill'].str.extract(r'\|\s*([LPA])\s*(?:\||$)')[0].map({
    'L': 'LEARNING',
    'P': 'PROFICIENCY',
    'A': 'AFFICIENCY'
})

# Extract 'Skill' (e.g., "1.1" or blank): the digits/dots after the last "|"
df['Skill'] = df['Grade | Section | Session | Skill'].str.extract(r'\|\s*([\d\.]*)\s*$')[0]

In [16]:
# Pull the "G<grade>, <skill>" prefix out of Outcome and remove the comma
# (e.g. "G6, 1.1" -> "G6 1.1").
outcome_prefix = df['Outcome'].str.extract(r'(G\d+,\s*\d+\.\d+)')[0]
df['skill_head'] = outcome_prefix.str.replace(',', '')

# Rows flagged as LEARNING build the label from the Grade and Skill columns instead.
is_learning_row = df['Session'] == 'LEARNING'
df.loc[is_learning_row, 'skill_head'] = "G" + df['Grade'] + " " + df['Skill']

In [17]:
# Regex -> level-change code. Each code is "<level before><level after>"
# (0 = To Be Learned, 4 = Proficient/Proficiency, 0.5 = Level 1 make-up).
# Patterns are applied one after another in the order listed below; once a
# cell has been replaced by its code, the remaining patterns no longer match it.
# The key '.*Level 2 ---> Level 3.*' used to be listed twice; a dict keeps only
# one entry per key, so the second occurrence has been removed.
# NOTE(review): the make-up patterns differ in spacing ("Level 1- Make-up" vs
# "Level 1 - Make-up") — confirm both spellings occur in the data.
replacement_patterns = {
    r'.*To Be Learned to Level 1.*': '01',
    r'.*Level 1 ---> To Be Learned.*': '10',
    r'.*To Be Learned \(No Change\).*': '00',
    r'.*Level 1 \(No Change\).*': '11',
    r'.*Level 1\xa0\(No Change\).*': '11',
    r'.*Level 2 \(No Change\).*': '22',
    r'.*Level 3 \(No Change\).*': '33',
    r"^Level 3 \(No Change\)$": '33',
    r"^Level 2 \(No Change\)$": '22',
    r"^Level 1 to Level 2$": '12',
    r"^Level 2 to Level 3$": '23',
    r"^Level 2 to Proficiency$": '24',
    r'.*Level 2\s*--->\s*Proficient.*': '24',
    r"^Level 3 to Proficiency$": '34',
    r'.*Proficiency \(No Change\).*': '44', #### not sure, this is in learning session
    r'.*Proficient \(No Change\).*': '44',
    r'.*Level 1 ---> Level 2.*': '12',
    r'.*Level 1 ---> Level 3.*': '13',
    r'.*Level 2 ---> Level 3.*': '23',
    r'.*Level 3 ---> Level 4.*': '34',
    r'.*Level 3 ---> Proficient.*': '34',
    r'.*Proficient ---> Level 3.*': '43',
    r'.*Level 2 ---> Level 1.*': '21',
    r'.*Level 3 ---> Level 2.*': '32',
    r'.*Level 3 ---> To Be Learned.*': '30',
    r'.*Level 1 ---> Level 1- Make-up Level.*': '10.5',
    r'.*Level 1 - Make-up Level ---> Level 2.*': '0.52',
    r'.*Level 1 - Make-up Level ---> To Be Learned.*': '0.50',
    r'.*Proficiency\xa0to\xa0Level 3.*': '43',
    r'.*Level 1\xa0to\xa0To Be Learned.*': '10',
    r'.*Level 3\xa0to\xa0Level 2.*': '32' ,
    r'.*Level 3\s*--->\xa0Level 2.*': '32'
}

# Start from the raw Outcome text
df['skill_level_change'] = df['Outcome']

# Replace whole cells with their code, one pattern at a time
for pattern, value in replacement_patterns.items():
    df['skill_level_change'] = df['skill_level_change'].str.replace(pattern, value, regex=True)


In [18]:
df.head()

Unnamed: 0,Start Time,End Time,Duration (min:sec),Grade | Section | Session | Skill,Score,At LC,Points,Outcome,Grade,Session,Skill,skill_head,skill_level_change
0,2020-08-09 14:09:53,2020-08-09 14:11:24,01:31,Grade 4 | Math | P |,67.71(0.80),,--,"G4, 1.1Level 2 ---> Level 3 correct, (time: 11...",4.0,PROFICIENCY,,G4 1.1,23
1,NaT,NaT,,,,,,"G4, 1.2Level 2 ---> Level 3 correct, (time: 30...",,,,G4 1.2,23
2,NaT,NaT,,,,,,"G4, 1.3Level 2 ---> Level 3 correct, (time: 6/...",,,,G4 1.3,23
3,NaT,NaT,,,,,,"G4, 1.4Level 2 ---> Level 3 correct, (time: 16...",,,,G4 1.4,23
4,NaT,NaT,,,,,,"G4, 1.5Level 2 ---> Level 3 correct, (time: 13...",,,,G4 1.5,23


In [19]:
df['Outcome'][671:680]

671    G4, 6.10Proficient (No Change) correct, (time:...
672    G4, 3.10Proficient (No Change) correct, (time:...
673    G4, 4.7Proficient (No Change) correct, (time: ...
674    G4, 2.9Proficient (No Change) correct, (time: ...
675    G4, 6.1Proficient (No Change) correct, (time: ...
676    G4, 5.3Proficient (No Change) correct, (time: ...
677    G4, 1.3Proficient (No Change) correct, (time: ...
678    G4, 7.5Proficient (No Change) correct, (time: ...
679    G4, 4.5Proficient (No Change) correct, (time: ...
Name: Outcome, dtype: object

In [20]:
print(df.loc[df['Outcome'].str.contains('Assisted', na=False), 'Outcome'].unique())

['Assisted Ex\xa01\xa0not complete' 'Assisted Ex\xa02\xa0not complete'
 'Assisted Ex\xa03\xa0not complete' 'Assisted Ex\xa01\xa0correct'
 'Assisted Ex\xa01\xa0wrong' 'Assisted Ex\xa02\xa0incomplete'
 'Assisted Ex\xa04\xa0not complete' 'Assisted Ex\xa011\xa0not complete'
 'Assisted Ex\xa06\xa0not complete' 'Assisted Ex\xa02\xa0correct'
 'Assisted Ex\xa01\xa0incomplete']


In [21]:
# -----------------------
# Create a group identifier
# -----------------------
non_nan_mask = df['Session'].notna()
df['group'] = non_nan_mask.cumsum()

In [22]:
# Define the regex patterns.
# \d+ accepts exercise/demo numbers of any length (the data contains entries
# such as "Assisted Ex 11 ..."), and \s matches both ordinary spaces and the
# non-breaking spaces (\xa0) that appear in the Outcome text.
pattern_demo = r'Demo\s+\d+\s+complete!'
pattern_assisted_ex = r'Assisted\s?Ex\s+\d+\s+correct'


def _count_learning_events(g):
    """Count completed demos and correct assisted exercises for one group.

    ``g`` holds the 'Session' and 'Outcome' columns of a single group. Both
    counts are 0 unless the group's first row is a LEARNING session.
    """
    if g['Session'].iloc[0] != 'LEARNING':
        return pd.Series({'L_num_completed_demo': 0, 'L_num_correct_assisted': 0})
    return pd.Series({
        'L_num_completed_demo': g['Outcome'].str.contains(pattern_demo, na=False).sum(),
        'L_num_correct_assisted': g['Outcome'].str.contains(pattern_assisted_ex, na=False).sum(),
    })


# Group by 'group' over only the columns we need: 'Session' and 'Outcome'
counts = df.groupby('group')[['Session', 'Outcome']].apply(_count_learning_events)

# Join the per-group counts back onto every row of df
df = df.join(counts, on='group')

In [23]:
# Define a function to process the 'Outcome' column while preserving sequence
def extract_outcome_sequence(outcome):
    """Encode an outcome text as a digit string.

    'wrong' becomes '0', 'correct' becomes '1' and 'hint' becomes '2'; every
    character other than 0, 1 or 2 is then discarded. Non-string input
    (e.g. NaN) yields an empty string.
    """
    if not isinstance(outcome, str):
        return ''

    encoded = outcome
    for word, digit in (('wrong', '0'), ('correct', '1'), ('hint', '2')):
        encoded = encoded.replace(word, digit)

    # Keep only the code digits; separators and any other text are dropped
    kept = [symbol for symbol in encoded if symbol in '012']
    return ''.join(kept)

# Define a function to extract the sequence of "wrong", "correct", and "hint"
def extract_outcome_se(outcome):
    """List the 'wrong' / 'correct' / 'hint' markers of an outcome, in order.

    Only texts starting with "G" or "(#" are processed; anything else, and
    any non-string input, gives an empty string. The text is lower-cased and
    split on whitespace; each word contributes at most one marker, checked
    in the order wrong, correct, hint. Markers are joined with ", ".
    """
    if not isinstance(outcome, str):
        return ''
    if not outcome.startswith(("G", "(#")):
        return ''

    markers = ('wrong', 'correct', 'hint')
    found = []
    for token in outcome.lower().split():
        hit = next((marker for marker in markers if marker in token), None)
        if hit is not None:
            found.append(hit)
    return ', '.join(found)


# Word form of each outcome, e.g. "wrong, wrong, correct"
df['Outcome_se'] = df['Outcome'].map(extract_outcome_se)

# Digit code derived from that word form, e.g. "001"
df['Outcome_str'] = df['Outcome_se'].map(extract_outcome_sequence)

In [24]:
df=df.drop(columns=['Outcome_se','Skill','Grade | Section | Session | Skill'])

* L_num_qs_first_correct
* L_num_wrong_start_correct_end

In [25]:
# True on every row of a group whose first row is a LEARNING session
learning_groups = df.groupby("group")["Session"].transform(
    lambda sessions: sessions.iloc[0] == "LEARNING"
)


def _running_count_in_learning_groups(flags):
    """Per-group running count of flagged rows, counted only inside LEARNING groups."""
    counted = learning_groups & flags
    return counted.groupby(df["group"]).cumsum().astype(int)


# Running count of rows whose Outcome_str starts with "1"
df["L_num_qs_first_correct"] = _running_count_in_learning_groups(
    df["Outcome_str"].str.startswith("1")
)

# Running count of rows whose Outcome_str starts with "0" and ends with "1"
df["L_num_wrong_start_correct_end"] = _running_count_in_learning_groups(
    df["Outcome_str"].str.match(r"0.*1$")
)

# Running count of question rows (Outcome starting with "(#")
df["L_total_qs"] = _running_count_in_learning_groups(
    df["Outcome"].str.startswith("(#")
)


In [26]:
# # Count occurrences where "Outcome_str" starts with "0" and ends with "1" in each "LEARNING" group
# count_wrong_start_correct_end = (
#     df[learning_groups & df["Outcome_str"].str.match(r"0.*1$")]
#     .groupby("group")["Outcome_str"]
#     .count()
# )

# # Initialize the new column with 0
# df["L_num_wrong_start_correct_end"] = 0

# # Assign the count only to the first row of each group
# df.loc[df.groupby("group").head(1).index, "L_num_wrong_start_correct_end"] = df["group"].map(count_wrong_start_correct_end).fillna(0).astype(int)


### count number of questions

In [27]:
# counts = (
#     df.groupby('group')[['Session', 'Outcome']]
#       .apply(lambda g: g['Outcome'].str.startswith('(#').sum() 
#                       if g['Session'].iloc[0] == 'LEARNING' 
#                       else 0)
# )
# df['L_total_qs'] = df['group'].map(counts)


In [28]:
# Patterns to remove
patterns = [r'Answer History:']

# Remove rows with matching patterns in the 'Outcome' column
df = df[~df['Outcome'].str.contains('|'.join(patterns), na=False)]

In [29]:
df.shape

(34870, 18)

In [30]:
# # -----------------------
# # Define your function
# # -----------------------
# def concatenate_outcomes(group):
#     """
#     If the group's first row is 'LEARNING', concatenate all
#     'Outcome_str' values in that group, place them in the first row,
#     and set subsequent rows' 'Outcome_str' to NaN.
#     """
#     if group['Session'].iloc[0] == 'LEARNING':
#         concatenated = ''.join(group['Outcome_str'])
#         group.at[group.index[0], 'Outcome_str'] = concatenated
#         group.loc[group.index[1:], 'Outcome_str'] = np.nan
#     return group

# # -----------------------
# # Apply the function only on the needed columns
# # -----------------------
# df[['Session', 'Outcome_str']] = (
#     df[['Session', 'Outcome_str']]                 # Select only relevant columns
#     .groupby(df['group'], group_keys=False)        # Group by 'group'
#     .apply(concatenate_outcomes)                   # Apply the function
# )
# Carry each Session value down onto the rows below it until the next value,
# so every row is labelled. (The LEARNING mask is built further down, right
# before it is used; the duplicate assignment that stood here was redundant.)
df["Session"] = df["Session"].ffill()

def cum_substrings(s: str):
    """Return every non-empty prefix of ``str(s)``, shortest first.

    Example: "001" -> ["0", "00", "001"]; an empty string gives [].
    """
    text = str(s)
    prefixes = []
    for end in range(1, len(text) + 1):
        prefixes.append(text[:end])
    return prefixes

mask = df['Session'].eq('LEARNING')

# Outcome_str must be text before prefixes are taken
df['Outcome_str'] = df['Outcome_str'].astype(str)

# LEARNING rows: replace the code with the list of its prefixes
# (an empty code becomes an empty list). Other rows keep their plain string.
learning_prefixes = df.loc[mask, 'Outcome_str'].apply(cum_substrings)
df.loc[mask, 'Outcome_str'] = learning_prefixes

# One output row per list element; an empty list yields a single NaN row,
# and plain strings are left as they are.
df = df.explode("Outcome_str")

# -----------------------
# Clean up rows with NaN 'Outcome_str'
# ----------------------

#df = df[~df['Outcome'].str.startswith(('Demo', 'Assisted'))].reset_index(drop=True)


In [31]:
df.head(50)

Unnamed: 0,Start Time,End Time,Duration (min:sec),Score,At LC,Points,Outcome,Grade,Session,skill_head,skill_level_change,group,L_num_completed_demo,L_num_correct_assisted,Outcome_str,L_num_qs_first_correct,L_num_wrong_start_correct_end,L_total_qs
0,2020-08-09 14:09:53,2020-08-09 14:11:24,01:31,67.71(0.80),,--,"G4, 1.1Level 2 ---> Level 3 correct, (time: 11...",4.0,PROFICIENCY,G4 1.1,23,1,0,0,1.0,0,0,0
1,NaT,NaT,,,,,"G4, 1.2Level 2 ---> Level 3 correct, (time: 30...",,PROFICIENCY,G4 1.2,23,1,0,0,1.0,0,0,0
2,NaT,NaT,,,,,"G4, 1.3Level 2 ---> Level 3 correct, (time: 6/...",,PROFICIENCY,G4 1.3,23,1,0,0,1.0,0,0,0
3,NaT,NaT,,,,,"G4, 1.4Level 2 ---> Level 3 correct, (time: 16...",,PROFICIENCY,G4 1.4,23,1,0,0,1.0,0,0,0
4,NaT,NaT,,,,,"G4, 1.5Level 2 ---> Level 3 correct, (time: 13...",,PROFICIENCY,G4 1.5,23,1,0,0,1.0,0,0,0
5,NaT,NaT,,,,,"G4, 1.6Level 2 ---> Level 3 correct, (time: 8/...",,PROFICIENCY,G4 1.6,23,1,0,0,1.0,0,0,0
6,2020-08-09 14:11:36,2020-08-09 14:13:26,01:50,68.03(0.32),,--,"G4, 1.7Level 2 ---> Level 3 correct, (time: 6/...",4.0,PROFICIENCY,G4 1.7,23,2,0,0,1.0,0,0,0
7,NaT,NaT,,,,,"G4, 1.8Level 2 ---> Level 3 correct, (time: 14...",,PROFICIENCY,G4 1.8,23,2,0,0,1.0,0,0,0
8,NaT,NaT,,,,,"G4, 1.9Level 2 ---> Level 3 correct, (time: 21...",,PROFICIENCY,G4 1.9,23,2,0,0,1.0,0,0,0
9,NaT,NaT,,,,,"G4, 2.1Level 2 ---> Level 3 correct, (time: 16...",,PROFICIENCY,G4 2.1,23,2,0,0,1.0,0,0,0


In [32]:
df.shape

(37760, 18)

In [33]:
# Two numbers are read from Outcome text such as "(time: 11/30 sec)":
# the one between "time:" and "/", and the one between "/" and "sec".
timing_patterns = {
    'first_answer_time': r'time:\s*(\d+)/',
    'skill_specified_time': r'/(\d+)\s*sec',
}

for column_name, pattern_text in timing_patterns.items():
    captured = df['Outcome'].str.extract(pattern_text)[0]
    # Numeric conversion; anything unparseable raises instead of passing silently
    df[column_name] = pd.to_numeric(captured, errors='raise')


In [34]:
df.head()

Unnamed: 0,Start Time,End Time,Duration (min:sec),Score,At LC,Points,Outcome,Grade,Session,skill_head,skill_level_change,group,L_num_completed_demo,L_num_correct_assisted,Outcome_str,L_num_qs_first_correct,L_num_wrong_start_correct_end,L_total_qs,first_answer_time,skill_specified_time
0,2020-08-09 14:09:53,2020-08-09 14:11:24,01:31,67.71(0.80),,--,"G4, 1.1Level 2 ---> Level 3 correct, (time: 11...",4.0,PROFICIENCY,G4 1.1,23,1,0,0,1,0,0,0,11.0,30.0
1,NaT,NaT,,,,,"G4, 1.2Level 2 ---> Level 3 correct, (time: 30...",,PROFICIENCY,G4 1.2,23,1,0,0,1,0,0,0,30.0,60.0
2,NaT,NaT,,,,,"G4, 1.3Level 2 ---> Level 3 correct, (time: 6/...",,PROFICIENCY,G4 1.3,23,1,0,0,1,0,0,0,6.0,30.0
3,NaT,NaT,,,,,"G4, 1.4Level 2 ---> Level 3 correct, (time: 16...",,PROFICIENCY,G4 1.4,23,1,0,0,1,0,0,0,16.0,30.0
4,NaT,NaT,,,,,"G4, 1.5Level 2 ---> Level 3 correct, (time: 13...",,PROFICIENCY,G4 1.5,23,1,0,0,1,0,0,0,13.0,30.0


In [35]:
def split_skill_level(skill_level_change):
    """Split a level-change code into (level_begin, level_end).

    The three make-up codes '10.5', '0.52' and '0.50' are split around the
    '0.5' level. Any other string of length two or more is split into its
    first and second character. Missing values, non-strings and strings
    shorter than two characters give (pd.NA, pd.NA).
    """
    missing = (pd.NA, pd.NA)
    if pd.isna(skill_level_change) or not isinstance(skill_level_change, str):
        return missing

    # Codes that involve the half level cannot be split character by character
    make_up_codes = {
        '10.5': ('1', '0.5'),
        '0.52': ('0.5', '2'),
        '0.50': ('0.5', '0'),
    }
    if skill_level_change in make_up_codes:
        return make_up_codes[skill_level_change]

    if len(skill_level_change) < 2:
        return missing
    return skill_level_change[0], skill_level_change[1]


# For rows selected by learning_groups (groups whose first session is
# LEARNING), overwrite each listed column with that group's first non-null
# value (groupby 'first' skips missing entries), so every row of the group
# carries the same values.
# NOTE(review): learning_groups was computed in an earlier cell, before rows
# were filtered and exploded; .loc matches it to the current rows by index
# label, which presumably relies on exploded rows keeping their original
# label — confirm before reordering or resetting the index upstream.
cols = ['skill_level_change', 'skill_head','Start Time','End Time','Duration (min:sec)','Score', 'At LC','Points','Grade']
df.loc[learning_groups, cols] = (
    df.loc[learning_groups, cols]
      .groupby(df.loc[learning_groups, 'group'])[cols]
      .transform('first')
)


# Split each level-change code into its begin and end level; the returned
# pair is wrapped in a Series so it spreads over the two new columns.
df[['level_begin', 'level_end']] = df['skill_level_change'].apply(lambda x: pd.Series(split_skill_level(x)))


In [36]:
df[df['Session']=="LEARNING"].head(20)

Unnamed: 0,Start Time,End Time,Duration (min:sec),Score,At LC,Points,Outcome,Grade,Session,skill_head,...,L_num_completed_demo,L_num_correct_assisted,Outcome_str,L_num_qs_first_correct,L_num_wrong_start_correct_end,L_total_qs,first_answer_time,skill_specified_time,level_begin,level_end
24,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,To Be Learned to Level 1,4,LEARNING,G4 2.6,...,0,0,,0,0,0,,,0,1
26,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,"(#1)wrong, wrong, correct.",4,LEARNING,G4 2.6,...,0,0,0.0,0,1,1,,,0,1
26,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,"(#1)wrong, wrong, correct.",4,LEARNING,G4 2.6,...,0,0,0.0,0,1,1,,,0,1
26,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,"(#1)wrong, wrong, correct.",4,LEARNING,G4 2.6,...,0,0,1.0,0,1,1,,,0,1
27,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,(#2)correct.,4,LEARNING,G4 2.6,...,0,0,1.0,1,1,2,,,0,1
28,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,(#3)correct.,4,LEARNING,G4 2.6,...,0,0,1.0,2,1,3,,,0,1
29,2020-08-09 14:36:51,2020-08-09 14:44:39,07:48,68.14(0.78),,--,(#4)correct.,4,LEARNING,G4 2.6,...,0,0,1.0,3,1,4,,,0,1
30,2020-08-10 11:28:21,2020-08-10 11:34:29,06:08,68.14(0.00),,--,To Be Learned to Level 1,4,LEARNING,G4 2.7,...,0,0,,0,0,0,,,0,1
31,2020-08-10 11:28:21,2020-08-10 11:34:29,06:08,68.14(0.00),,--,Demo 1 not complete!,4,LEARNING,G4 2.7,...,0,0,,0,0,0,,,0,1
33,2020-08-10 11:28:21,2020-08-10 11:34:29,06:08,68.14(0.00),,--,(#1)correct.,4,LEARNING,G4 2.7,...,0,0,1.0,1,0,1,,,0,1


In [37]:
df[df['Session']=="LEARNING"]['group'].head(20)

24     5
26     5
26     5
26     5
27     5
28     5
29     5
30     6
31     6
33     6
34     6
35     6
36     6
37     6
77    15
78    15
79    15
81    15
81    15
82    15
Name: group, dtype: int64

In [38]:
# Method 1: on the sliced Series
idx_all = df.index[df['Session'] == "LEARNING"]
# first 20 via slicing
print(idx_all[:20])

Index([24, 26, 26, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 77, 78, 79, 81,
       81, 82],
      dtype='int64')


In [39]:
df = df.dropna(subset=['Outcome_str']).reset_index(drop=True)

In [40]:
cols = ['Start Time', 'End Time', 'Duration (min:sec)',
        'Score', 'At LC', 'Points', 'Grade']


def _blank_after_first_row(values):
    """Keep the value on a group's first row and mask every later row as missing."""
    first_label = values.index[0]
    return values.where(values.index == first_label)


# Session-level fields stay only on the first row of each group
df[cols] = df.groupby('group')[cols].transform(_blank_after_first_row)

Remove duplicate groups (groups whose first-row values repeat those of an earlier group).

In [41]:
# Columns that form a group's signature (spellings must match df)
cols = ['Start Time', 'End Time', 'Duration (min:sec)',
        'Score', 'At LC', 'Points', 'Grade']

# One summary row per group, in order of first appearance; the index is the
# group label. groupby.first takes each column's first non-null value.
first_rows = df.groupby('group', sort=False).first()

# Groups whose signature in those seven columns already appeared earlier
dup_mask = first_rows.duplicated(subset=cols, keep='first')
groups_to_drop = first_rows.index[dup_mask]

# Remove every row that belongs to a repeated group; row order is unchanged
keep_row = ~df['group'].isin(groups_to_drop)
df = df.loc[keep_row].reset_index(drop=True)


In [42]:
df = df.drop(columns=['group'])

In [43]:
df=df.drop(columns=['Outcome'])
df=df.rename(columns={'Outcome_str': 'Outcome'})

In [44]:
df.rename(columns={"Start Time": "start_time", "End Time": "end_time",
                   'Grade':'grade','Session':'session',
                   'skill_head':'skill','Outcome':'outcome'}, inplace=True)

In [45]:
# Create a statistics summary
summary = df.describe()

# Save both the data and the summary to an Excel file
with pd.ExcelWriter('student8/data_formatted_s8.xlsx') as writer:
    df.to_excel(writer, sheet_name='Data', index=False)
    summary.to_excel(writer, sheet_name='Statistics Summary')

#print("Data and statistics summary have been saved to 'combined_data_final.xlsx'.")

In [46]:
df.describe()

Unnamed: 0,start_time,end_time,At LC,L_num_completed_demo,L_num_correct_assisted,L_num_qs_first_correct,L_num_wrong_start_correct_end,L_total_qs,first_answer_time,skill_specified_time
count,5841,5841,0.0,26319.0,26319.0,26319.0,26319.0,26319.0,14422.0,14422.0
mean,2022-03-07 16:03:34.546995456,2022-03-07 16:08:59.375107328,,0.218929,0.001292,1.248224,0.505414,1.8207,27.97691,94.566981
min,2020-08-09 14:09:53,2020-08-09 14:11:24,,0.0,0.0,0.0,0.0,0.0,0.0,15.0
25%,2021-01-20 07:41:50,2021-01-20 07:45:25,,0.0,0.0,0.0,0.0,0.0,10.0,45.0
50%,2022-03-11 20:28:47,2022-03-11 20:28:57,,0.0,0.0,0.0,0.0,0.0,17.0,90.0
75%,2023-03-15 16:08:36,2023-03-15 16:12:03,,0.0,0.0,2.0,1.0,3.0,32.0,120.0
max,2025-03-18 20:42:47,2025-03-18 20:43:37,,5.0,2.0,23.0,14.0,31.0,1332.0,360.0
std,,,,0.606301,0.04773,2.0949,1.091925,2.933444,43.207239,59.40732


In [47]:
# Columns expected to hold only coded (non-alphabetic) values
columns_to_check = ['outcome', 'level_begin', 'level_end']


def _contains_letter(value):
    """True for a string that has at least one alphabetic character."""
    return isinstance(value, str) and any(ch.isalpha() for ch in value)


# Per column, the distinct values that still contain letters
unique_values_with_chars = {
    col: list(filter(_contains_letter, df[col].unique()))
    for col in columns_to_check
}

# Report the findings; an empty list means the column is clean
print("Columns with unique values containing characters a-z:")
for column, values in unique_values_with_chars.items():
    print(f"{column}: {values}")

Columns with unique values containing characters a-z:
outcome: []
level_begin: []
level_end: []


In [48]:
print(df.loc[df['skill_level_change'].str.contains('Level', na=False), 'skill_level_change'].unique())

[]


## Check for repeated start times

In [49]:
# How often each start_time value occurs
start_counts = df['start_time'].value_counts()

# Keep the values seen more than once and turn them into a two-column table
repeated_values = (
    start_counts[start_counts > 1]
    .rename_axis('Start Time')
    .reset_index(name='Count')
)

# Order chronologically
repeated_values['Start Time'] = pd.to_datetime(repeated_values['Start Time'])
repeated_values = repeated_values.sort_values('Start Time')

# Display
print(repeated_values)


Empty DataFrame
Columns: [Start Time, Count]
Index: []


## check if there is character

In [50]:
df.isnull().sum()

start_time                       20478
end_time                         20478
Duration (min:sec)               20478
Score                            20478
At LC                            26319
Points                           20478
grade                            20478
session                              0
skill                                0
skill_level_change                   0
L_num_completed_demo                 0
L_num_correct_assisted               0
outcome                              0
L_num_qs_first_correct               0
L_num_wrong_start_correct_end        0
L_total_qs                           0
first_answer_time                11897
skill_specified_time             11897
level_begin                          0
level_end                            0
dtype: int64

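## The missing-value counts for start_time, end_time, Duration (min:sec), Score, Points and grade should all be the same; if they differ, inspect and fix the data manually.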