In [1]:
# Purpose:
# This program uses the actionLog file from DeepThought F20 condition3 (adaptive condition)
# Calculates:
#  the average # of steps per student for total tutor
#  the average # of training steps per student for total tutor
#  the average # of level end steps per student
#  the average # of posttest steps per student
#
# Preprocessing:
# Remove the rows with erroneous IDs: hkaakat, sukhan
# Sort the rows based on userId, and actionCount
#
# Author:
# Nazia Alam

In [2]:
#imports
import pandas as pd
import numpy as np
import re

In [3]:
#Read file for condition3: actionLog_L7_3_F20.csv
#Read only the necessary columns from the csv file
col_list = ["id","userID","courseID","actionCount","currentProblem","stepCount","stepPreState","stepPostState","actionCode","currentProblemType"]
df = pd.read_csv("D:/Courses/Fall2021/CSC890/Fall2020 work/Analysis/Dataset/Condition3/actionLog_L7_3_F20.csv", usecols=col_list)
print(df.shape)
print(df.head())


(208063, 10)
     id    userID                             courseID  actionCount  \
0  6357  wtmoore2  NCSU;CSC226;001;STALLMANN;Fall;2020            1   
1  6358  wtmoore2  NCSU;CSC226;001;STALLMANN;Fall;2020            2   
2  6359  wtmoore2  NCSU;CSC226;001;STALLMANN;Fall;2020            3   
3  6360  wtmoore2  NCSU;CSC226;001;STALLMANN;Fall;2020            4   
4  6361  wtmoore2  NCSU;CSC226;001;STALLMANN;Fall;2020            5   

   stepCount  actionCode                                       stepPreState  \
0          0           0                                                NaN   
1          0          14                                                NaN   
2          0          12  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   
3          0          12  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   
4          0          11  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   

                                       stepPostState  currentProblem  \
0            

In [4]:
# Preprocessing
# Remove the rows with erroneous IDs: hkaakat, sukhan

# Get indexes for the rows that should be dropped
indexNumbers = df.loc[(df['userID'] == 'hkaakat') | (df['userID'] =='sukhan')].index

# Delete these row indexes 
df.drop(indexNumbers , inplace=True)
print(df.shape)
#print(df)

(203178, 10)


In [5]:
# Preprocessing
# Sort the rows based on userId, and actionCount
dfNew = df.sort_values(by=['userID','actionCount'])
print(dfNew.shape)
print(dfNew.head)
#dfNew.to_csv("actionLog_3_F20_PreprocessedStepAnalysis.csv")

(203178, 10)
<bound method NDFrame.head of             id   userID                             courseID  actionCount  \
39027   134280  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            1   
39028   134282  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            2   
39029   134286  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            3   
39030   134287  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            4   
39031   134288  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            5   
...        ...      ...                                  ...          ...   
173916  206550   yluo14  NCSU;CSC226;002;STALLMANN;Fall;2020         2035   
173917  206552   yluo14  NCSU;CSC226;002;STALLMANN;Fall;2020         2036   
173918  206553   yluo14  NCSU;CSC226;002;STALLMANN;Fall;2020         2037   
173919  206555   yluo14  NCSU;CSC226;002;STALLMANN;Fall;2020         2038   
173920  206558   yluo14  NCSU;CSC226;002;STALLMANN;Fall;2020         2039   

        stepCount  actionCode  \

In [6]:
# Create the student list 
result = pd.DataFrame(columns = ['userID', 'totalSteps'])
result['userID'] = dfNew['userID'].unique()
print(result.shape)
print(result)

(78, 2)
      userID totalSteps
0    ahagedo        NaN
1     amdaum        NaN
2   askelly4        NaN
3    asnair2        NaN
4   bjjohns6        NaN
..       ...        ...
73  wcallred        NaN
74  wtmoore2        NaN
75  ychen267        NaN
76   yliu229        NaN
77    yluo14        NaN

[78 rows x 2 columns]


In [7]:
# Find the number of steps per student
dfNew = dfNew.assign(stateChange=0)
dfNew = dfNew.reset_index()
print(dfNew.head())

   index      id   userID                             courseID  actionCount  \
0  39027  134280  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            1   
1  39028  134282  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            2   
2  39029  134286  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            3   
3  39030  134287  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            4   
4  39031  134288  ahagedo  NCSU;CSC226;001;STALLMANN;Fall;2020            5   

   stepCount  actionCode                                       stepPreState  \
0          0           0                                                NaN   
1          0          14                                                NaN   
2          0           1  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   
3          0          -1  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   
4          0          -1  (A>(B*C))[1;0;Given],(A+D)[2;0;Given],(-D*E)[3...   

                                       stepPostSta

In [8]:
# Find the number of steps per student
dfNew['stateChange'] = 0
dfNew = dfNew.astype({"stepPreState": str, "stepPostState": str})

for i in range(len(dfNew)-1):

    if((dfNew['userID'].iloc[i]==dfNew['userID'].iloc[i+1]) and (dfNew['currentProblem'].iloc[i]==dfNew['currentProblem'].iloc[i+1]) and (dfNew['currentProblemType'].iloc[i]=="PS") ):
        if(dfNew['stepPostState'].iloc[i]!=dfNew['stepPostState'].iloc[i+1]):
            if(dfNew['stepPostState'].iloc[i]=="nan" or dfNew['stepPostState'].iloc[i+1]=="nan"):
                continue
            #if(np.isnan(dfNew['stepPostState'].iloc[i])==False and np.isnan(dfNew['stepPostState'].iloc[i+1])==False):
            else:    
                dfNew['stateChange'].iloc[i] = 1
                     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [9]:

dfNew.to_csv("actionLog_3_F20_PreprocessedStepAnalysis.csv")

In [10]:
# Find the number of steps per student
result['totalSteps'] = (dfNew.groupby('userID')['stateChange'].sum()).values
print(result.head())

     userID  totalSteps
0   ahagedo         363
1    amdaum         415
2  askelly4         344
3   asnair2         336
4  bjjohns6         270


In [11]:
# Find the number of training steps per student
dfNew['trainingSteps'] = np.where(((dfNew['currentProblem']!=2.8) & (dfNew['currentProblem']!=3.8) & (dfNew['currentProblem']!=4.8) & (dfNew['currentProblem']!=5.8) & (dfNew['currentProblem']!=6.8) & (dfNew['currentProblem']<7) & (dfNew['currentProblem']>=2) ),  dfNew['stateChange'],0)
result['trainingSteps'] = (dfNew.groupby('userID')['trainingSteps'].sum()).values
print(result.head())

     userID  totalSteps  trainingSteps
0   ahagedo         363            107
1    amdaum         415            119
2  askelly4         344            135
3   asnair2         336            115
4  bjjohns6         270            114


In [12]:
# Find the number of level end steps per student
dfNew['levelEndSteps'] = np.where( ( (dfNew['currentProblem']==2.8) | (dfNew['currentProblem']==3.8) | (dfNew['currentProblem']==4.8) | (dfNew['currentProblem']==5.8) | (dfNew['currentProblem']==6.8) ),  dfNew['stateChange'],0)
result['levelEndSteps'] = (dfNew.groupby('userID')['levelEndSteps'].sum()).values
print(result.head())

     userID  totalSteps  trainingSteps  levelEndSteps
0   ahagedo         363            107            138
1    amdaum         415            119            196
2  askelly4         344            135            126
3   asnair2         336            115             66
4  bjjohns6         270            114             68


In [13]:
# Find the number of posttest steps per student
dfNew['posttestSteps'] = np.where( (dfNew['currentProblem']>=7 ),  dfNew['stateChange'],0)
result['posttestSteps'] = (dfNew.groupby('userID')['posttestSteps'].sum()).values
print(result.head())

     userID  totalSteps  trainingSteps  levelEndSteps  posttestSteps
0   ahagedo         363            107            138             56
1    amdaum         415            119            196             73
2  askelly4         344            135            126             70
3   asnair2         336            115             66            130
4  bjjohns6         270            114             68             77


In [14]:
# Find the number of steps per student in transfer task
dfNew['posttestStepsT'] = np.where( (dfNew['currentProblem']==7.6 ),  dfNew['stateChange'],0)
result['posttestStepsT'] = (dfNew.groupby('userID')['posttestStepsT'].sum()).values
print(result.head())

     userID  totalSteps  trainingSteps  levelEndSteps  posttestSteps  \
0   ahagedo         363            107            138             56   
1    amdaum         415            119            196             73   
2  askelly4         344            135            126             70   
3   asnair2         336            115             66            130   
4  bjjohns6         270            114             68             77   

   posttestStepsT  
0              24  
1              28  
2              10  
3              55  
4              31  


In [15]:
result.to_csv("Result/actionLog_3_F20_ResultStepAnalysis.csv")

In [19]:
# Find average # of steps per student
print("total steps")
avgTotalSteps = np.mean(result['totalSteps'])
print(avgTotalSteps)
SDTotalSteps = np.std(result['totalSteps'])
print(SDTotalSteps)

# Find average # of training steps per student
print("training steps")
avgTrainingSteps = np.mean(result['trainingSteps'])
print(avgTrainingSteps)
SDTrainingSteps = np.std(result['trainingSteps'])
print(SDTrainingSteps)

# Find average # of level end steps per student
print("level end steps")
avgLevelEndSteps = np.mean(result['levelEndSteps'])
print(avgLevelEndSteps)
SDLevelEndSteps = np.std(result['levelEndSteps'])
print(SDLevelEndSteps)

# Find average # of posttest steps per student
print("posttest steps")
avgPosttestSteps = np.mean(result['posttestSteps'])
print(avgPosttestSteps)
SDPosttestSteps = np.std(result['posttestSteps'])
print(SDPosttestSteps)


# Find average # of total posttest steps per student
print("total posttest steps")
avgTotalPosttestSteps = np.mean(result['posttestSteps']+result['levelEndSteps'])
print(avgTotalPosttestSteps)
SDTotalPosttestSteps = np.std(result['posttestSteps']+result['levelEndSteps'])
print(SDTotalPosttestSteps)

# Find average # of steps per student in transfer task
print("transfer task")
avgPosttestStepsT = np.mean(result['posttestStepsT'])
print(avgPosttestStepsT)
SDPosttestStepsT = np.std(result['posttestStepsT'])
print(SDPosttestStepsT)

total steps
354.7435897435897
105.60308317874687
training steps
122.84615384615384
32.659893432974556
level end steps
104.7051282051282
58.83203143203081
posttest steps
94.47435897435898
41.875188283992884
total posttest steps
199.17948717948718
79.6865242266643
transfer task
33.17948717948718
32.16055219808474
