In [1]:
"""
Created on: Thu. 13 Oct. 2022
Updated on: Fri. 6 Jan. 2023
Author: Mélina Verger

Merge studentInfo with the total number of clicks per student.
"""

# For data manipulation
import pandas as pd

# To handle ZIP files
import zipfile

## Load data sets

In [2]:
# Open the project data archive read-only; the CSV members are read from it below.
zf = zipfile.ZipFile(file="../data/data.zip", mode="r")

In [3]:
# Read both tables straight from the archive. Each member stream is opened in a
# `with` block so that its handle is released as soon as pandas has parsed it
# (pandas does not close file objects it did not open itself). The archive `zf`
# is deliberately left open so this cell can be re-run on its own.
with zf.open("studentInfo.csv") as info_file:
    studentInfo = pd.read_csv(info_file)
with zf.open("studentVle.csv") as vle_file:
    studentVle = pd.read_csv(vle_file)

In [4]:
# Preview the student information table loaded from studentInfo.csv.
studentInfo

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...
32588,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,Fail
32589,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,Distinction
32590,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,Pass
32591,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,Withdrawn


## Aggregate click information

In [5]:
studentVle  # preview; 'sum_click' still needs to be summed over 'date' and 'id_site'

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1
...,...,...,...,...,...,...
10655275,GGG,2014J,675811,896943,269,3
10655276,GGG,2014J,675578,896943,269,1
10655277,GGG,2014J,654064,896943,269,3
10655278,GGG,2014J,654064,896939,269,1


In [6]:
# Total number of clicks per (code_module, code_presentation, id_student).
# 'sum_click' is selected *before* aggregating: summing 'id_site' (an identifier)
# and 'date' is meaningless, so there is no point computing those sums over every
# row only to drop them afterwards.
# The double brackets keep the result a DataFrame, indexed by the three group
# keys, with the single column 'sum_click'.
clicks = studentVle.groupby(["code_module", "code_presentation", "id_student"])[["sum_click"]].sum()

In [7]:
# Inspect the aggregated totals (indexed by code_module, code_presentation, id_student).
clicks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_click
code_module,code_presentation,id_student,Unnamed: 3_level_1
AAA,2013J,11391,934
AAA,2013J,28400,1435
AAA,2013J,30268,281
AAA,2013J,31604,2158
AAA,2013J,32885,1034
...,...,...,...
GGG,2014J,2640965,41
GGG,2014J,2645731,893
GGG,2014J,2648187,312
GGG,2014J,2679821,275


In [8]:
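# To check the group by (change the id_student number and verify the sum against `clicks`);
# for example, this should give 281 for student 30268 in AAA / 2013J, as in the table above:
# mask = (studentVle["code_module"] == "AAA") & (studentVle["code_presentation"] == "2013J") & (studentVle["id_student"] == 30268)
# studentVle.loc[mask, "sum_click"].sum()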

## Merge studentInfo with the aggregated clicks

In [9]:
# Attach each student's click total to their studentInfo row.
# `clicks` is indexed by the three key columns; pandas lets `on` refer to index
# level names, so no reset_index() is needed on the right-hand side.
# validate="many_to_one" makes the merge fail loudly if `clicks` ever holds more
# than one row per key, which would otherwise silently duplicate student rows.
studentClick = pd.merge(
    studentInfo,
    clicks,
    how="inner",
    on=["code_module", "code_presentation", "id_student"],
    validate="many_to_one",
)
# The inner join keeps only the studentInfo rows whose key also appears in `clicks`,
# i.e. it drops students that have no record in studentVle. That (and not duplicate
# rows in studentInfo) is why the number of rows is expected to go from 32593
# (studentInfo) to 29228 (studentClick).

In [10]:
# Arrange columns order.
# Selecting with a list of labels (instead of rebuilding the frame through
# pd.DataFrame(..., columns=...)) raises a KeyError for a misspelled or missing
# column, rather than silently creating an all-NaN column that would then be
# written to the exported CSV.
studentClick = studentClick[[
    'code_module', 'code_presentation', 'id_student', 'gender', 'region',
    'highest_education', 'imd_band', 'age_band', 'disability', 'num_of_prev_attempts',
    'studied_credits', 'sum_click', 'final_result',
]]

In [11]:
# Persist the merged table in the data directory; index=False leaves the
# positional row index out of the file.
studentClick.to_csv(path_or_buf="../data/studentClick.csv", index=False)

In [12]:
# Preview the merged, reordered table that was written to ../data/studentClick.csv.
studentClick

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,disability,num_of_prev_attempts,studied_credits,sum_click,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,N,0,240,934,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,N,0,60,1435,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,Y,0,60,281,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,N,0,60,2158,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,N,0,60,1034,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29223,GGG,2014J,2640965,F,Wales,Lower Than A Level,10-20,0-35,N,0,30,41,Fail
29224,GGG,2014J,2645731,F,East Anglian Region,Lower Than A Level,40-50%,35-55,N,0,30,893,Distinction
29225,GGG,2014J,2648187,F,South Region,A Level or Equivalent,20-30%,0-35,Y,0,30,312,Pass
29226,GGG,2014J,2679821,F,South East Region,Lower Than A Level,90-100%,35-55,N,0,30,275,Withdrawn


In [13]:
# Number of missing values in each column of the merged table.
studentClick.isna().sum(axis=0)

code_module                0
code_presentation          0
id_student                 0
gender                     0
region                     0
highest_education          0
imd_band                1054
age_band                   0
disability                 0
num_of_prev_attempts       0
studied_credits            0
sum_click                  0
final_result               0
dtype: int64