In [19]:
"""
Created on: Mon. 29 Aug. 2022
Updated on: Tue. 30 Aug. 2022
Updated on: Wed. 31 Aug. 2022
Updated on: Fri. 6 Jan. 2023
Author: Mélina Verger

Merge studentInfo (stInfo) with the total number of clicks and the weighted scores.
"""

# To interact with the operating system
import os

# For data manipulation
import pandas as pd

# To handle ZIP files
import zipfile

## Load data sets

In [20]:
# Open the archive holding the CSV tables; the handle is reused by the loading cell below.
# NOTE(review): no explicit zf.close() appears in the cells shown -- confirm whether
# the handle should be released once the tables are loaded.
zf = zipfile.ZipFile("../data/data.zip") 

In [21]:
def _read_member(archive, member_name):
    """Load one CSV member of an open ZIP archive into a DataFrame.

    Parameters
    ----------
    archive : zipfile.ZipFile
        An archive that is already open for reading.
    member_name : str
        Name of the CSV file inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the member.
    """
    # The member stream is closed explicitly instead of being left to garbage
    # collection; the archive itself stays open so this cell can be re-run.
    with archive.open(member_name) as stream:
        return pd.read_csv(stream)


# Load the four tables used in this notebook from the archive.
studentInfo = _read_member(zf, "studentInfo.csv")
studentAssessment = _read_member(zf, "studentAssessment.csv")
studentVle = _read_member(zf, "studentVle.csv")
assessments = _read_member(zf, "assessments.csv")

## Merge assessments information

In [22]:
# Display studentAssessment as loaded from studentAssessment.csv
studentAssessment

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0
...,...,...,...,...,...
173907,37443,527538,227,0,60.0
173908,37443,534672,229,0,100.0
173909,37443,546286,215,0,80.0
173910,37443,546724,230,0,100.0


In [23]:
# Display assessments as loaded from assessments.csv
assessments

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0
...,...,...,...,...,...,...
201,GGG,2014J,37443,CMA,229.0,0.0
202,GGG,2014J,37435,TMA,61.0,0.0
203,GGG,2014J,37436,TMA,124.0,0.0
204,GGG,2014J,37437,TMA,173.0,0.0


In [24]:
# Join every row of studentAssessment with the matching row of assessments
# (matched on id_assessment), then remove the date_submitted, is_banked and
# date columns from the result.
studentScore = (
    studentAssessment
    .merge(assessments, how="inner", on="id_assessment")
    .drop(columns=["date_submitted", "is_banked", "date"])
)

In [25]:
# Display the result of joining studentAssessment with assessments
studentScore

Unnamed: 0,id_assessment,id_student,score,code_module,code_presentation,assessment_type,weight
0,1752,11391,78.0,AAA,2013J,TMA,10.0
1,1752,28400,70.0,AAA,2013J,TMA,10.0
2,1752,31604,72.0,AAA,2013J,TMA,10.0
3,1752,32885,69.0,AAA,2013J,TMA,10.0
4,1752,38053,79.0,AAA,2013J,TMA,10.0
...,...,...,...,...,...,...,...
173907,37443,527538,60.0,GGG,2014J,CMA,0.0
173908,37443,534672,100.0,GGG,2014J,CMA,0.0
173909,37443,546286,80.0,GGG,2014J,CMA,0.0
173910,37443,546724,100.0,GGG,2014J,CMA,0.0


## Aggregate assessments information

In [26]:
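# # NOTE: the computation below is time-consuming, so it is left commented out;
# # an already computed df.csv is available in the pre_loaded folder.
# # Estimated speed: ~206 rows per minute => ~844 min (~14 h) for 173,912 rows.
# # Actual run time: 146 min (~2.5 h), for the 25,843 iterations (out of 173,912) that satisfy the condition.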

# code_module = studentScore["code_module"].unique()
# code_presentation = studentScore["code_presentation"].unique()
# id_student = studentScore["id_student"].unique()

# studentScore["weight"].replace(0, 1, inplace=True)  # to avoid zero division

# # Create empty dataframe
# new_df = pd.DataFrame(columns=["id_student", "code_module", "code_presentation", "weighted_score"])

# i = 0
# for id_stu in id_student:
#     for code_mod in code_module:
#         for code_pres in code_presentation:
#             mini_df = studentScore.loc[(studentScore["id_student"] == id_stu) & (studentScore["code_module"] == code_mod) & (studentScore["code_presentation"] == code_pres)]
#             if len(mini_df) > 0:  # if data exists
#                 i +=1
#                 sum_weight = mini_df["weight"].sum()
#                 numerator = (mini_df["score"] * mini_df["weight"]).sum()
#                 weighted_score = numerator/sum_weight
#                 new_line = pd.DataFrame([(id_stu, code_mod, code_pres, weighted_score)], columns=["id_student", "code_module", "code_presentation", "weighted_score"])
#                 new_df = pd.concat([new_df, new_line], ignore_index=True)
#                 print("Iteration {} / 173,912".format(i), end="\r", flush=True)  # overwrite the previously displayed line
                
# new_df.to_csv("../data/df.csv", index=False)

In [27]:
# Use ../data/df.csv when it exists (i.e. the aggregation above has just been
# run and saved); otherwise fall back to the copy in the pre_loaded folder.
new_df = pd.read_csv(
    "../data/df.csv" if os.path.exists("../data/df.csv") else "../pre_loaded/df.csv"
)

In [28]:
# Display the weighted scores loaded from df.csv
new_df

Unnamed: 0,id_student,code_module,code_presentation,weighted_score
0,11391,AAA,2013J,82.400000
1,28400,AAA,2013J,65.400000
2,31604,AAA,2013J,76.300000
3,32885,AAA,2013J,55.000000
4,38053,AAA,2013J,66.900000
...,...,...,...,...
26634,697314,GGG,2014J,90.000000
26635,1734156,GGG,2014J,78.125000
26636,692171,GGG,2014J,87.500000
26637,650630,GGG,2014J,67.000000


## Aggregate click information

In [29]:
# Display studentVle as loaded from studentVle.csv
studentVle

Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1
...,...,...,...,...,...,...
10655275,GGG,2014J,675811,896943,269,3
10655276,GGG,2014J,675578,896943,269,1
10655277,GGG,2014J,654064,896943,269,3
10655278,GGG,2014J,654064,896939,269,1


In [30]:
# Total number of clicks per (code_module, code_presentation, id_student).
# "sum_click" is selected before aggregating so that id_site and date are not
# summed as well: those totals were only computed to be dropped right after.
# The result keeps the same shape as before: the three grouping keys as index
# and a single sum_click column.
studentClick = studentVle.groupby(
    ["code_module", "code_presentation", "id_student"]
)[["sum_click"]].sum()

In [31]:
# Display the click totals grouped by code_module, code_presentation and id_student
studentClick

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum_click
code_module,code_presentation,id_student,Unnamed: 3_level_1
AAA,2013J,11391,934
AAA,2013J,28400,1435
AAA,2013J,30268,281
AAA,2013J,31604,2158
AAA,2013J,32885,1034
...,...,...,...
GGG,2014J,2640965,41
GGG,2014J,2645731,893
GGG,2014J,2648187,312
GGG,2014J,2679821,275


In [32]:
# # To check the group by (change the id_student number and verify the sum)
# studentVle[studentVle["code_module"]=="AAA"][studentVle["code_presentation"]=="2013J"][studentVle["id_student"]==30268]["sum_click"].sum()

## Merge studentInfo with click and score

In [33]:
# Attach the click totals to studentInfo. With an inner join, only the
# (code_module, code_presentation, id_student) combinations present on both
# sides are kept, so the row count is expected to shrink (the author reports
# 32593 rows in studentInfo and 29228 rows in studentClick).
# NOTE(review): the original comment attributed the drop to duplicates in
# studentInfo; an inner join loses rows through unmatched keys -- confirm.
studentAll = studentInfo.merge(
    studentClick,
    how="inner",
    on=["code_module", "code_presentation", "id_student"],
)

In [34]:
# Attach the weighted scores from new_df. The inner join only keeps the
# (code_module, code_presentation, id_student) combinations found on both
# sides, so a further drop in rows is expected (the author reports 29228 rows
# before this merge and 26589 after).
#
# Any weighted_score column left over from a previous run of this cell is
# removed first. Without that, re-running the cell would merge new_df a second
# time and yield weighted_score_x / weighted_score_y instead of weighted_score.
studentAll = pd.merge(
    studentAll.drop(columns="weighted_score", errors="ignore"),
    new_df,
    how="inner",
    on=["code_module", "code_presentation", "id_student"],
)

In [35]:
# Arrange columns order.
# Plain [] selection is used instead of pd.DataFrame(studentAll, columns=[...]):
# the constructor fills a missing column with NaN without complaining, whereas
# selection raises KeyError, so a problem in the merges above is reported here
# rather than written to the output file.
studentAll = studentAll[['code_module', 'code_presentation', 'id_student', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'disability', 'num_of_prev_attempts', 
       'studied_credits', 'sum_click', 'weighted_score', 'final_result']]

In [36]:
# Save the merged table to ../data/studentAll.csv, leaving out the DataFrame index
studentAll.to_csv("../data/studentAll.csv", index=False)