In [2]:
import psycopg2
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import statsmodels.api as sm
from statsmodels.formula.api import ols
from distfit import distfit
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
from statsmodels.tsa import forecasting
from statsmodels.tsa.stattools import adfuller, acf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, plot_predict
from pmdarima.arima.utils import ndiffs
from statsmodels.tsa.stattools import acf
from scipy.stats import bootstrap
from sklearn.model_selection import train_test_split

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
df1 = pd.read_csv('grades_history_trimmed.csv')
df2 = pd.read_csv('grades_trimmed.csv')

df1_1 = df1.drop(['Unnamed: 0', 'id', 'oldid', 'source', 'loggeduser', 'rawscaleid', 'usermodified',
                  'finalgrade', 'overriden', 'feedback', 'feedbackformat',
                  'year_month', 'action', 'grade_cat'], axis=1)
df2_2 = df2.drop(['Unnamed: 0', 'id', 'rawscaleid', 'usermodified', 'agreegationstatus',
                  'finalgrade', 'overriden', 'feedback', 'feedbackformat',
                  'agreegationweight', 'time_created', 'grade_cat', 'timecreated'], axis=1)

df1_1 = df1_1.reindex(sorted(df1_1.columns), axis=1)
df2_2 = df2_2.reindex(sorted(df2_2.columns), axis=1)
df = pd.concat([df1_1, df2_2])

In [5]:
conn = psycopg2.connect(dbname='primuss_fhin',
                            user='thisuccess',
                            password='5N7vcG!8FD%',
                            host='194.94.97.25',
                            port='8890')

cur = conn.cursor()
cur.execute("SELECT * FROM moodle.mdl21_grade_items")

records = cur.fetchall()

cur.close()
conn.close()

headers = ['id', 'courseid', 'categoryid', 'itemname', 'itemtype', 'itemmodule', 'iteminstance', 'itemnumber',
           'iteminfo', 'idnumber', 'calculation', 'gradetype', 'grademax', 'grademin', 'scaleid',
           'outcomeid', 'gradepass', 'multfactor', 'plusfactor', 'aggregationcoef', 'sortorder', 'display',
           'decimals', 'hidden', 'locked', 'locktime', 'needsupdate',
           'timecreated', 'timemodified', 'aggregationcoef2', 'weightoverride']

grade_items = pd.DataFrame(records, columns = headers)
grade_items = grade_items.reset_index()

In [6]:
course_dict = {grade_items['id'][i]:grade_items['courseid'][i] for i in range(len(grade_items))}
df['course'] = df['itemid'].replace(course_dict)

In [7]:
from tqdm import tqdm

user_df = df[['grade', 'itemid', 'time_modified', 'timemodified', 'userid', 'year', 'course']]

user_df = user_df.sort_values('time_modified', ascending=True).groupby('userid')    

all_dataframes = []
all_dataframes_grades = []
user_df = df[['grade', 'itemid', 'time_modified', 'timemodified', 'userid', 'year', 'course']]

user_df = user_df.sort_values('time_modified', ascending=True).groupby('userid') 
rg = user_df.size().max()-1

for i in tqdm(range(rg)):
    current_df = pd.DataFrame(columns=['timemodified', 'itemid', 'userid', 'rawgrade', 'rawgrademax',
                                       'rawgrademin', 'time_modified', 'year', 'month', 'grade', 'year_norm'])    
    for x in user_df.groups:
        if len(user_df.get_group(x))>i:
            n = pd.DataFrame(user_df.get_group(x))
            a = n.iloc[[i]]
            current_df = current_df.append(a, ignore_index=True)
    all_dataframes.append(current_df)
    all_dataframes_grades.append(current_df['grade'])

100%|█████████████████████████████████████████| 137/137 [18:20<00:00,  8.03s/it]


In [8]:
def train_split_fun(df, ratio = 0.8, seed = 123, userid = "userid"):
    """
    With that function we can use the same splits of datasets and 
    keep each individual student only in one split. 
    The splitting ratio follows asymptotically the given value.
    
    Parameters
    ----------
    df : The pandas dataframe to be splited. It should include
        a variable with IDs
    ratio: The percentage of data to be added in the train spli.
        Default 0.8 (80%)
    seed: The seed for generated the random splits. Default 123.
    userid: The name of the variable who defines the users in the
        original dataframe. Default "userid".
        
    Returns
    -------
    Two DataFrame
        Train: A split of the original according to ratio.
        Test: The other 1-ratio part of dataset.
    """
    users = df[userid].value_counts().rename_axis(userid).reset_index(name='counts')
    size = ratio*sum(users['counts'])
    train_split = set()
    counts = 0 
    shuffled = users.sample(n=min(size, users[userid].nunique()), replace = False, random_state = seed)
    for _, user, count in shuffled.itertuples():
        train_split.add(user)
        counts += count
        if counts >= size:
            break
    train = df[df[userid].isin(train_split)]
    mask = df[userid].isin(train[userid])
    test = df[~mask]
    return train, test

In [9]:
def StudentSimulator3(df, size, grades='grade', userid='userid', quizid='quizid', 
                      courseid='courseid', random_state=123):
    
    """
    StudentSimulator3 simulates data based on 
    real student data. Giving a educational dataset
    can reproduce it and return simulated student attempts.

    :df: The pandas dataframe to be reproduced 
    :size: Final amount of observations 
    :grades: Name of the column which contain grades in scale 0 to 100 (Default: grade)
    :userid: Name of the column which contain users (Default: userid)
    :quizid: Name of the column which contain quizzes (Default: quizid)
    :courseid: Name of the column which contain courses (Default: courseid)
    :random_state: The seed which fixes the randomness
    :return: Returns a pd dataframe with student grades simulated from real data
    with real properties.
    """
    
    if isinstance(df, pd.DataFrame):
        #Setting the seed
        if random_state != None:
            np.random.seed(random_state)

        #Creating the output dataframe
        newdf = pd.DataFrame()

        #Gather the unique users
        users = df['userid'].unique()

        #Simulate students and reindexing
        ids = np.array(range(1, size+1))
        newuserid = []
        counter = 0

        while len(newdf) <= size:
            counter += 1
            user = int(np.random.choice(users, 1))
            t = df[userid]==user
            newdf = newdf.append(df[t]) 
            newuserid.append([counter]*t.sum())

        newuserid = [item for sublist in newuserid for item in sublist]
        newdf[userid] = newuserid

        newdf = newdf.reset_index()

        #Random remove the excess amount of observations 
        if len(newdf) > size:
            remove_n = len(newdf) - size
            drop_indices = np.random.choice(newdf.index, remove_n, replace=False)
            newdf1 = newdf.drop(drop_indices)

        #Adding random noise to real data and fixing the final output
        newdf1[grades] = newdf1[grades] + np.random.normal(0, 3, size)

        newdf1[grades].where(newdf1[grades]<100, 100, inplace=True)
        newdf1[grades].where(newdf1[grades]>=0, 0, inplace=True)    

        newdf1[grades] = newdf1[grades].round(2)
        newdf1 = newdf1.reset_index().drop(['level_0', 'index'], axis=1)

        #Reindexing items and courses for anonymity 
        if quizid in df.columns:
            df_n = newdf1.sort_values(quizid, inplace=True)
            df_n = newdf1.reset_index()
            item = df_n[quizid].to_list()
            from itertools import accumulate
            indexes  = range(len(item))
            byGroup  = accumulate(indexes,lambda i,u: (i+1)*(u>0 and item[u-1]==item[u]))
            indexes  = [i-1 for i in accumulate(int(g==0) for g in byGroup)]
            indexAndItem = [(i,u) for i,u in zip(indexes,item)]
            new_item = pd.DataFrame([(i,u) for i,u in zip(indexes,item)], columns=['new_item', 'old_item'])

            df_n[quizid] = new_item['new_item']
            df_n = df_n.drop('index', axis=1)

        if courseid in df.columns:    
            df_n1 = df_n.sort_values(courseid, inplace=True)
            df_n1 = df_n.reset_index()
            course = df_n[courseid].to_list()
            from itertools import accumulate
            indexes  = range(len(course))
            byGroup  = accumulate(indexes,lambda i,u: (i+1)*(u>0 and course[u-1]==course[u]))
            indexes  = [i-1 for i in accumulate(int(g==0) for g in byGroup)]
            indexAndcourse = [(i,u) for i,u in zip(indexes,course)]
            new_course = pd.DataFrame([(i,u) for i,u in zip(indexes,course)], columns=['new_course', 'old_course'])

            df_n1[courseid] = new_course['new_course']
            df_n1 = df_n1.drop('index', axis=1)

        newdf = df_n1.sample(frac=1).reset_index(drop=True)
        newdf = newdf[[userid, courseid, quizid, grades, 'time_modified']]
        return newdf
    
    else:
        raise TypeError('The imported object is not a pandas.DataFrame. Please import a pandas.DataFrame type.')            

In [10]:
a = StudentSimulator3(df=df, size=200000, grades='grade', userid='userid', quizid='itemid', courseid='course', random_state=123)

In [11]:
# Applying the method
check_nan = a['userid'].isnull().values.any()

# printing the result
print(check_nan)

False


In [None]:
#user_df = x[["id", "userid", "quizid", "attempts", "courseid", "time_used", "grade"]]
all_dataframes = []
user_df = a.groupby('userid')    
 
rg = user_df.size().max()-1

for i in tqdm(range(rg)):
    current_df = pd.DataFrame(columns=['id', 'userid', 'quizid', 'attempts', 'courseid', 'time_used', 'grade'])    
    for x in user_df.groups:
        if len(user_df.get_group(x))>i:
            n = pd.DataFrame(user_df.get_group(x))
            a = n.iloc[[i]]
            current_df = current_df.append(a, ignore_index=True)
    all_dataframes.append(current_df)


  0%|                                                   | 0/137 [00:00<?, ?it/s]

In [None]:
import statsmodels.stats.api as sms
avg = []
std = []
ci_l = []
ci_u = []
for i in range(len(all_dataframes)):
    avg.append(all_dataframes[i]['grade'].mean())
    std.append(all_dataframes[i]['grade'].std())
    ci_l.append(sms.DescrStatsW(all_dataframes[i]['grade']).tconfint_mean()[0])
    ci_u.append(sms.DescrStatsW(all_dataframes[i]['grade']).tconfint_mean()[1])
    stat = pd.DataFrame({'avg_grades':avg, 'std_grades':std, 'ci_low':ci_l, 'ci_up':ci_u})

In [None]:
stat['ci_low'].where(stat['ci_low'] >= 0, 0, inplace=True)
stat['ci_up'].where(stat['ci_up'] < 100, 100, inplace=True)

In [None]:
plt.figure(figsize=(32,10), dpi=400)
plt.xlim(0,250)
plt.plot(stat.index, stat['avg_grades'], color='blue', lw=2)
plt.plot(stat.index, stat['ci_low'], color='blue', linestyle="--", label='_nolegend_')
plt.plot(stat.index, stat['ci_up'], color='blue', linestyle="--", label='_nolegend_')
plt.fill_between(stat.index, stat['ci_low'], stat['ci_up'], color='dimgrey', alpha=0.1, animated=True)
plt.gca().set(title='Performance of students per attempt', xlabel='Attempt', ylabel='Average Grades')
plt.axhline(y=stat['avg_grades'].mean(), color='gold', lw=2)
plt.xticks(np.arange(0,250,5))
plt.legend(['Average grade of attempt', 'Confidence interval', 'Average grade of all attempts'], loc=2)
plt.show()