In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from am4chart import *

In [2]:
def get_loc(df,ix,default=0,cols=None):
    """Label-based lookup that falls back to ``default`` when the label is absent.

    Parameters
    ----------
    df : pandas object supporting ``.loc``
        The frame (or series) to look into.
    ix : label or tuple of labels
        Key passed to ``df.loc``; a tuple addresses a MultiIndex entry.
    default : any, optional
        Value returned when the lookup fails (0 by default).
    cols : label or list of labels, optional
        When truthy, only this column / these columns of the selected row(s)
        are returned.

    Returns
    -------
    The selected row(s), the requested column value(s), or ``default``.
    """
    try:
        selected = df.loc[ix]
        return selected[cols] if cols else selected
    except (KeyError, TypeError):
        # A missing label raises KeyError in current pandas; TypeError is kept
        # because the original code relied on it.
        return default

def normalizePer(df,col,sum_col,group_col,copy_of = None,new_name=None):
    """Express a column as a percentage of a per-group total, in place.

    For every row the result is ``100 * df[copy_of] / T``, where ``T`` is the
    sum of ``sum_col`` over all rows sharing the row's ``group_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; a column is added or overwritten.
    col : str
        Default name for both the source and the destination column.
    sum_col : str
        Column whose per-group sum is the denominator.
    group_col : str
        Column defining the groups.
    copy_of : str, optional
        Source column holding the numerators; defaults to ``col``.
    new_name : str, optional
        Destination column; defaults to ``col``.

    Returns
    -------
    None
    """
    if not new_name:
        new_name = col
    if not copy_of:
        copy_of  = col

    # Broadcast each group's total back onto the rows of *df* itself. The
    # previous implementation built its row mask from the global `df_months`,
    # which was only correct when that frame happened to share df's index.
    group_totals = df.groupby(group_col)[sum_col].transform('sum')

    df[new_name] = df[copy_of].divide(group_totals/100)

In [3]:
# Remove pandas' display truncation: show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [4]:
# Relative directory of the CSV files read below. The trailing slash matters:
# file names are appended to it directly inside f-strings.
data_folder = "../data/"

In [5]:
# Chart factory shared by all plotting cells (`am4.ColumnChart()`).
# `Amchart` is presumably provided by the `from am4chart import *` star-import.
am4 = Amchart()

# INFO:
## Raw information:
 - Total activities: 33,728
 - Total users: 1,052
 - Start year: 2013 (104 users)
     - Following years: 2014: 113, 2015: 105, 2016: 103, 2017: 81, 2018: 144, 2019: 155

All the data in this notebook come from merging two databases in Ticino. The merge is not perfect yet: it still contains some redundancy (to be fixed in the future).

[months](http://chili.ineeda.space/data/months.htm),
[months_supervisors](http://chili.ineeda.space/data/months_supervisors.htm),
[users](http://chili.ineeda.space/data/users.htm),
[activities](http://chili.ineeda.space/data/activities.htm)

[other notebook](http://chili.ineeda.space/notebooks/preprocess_pipeline.html)

# Data Loading and Cleaning

In [49]:
# Monthly aggregates: missing values become 0 and rows are ordered by
# month, then by school year.
df_months = (
    pd.read_csv(f"{data_folder}months.csv")
    .fillna(0)
    .sort_values(by=['month', 'activity_school_year'])
)
df_months.head()

Unnamed: 0,month,activity_school_year,n_users_per_year,n_logins,n_activities,n_recipes,n_experiences,avg_n_user_activities,avg_n_user_recipes,avg_n_user_experiences,n_files,n_files_recipes,n_files_experiences,avg_n_files,avg_n_files_recipes,avg_n_files_experiences,std_n_files,std_n_files_recipes,std_n_files_experiences,n_feedback_requests,n_feedback_responses,n_feedback_requests_recipes,n_feedback_responses_recipes,n_feedback_requests_experiences,n_feedback_responses_experiences,n_in_curriculum,n_in_curriculum_recipes,n_in_curriculum_experiences,n_in_curriculum_insert_date,n_in_curriculum_insert_date_recipes,n_in_curriculum_insert_date_experiences,avg_activity_total_length,std_activity_total_length,avg_len_descriptions,std_len_descriptions,avg_len_steps,std_len_steps,avg_len_observations,std_len_observations,avg_sum_len_reflections,std_avg_sum_len_reflections,avg_avg_len_reflections,std_avg_len_reflections,avg_len_bilancio,std_len_bilancio,avg_len_competenze,std_len_competenze,avg_len_miglioramenti,std_len_miglioramenti,avg_len_critici,std_len_critici,total_reflections,total_null_reflections,n_edits,perc_total_feedback_requests,perc_total_feedback_requests_recipes,perc_total_feedback_requests_experiences,perc_feedback_responses,perc_feedback_responses_recipes,perc_feedback_responses_experiences,perc_in_curriculum,perc_recipes_in_curriculum,perc_experiences_in_curriculum,perc_in_curriculum_insert_date,perc_recipes_in_curriculum_insert_date,perc_experiences_in_curriculum_insert_date
0,1,1,547,2512,1259.0,947.0,312.0,4.43,3.33,1.1,3437.0,3061.0,376.0,9.0,9.0,1.0,5.5266,6.0612,2.2693,292.0,0.0,225.0,0.0,67.0,0.0,1174.0,909.0,265.0,222.0,169.0,53.0,114.0,88.88,9.0,18.89,96.0,71.9,8.0,15.71,5.59,7.35,3.42,6.15,5.61,10.24,7.91,11.18,4.55,8.32,4.28,9.71,427.0,819.0,8964.0,23.19,23.76,21.47,0.0,0.0,0.0,93.25,72.2,21.05,17.63,13.42,4.21
1,1,2,376,2012,1093.0,367.0,726.0,5.66,1.9,3.76,1976.0,1379.0,597.0,9.0,6.0,2.0,4.5558,5.4339,4.3932,96.0,0.0,62.0,0.0,34.0,0.0,1023.0,323.0,700.0,74.0,51.0,23.0,149.0,138.8,11.0,32.23,129.0,118.4,6.0,15.28,5.93,8.44,4.42,7.2,3.43,8.46,11.18,17.22,3.66,9.07,5.46,11.25,289.0,795.0,6386.0,8.78,16.89,4.68,0.0,0.0,0.0,93.6,29.55,64.04,6.77,4.67,2.1
2,1,3,225,1333,805.0,328.0,477.0,7.06,2.88,4.18,596.0,500.0,96.0,11.0,10.0,1.0,7.6447,8.4491,2.829,43.0,0.0,23.0,0.0,20.0,0.0,682.0,240.0,442.0,1.0,0.0,1.0,145.0,143.14,18.0,43.84,134.0,138.5,3.0,15.33,7.39,8.24,3.87,4.32,8.88,22.76,11.68,10.23,4.95,7.09,4.06,10.96,106.0,690.0,4169.0,5.34,7.01,4.19,0.0,0.0,0.0,84.72,29.81,54.91,0.12,0.0,0.12
3,2,1,547,3019,1139.0,728.0,411.0,3.64,2.33,1.31,3843.0,3483.0,360.0,11.0,10.0,1.0,6.7857,7.2533,2.2828,249.0,0.0,181.0,0.0,68.0,0.0,1006.0,663.0,343.0,197.0,148.0,49.0,124.0,115.75,9.0,23.87,107.0,97.56,8.0,16.05,6.65,8.11,3.92,6.68,6.18,11.1,10.05,13.65,5.27,9.06,5.09,11.09,396.0,734.0,7554.0,21.86,24.86,16.55,0.0,0.0,0.0,88.32,58.21,30.11,17.3,12.99,4.3
4,2,2,376,2025,1234.0,334.0,900.0,6.63,1.8,4.84,2134.0,1285.0,849.0,10.0,7.0,3.0,7.8019,8.3508,5.719,87.0,0.0,37.0,0.0,50.0,0.0,1158.0,299.0,859.0,145.0,36.0,109.0,140.0,135.08,7.0,32.46,128.0,119.78,5.0,13.92,4.14,6.43,3.33,5.87,3.77,7.43,6.68,14.41,3.26,6.54,2.86,7.16,239.0,984.0,7422.0,7.05,11.08,5.56,0.0,0.0,0.0,93.84,24.23,69.61,11.75,2.92,8.83


In [7]:
#month_map={1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
#df_months.replace({'month':month_map},inplace=True)

In [50]:
# Per-user rates: each monthly count is divided by the row's `n_users_per_year`.
y_users = df_months['n_users_per_year']
for _norm_col, _count_col in (
    ('norm_avg_n_user_recipes', 'n_recipes'),
    ('norm_avg_n_user_experiences', 'n_experiences'),
    ('norm_avg_n_activities', 'n_activities'),
):
    df_months[_norm_col] = df_months[_count_col] / y_users

# Data Exploring

## Apprentices

### Number of activities

In [9]:
# Raw `n_activities` per month, one series per school year.
column = am4.ColumnChart()
column.setTitle("Total number of activities per month")
column.fromDataFrame(
    df_months,
    "month",
    "n_activities",
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# `norm_avg_n_activities` (n_activities / n_users_per_year) per month,
# one series per school year.
column = am4.ColumnChart()
column.setTitle("[bold]Average[/] number of activities per all the users per month")
column.fromDataFrame(
    df_months,
    "month",
    "norm_avg_n_activities",
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Per-user recipes and experiences plotted together, split by school year.
column = am4.ColumnChart()
column.setTitle("Average number of [bold]activities[/] per user per month")
column.fromDataFrame(
    df_months,
    "month",
    ["norm_avg_n_user_recipes", "norm_avg_n_user_experiences"],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

----------

### Files

In [12]:
# Files per activity: both file counts are scaled by the row's `n_activities`.
df_months['norm_n_files_recipes'] = (
    df_months['n_files_recipes'] / df_months['n_activities']
)
df_months['norm_n_files_experiences'] = (
    df_months['n_files_experiences'] / df_months['n_activities']
)

column = am4.ColumnChart()
column.setTitle("[bold]Normalized[/] number of activities's files per month")
column.fromDataFrame(
    df_months,
    "month",
    ['norm_n_files_recipes', 'norm_n_files_experiences'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

And the averages grouped by school year:

In [13]:
# Mean of the monthly file statistics within each school year.
df_months.groupby('activity_school_year')[
    ['n_files', 'avg_n_files', 'std_n_files']
].mean()

Unnamed: 0_level_0,n_files,avg_n_files,std_n_files
activity_school_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2752.416667,10.25,6.48685
2,1946.666667,10.583333,7.176258
3,799.833333,9.416667,5.447183


### Feedbacks and Curriculum

In [17]:
# Feedback-request percentages for recipes and experiences,
# split by school year; the legend is hidden.
column = am4.ColumnChart()
column.setTitle("% of activities with requests for feedback per month")
column.showLegend(False)
column.fromDataFrame(
    df_months,
    "month",
    [
        'perc_total_feedback_requests_recipes',
        'perc_total_feedback_requests_experiences',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# Re-express the request counts as a percentage of each school year's total
# `n_feedback_requests`. Note: this overwrites the two
# `perc_total_feedback_requests_*` columns that were loaded from the CSV.
normalizePer(
    df_months,
    'perc_total_feedback_requests_recipes',
    'n_feedback_requests',
    'activity_school_year',
    copy_of='n_feedback_requests_recipes',
)
normalizePer(
    df_months,
    'perc_total_feedback_requests_experiences',
    'n_feedback_requests',
    'activity_school_year',
    copy_of='n_feedback_requests_experiences',
)

In [20]:
# Same two columns as the previous feedback chart, plotted again after the
# normalizePer cell rewrote them as shares of the yearly request total.
column = am4.ColumnChart()
column.setTitle("% of feedback requests over the school year")
column.showLegend(False)
column.fromDataFrame(
    df_months,
    "month",
    [
        'perc_total_feedback_requests_recipes',
        'perc_total_feedback_requests_experiences',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TODO: try with request date

In [21]:
# Only the recipes series (`perc_recipes_in_curriculum`) is plotted here.
column = am4.ColumnChart()
column.setTitle("% of activities in curriculum per month")
column.fromDataFrame(
    df_months,
    "month",
    ['perc_recipes_in_curriculum'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TODO: carefully double-check the `n_activities` query because of the "final".
TODO: see whether the two should be multiplied

In [22]:
# New `norm_perc_*_in_curriculum` columns: in-curriculum counts as a percentage
# of each school year's total `n_activities`.
normalizePer(
    df_months,
    'norm_perc_recipes_in_curriculum',
    'n_activities',
    'activity_school_year',
    copy_of='n_in_curriculum_recipes',
)
normalizePer(
    df_months,
    'norm_perc_experiences_in_curriculum',
    'n_activities',
    'activity_school_year',
    copy_of='n_in_curriculum_experiences',
)

In [24]:
# Plot the two normalised in-curriculum columns, split by school year.
column = am4.ColumnChart()
column.setTitle("[bold]Normalized[/] number of activities in curriculum per month")
column.showLegend(False)
column.fromDataFrame(
    df_months,
    "month",
    [
        'norm_perc_recipes_in_curriculum',
        'norm_perc_experiences_in_curriculum',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Lengths
(total length: description + steps + observations)

In [25]:
# `avg_activity_total_length` per month, one series per school year.
column = am4.ColumnChart()
column.setTitle("Activity total length per month")
column.fromDataFrame(
    df_months,
    "month",
    ['avg_activity_total_length'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Breakdown of the length into descriptions, steps and observations
# (the title is the same as the previous chart's).
column = am4.ColumnChart()
column.setTitle("Activity total length per month")
column.fromDataFrame(
    df_months,
    "month",
    ['avg_len_descriptions', 'avg_len_steps', 'avg_len_observations'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# Mean of the monthly length statistics within each school year.
df_months.groupby('activity_school_year')[
    ['avg_activity_total_length', 'std_activity_total_length']
].mean()

Unnamed: 0_level_0,avg_activity_total_length,std_activity_total_length
activity_school_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1,123.916667,115.52
2,146.416667,149.583333
3,137.833333,152.120833


Note: the standard deviation is very high because of NULL descriptions. TODO: recompute without the NULL descriptions.

In [30]:
# `avg_sum_len_reflections` per month, one series per school year.
column = am4.ColumnChart()
column.setTitle("Average reflections total length per month")
column.fromDataFrame(
    df_months,
    "month",
    ['avg_sum_len_reflections'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# The four `avg_len_*` reflection columns (bilancio, competenze,
# miglioramenti, critici) plotted together.
column = am4.ColumnChart()
column.setTitle("Total length of average reflections per month")
column.fromDataFrame(
    df_months,
    "month",
    [
        'avg_len_bilancio',
        'avg_len_competenze',
        'avg_len_miglioramenti',
        'avg_len_critici',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Edits

In [34]:
# `n_edits` per month, one series per school year.
column = am4.ColumnChart()
column.setTitle("Edits per months")
column.fromDataFrame(
    df_months,
    "month",
    'n_edits',
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Logins

In [35]:
# Apprentice login events, without the user-type / start-date columns that the
# login analysis does not use.
df_students_logins = pd.read_csv(f"{data_folder}students_logins.csv")\
                        .drop(['ut_user_type','start_semester','start_year'],axis=1)
# Parse the timestamps with pd.to_datetime: casting with the unit-less
# 'datetime64' dtype is rejected by pandas 2.x.
df_students_logins['date'] = pd.to_datetime(df_students_logins['date'])
df_students_logins.head()

Unnamed: 0,us_user,date,user_school_year,month,dayofweek,hour,minute
0,20,2013-11-25 15:30:13,1,11,2,15,30
1,20,2013-12-19 14:11:06,1,12,5,14,11
2,20,2014-01-08 18:50:53,1,1,4,18,50
3,20,2014-01-08 20:14:51,1,1,4,20,14
4,20,2014-01-14 10:56:35,1,1,3,10,56


In [37]:
# Count logins per (day of week, hour) slot. `dt.dayofweek` numbers Monday as 0.
student_login_dates = df_students_logins['date']
student_logins_per_slot = student_login_dates.groupby(
    [student_login_dates.dt.dayofweek, student_login_dates.dt.hour]
).count()

# Expand to the full 7 x 24 grid so that slots without any login appear with a
# count of 0, instead of looking each slot up individually (a missing slot
# raises KeyError in current pandas).
all_slots = pd.MultiIndex.from_product(
    [range(7), range(24)], names=['dayofweek', 'hour']
)
date_hist = (
    student_logins_per_slot
    .reindex(all_slots, fill_value=0)
    .rename('count')
    .reset_index()
)

In [41]:
# Login counts per day of week, with one series per hour; legend hidden.
column = am4.ColumnChart()
column.setTitle("Apprentices logins")
column.showLegend(False)
column.fromDataFrame(
    date_hist,
    "dayofweek",
    'count',
    hue="hour",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

--------------
## Supervisors

In [45]:
# Supervisors' monthly aggregates, prepared like `df_months`: missing values
# become 0 and rows are ordered by month, then by school year.
df_months_supervisors = (
    pd.read_csv(f"{data_folder}months_supervisors.csv")
    .fillna(0)
    .sort_values(by=['month', 'activity_school_year'])
)
df_months_supervisors.head()

Unnamed: 0,month,activity_school_year,n_users_per_year,n_logins,n_activities,n_recipes,n_experiences,avg_n_user_activities,avg_n_user_recipes,avg_n_user_experiences,n_files,n_files_recipes,n_files_experiences,avg_n_files,avg_n_files_recipes,avg_n_files_experiences,std_n_files,std_n_files_recipes,std_n_files_experiences,n_feedback_requests,n_feedback_responses,n_feedback_requests_recipes,n_feedback_responses_recipes,n_feedback_requests_experiences,n_feedback_responses_experiences,n_in_curriculum,n_in_curriculum_recipes,n_in_curriculum_experiences,n_in_curriculum_insert_date,n_in_curriculum_insert_date_recipes,n_in_curriculum_insert_date_experiences,avg_activity_total_length,std_activity_total_length,avg_len_descriptions,std_len_descriptions,avg_len_steps,std_len_steps,avg_len_observations,std_len_observations,avg_sum_len_reflections,std_avg_sum_len_reflections,avg_avg_len_reflections,std_avg_len_reflections,avg_len_bilancio,std_len_bilancio,avg_len_competenze,std_len_competenze,avg_len_miglioramenti,std_len_miglioramenti,avg_len_critici,std_len_critici,total_reflections,total_null_reflections,n_edits,perc_total_feedback_requests,perc_total_feedback_requests_recipes,perc_total_feedback_requests_experiences,perc_feedback_responses,perc_feedback_responses_recipes,perc_feedback_responses_experiences,perc_in_curriculum,perc_recipes_in_curriculum,perc_experiences_in_curriculum,perc_in_curriculum_insert_date,perc_recipes_in_curriculum_insert_date,perc_experiences_in_curriculum_insert_date
0,1,1,227,259,1259.0,947.0,312.0,4.43,3.33,1.1,3437.0,3061.0,376.0,9.0,9.0,1.0,5.5266,6.0612,2.2693,292.0,217.0,225.0,165.0,67.0,52.0,1174.0,909.0,265.0,222.0,169.0,53.0,16.0,0.0,1.0,0.0,14.0,0.0,1.0,0.0,4.84,7.3,2.66,5.19,4.38,9.77,7.55,14.28,4.01,7.87,3.44,8.69,0.0,1.0,8964.0,23.19,23.76,21.47,17.24,17.42,16.67,93.25,72.2,21.05,17.63,13.42,4.21
1,1,2,101,113,1093.0,367.0,726.0,5.66,1.9,3.76,1976.0,1379.0,597.0,9.0,6.0,2.0,4.5558,5.4339,4.3932,96.0,58.0,62.0,44.0,34.0,14.0,1023.0,323.0,700.0,74.0,51.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.36,4.59,1.75,3.33,3.02,5.85,4.77,7.25,3.31,9.37,2.32,6.82,0.0,0.0,6386.0,8.78,16.89,4.68,5.31,11.99,1.93,93.6,29.55,64.04,6.77,4.67,2.1
2,1,3,42,26,805.0,328.0,477.0,7.06,2.88,4.18,596.0,500.0,96.0,11.0,10.0,1.0,7.6447,8.4491,2.829,43.0,20.0,23.0,11.0,20.0,9.0,682.0,240.0,442.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.47,6.76,2.38,2.19,5.03,8.99,5.69,9.13,7.17,7.63,4.0,11.33,0.0,0.0,4169.0,5.34,7.01,4.19,2.48,3.35,1.89,84.72,29.81,54.91,0.12,0.0,0.12
3,2,1,227,216,1139.0,728.0,411.0,3.64,2.33,1.31,3843.0,3483.0,360.0,11.0,10.0,1.0,6.7857,7.2533,2.2828,249.0,134.0,181.0,116.0,68.0,18.0,1006.0,663.0,343.0,197.0,148.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.34,7.46,3.21,6.42,4.48,8.98,7.74,15.31,3.86,7.74,5.26,13.99,0.0,0.0,7554.0,21.86,24.86,16.55,11.76,15.93,4.38,88.32,58.21,30.11,17.3,12.99,4.3
4,2,2,101,80,1234.0,334.0,900.0,6.63,1.8,4.84,2134.0,1285.0,849.0,10.0,7.0,3.0,7.8019,8.3508,5.719,87.0,53.0,37.0,26.0,50.0,27.0,1158.0,299.0,859.0,145.0,36.0,109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.34,4.63,2.69,3.23,4.3,6.86,6.25,8.99,2.92,5.45,3.9,10.21,0.0,0.0,7422.0,7.05,11.08,5.56,4.29,7.78,3.0,93.84,24.23,69.61,11.75,2.92,8.83


### Feedbacks

In [46]:
# New `perc_total_feedback_responses_*` columns: response counts as a
# percentage of each school year's total `n_feedback_responses`.
normalizePer(
    df_months_supervisors,
    'perc_total_feedback_responses_recipes',
    'n_feedback_responses',
    'activity_school_year',
    copy_of='n_feedback_responses_recipes',
)
normalizePer(
    df_months_supervisors,
    'perc_total_feedback_responses_experiences',
    'n_feedback_responses',
    'activity_school_year',
    copy_of='n_feedback_responses_experiences',
)

In [48]:
# Plot the two normalised response columns, split by school year.
column = am4.ColumnChart()
column.setTitle("% of feedback responses over the school year")
column.showLegend(False)
column.fromDataFrame(
    df_months_supervisors,
    "month",
    [
        'perc_total_feedback_responses_recipes',
        'perc_total_feedback_responses_experiences',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
# Responses (supervisors frame) divided by requests (apprentices frame).
# NOTE(review): the two series are matched on their row index, so this assumes
# both CSVs list the same (month, school year) rows under the same index - confirm.
_response_ratio = (
    df_months_supervisors['n_feedback_responses'] / df_months['n_feedback_requests']
)
# 0/0 gives NaN, but a non-zero count divided by 0 gives +/-inf, which
# fillna(0) alone would leave in place; map both to 0.
df_months_supervisors['ration_response'] = (
    _response_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
)

In [57]:
# `ration_response` per month, one series per school year; legend hidden.
column = am4.ColumnChart()
column.setTitle("Ratio responses/requests")
column.showLegend(False)
column.fromDataFrame(
    df_months_supervisors,
    "month",
    ['ration_response'],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [58]:
# The four `avg_len_*` columns of the supervisors frame plotted together.
column = am4.ColumnChart()
column.setTitle("Total length of average feedbacks per month")
column.fromDataFrame(
    df_months_supervisors,
    "month",
    [
        'avg_len_bilancio',
        'avg_len_competenze',
        'avg_len_miglioramenti',
        'avg_len_critici',
    ],
    hue="activity_school_year",
    hue_prefix="year ",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Logins

In [61]:
# Supervisor login events, without the user-type / start-date columns that the
# login analysis does not use.
df_supervisors_logins = pd.read_csv(f"{data_folder}supervisors_logins.csv")\
                        .drop(['ut_user_type','start_semester','start_year'],axis=1)
# Parse the timestamps with pd.to_datetime: casting with the unit-less
# 'datetime64' dtype is rejected by pandas 2.x.
df_supervisors_logins['date'] = pd.to_datetime(df_supervisors_logins['date'])

# Count logins per (day of week, hour) slot. `dt.dayofweek` numbers Monday as 0.
supervisor_login_dates = df_supervisors_logins['date']
supervisor_logins_per_slot = supervisor_login_dates.groupby(
    [supervisor_login_dates.dt.dayofweek, supervisor_login_dates.dt.hour]
).count()

# Expand to the full 7 x 24 grid so that slots without any login appear with a
# count of 0, instead of looking each slot up individually (a missing slot
# raises KeyError in current pandas). This rebinds `date_hist`.
all_slots = pd.MultiIndex.from_product(
    [range(7), range(24)], names=['dayofweek', 'hour']
)
date_hist = (
    supervisor_logins_per_slot
    .reindex(all_slots, fill_value=0)
    .rename('count')
    .reset_index()
)

In [62]:
# Login counts per day of week, with one series per hour; legend hidden.
column = am4.ColumnChart()
column.setTitle("Supervisors logins")
column.showLegend(False)
column.fromDataFrame(
    date_hist,
    "dayofweek",
    'count',
    hue="hour",
)
column.plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>