In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data_folder = "../data/"

# Data loading and cleaning

In [4]:
user_info = pd.read_csv("../data/users_info.csv")
user_info.head(1)

Unnamed: 0,user_id,archived,user_type,classes,n_activities,n_recipes,n_experiences,n_reflections,n_recipe_reflections,n_experience_reflections,...,n_in_curriculum_semester2,n_in_curriculum_semester3,n_in_curriculum_semester4,n_in_curriculum_semester5,n_feedback_requests,n_received_feedback_responses,n_received_feedback_requests,n_feedback_responses,n_files,n_folders
0,86,0,formatore,,,,,,,,...,,,,,,,,,,


**Below the list of the columns and a short description**

In [5]:
print(list(user_info.columns))

['user_id', 'archived', 'user_type', 'classes', 'n_activities', 'n_recipes', 'n_experiences', 'n_reflections', 'n_recipe_reflections', 'n_experience_reflections', 'n_in_curriculum', 'n_recipes_in_curriculum', 'n_experiences_in_curriculum', 'n_in_curriculum_semester1', 'n_in_curriculum_semester2', 'n_in_curriculum_semester3', 'n_in_curriculum_semester4', 'n_in_curriculum_semester5', 'n_feedback_requests', 'n_received_feedback_responses', 'n_received_feedback_requests', 'n_feedback_responses', 'n_files', 'n_folders']


 - user_id
 - **archived**: binary, if a user is archived the is no more visible in the user list in the platform
 - **user_type**: types of the user in csv
 - **classes**: classes to which the user belongs
 - **n_activities**: the number of user _activities_
 - **n_recipes**: the number of user _recipes_
 - **n_experiences**: the number of user _experiences_
 - **n_reflections**: the number of user reflections
 - **n_in_curriculum**: the number of _total activities_ in curriculum
 - **n_recipes_in_curriculum**: the number of _recipes_ in curriculum
 - **n_experiences_in_curriculum**: the number of _experiences_ in curriculum
 - **n_in_curriculum_semester1**: number of _total activities_ in curriculum, semester 1 
 - **n_in_curriculum_semester2**: number of _total activities_ in curriculum, semester 2 
 - **n_in_curriculum_semester3**: number of _total activities_ in curriculum, semester 3 
 - **n_in_curriculum_semester4**: number of _total activities_ in curriculum, semester 4 
 - **n_in_curriculum_semester5**: number of _total activities_ in curriculum, semester 5 
 - **n_feedback_requests**: number of feedbacks _requested_
 - **n_received_feedback_responses**: number of feedbacks _received_
 - **n_received_feedback_requests**: number of requests _received_
 - **n_feedback_responses**: number of _given_ feedbacks

In the df are present those type of users:

In [6]:
print(user_info['user_type'].str.split(',', expand=True)[0].unique())

['formatore' 'docente' 'convocatore' 'studente' 'HGF']


We need only formatore (supervisor), docente (teacher) and studente (student)

In [7]:
# a little translation from italian
vals_to_replace = {'formatore':'supervisor', 'docente':'teacher', 'studente':'student'}
user_info.replace({"user_type":vals_to_replace}, inplace=True)

In [8]:
# One hot for user_type
user_info = pd.concat(
                [user_info.loc[:, :'archived'], 
                 (user_info['user_type'].str.split('\s*,\s*', expand=True)
                   .stack()
                   .str.get_dummies()
                   .sum(level=0)), 
                 user_info.loc[:, 'classes':]], 
                axis=1)

In [9]:
# remove NaN to empty classes
user_info.classes = user_info.classes.fillna('')

# fill with 0 the others
user_info.fillna(0,inplace=True)

In [10]:
user_info.head()

Unnamed: 0,user_id,archived,HGF,convocatore,docente,ispettore,student,supervisor,teacher,classes,...,n_in_curriculum_semester2,n_in_curriculum_semester3,n_in_curriculum_semester4,n_in_curriculum_semester5,n_feedback_requests,n_received_feedback_responses,n_received_feedback_requests,n_feedback_responses,n_files,n_folders
0,86,0,0,0,0,0,0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,87,0,0,0,0,0,0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,88,0,0,0,0,0,0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,89,0,0,0,0,0,0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,90,0,0,0,0,0,0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


------

# Data exploring

In [11]:
user_info.describe()

Unnamed: 0,user_id,archived,HGF,convocatore,docente,ispettore,student,supervisor,teacher,n_recipes,...,n_in_curriculum_semester2,n_in_curriculum_semester3,n_in_curriculum_semester4,n_in_curriculum_semester5,n_feedback_requests,n_received_feedback_responses,n_received_feedback_requests,n_feedback_responses,n_files,n_folders
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,...,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,526.842105,0.0,0.003509,0.003509,0.007018,0.003509,0.207018,0.74386,0.038596,0.989474,...,0.007018,0.0,0.007018,0.003509,0.14386,0.073684,0.129825,0.066667,8.347368,0.007018
std,263.801314,0.0,0.059235,0.059235,0.083623,0.059235,0.405881,0.437268,0.19297,2.434326,...,0.083623,0.0,0.083623,0.059235,0.870288,0.472678,0.843777,0.458616,21.494563,0.11847
min,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,331.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,541.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,804.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,881.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.0,...,1.0,0.0,1.0,1.0,11.0,4.0,11.0,4.0,126.0,2.0
