In [1]:
# Project-local imports from utils.
from utils import Activities, Users

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
# Route DataFrame/Series .plot() calls through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Helpers used further down: fuzzy name-based joins and gender recognition.
from utils import join_by_fuzzy
from utils import recognize_gender

import info_utilities

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from pprint import pprint

import pydot
from IPython.display import Image

import logging
# Root logger at DEBUG: besides this project's own INFO messages, this also
# surfaces DEBUG records from third-party loggers (e.g. the
# urllib3.connectionpool lines visible in the user-loading output).
logging.basicConfig(level=logging.DEBUG)

In [2]:
# Disable truncation when DataFrames are displayed: show every column and row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
# Folder holding the processed data files read and written by this notebook.
data_folder = "../data/"
# Folder holding the raw MySQL extractions.
raws_folder = "../data/mysql_extractions/"
# Suffix appended to the base name of the files read and written below.
suffix = "_TI"

# Data Loading and Cleaning

### Loading Users

In [4]:
# Build the user table from the raw extraction, then run the cleaning steps
# exposed by Users one after the other (same order as always).
users = Users(f"{raws_folder}users{suffix}.csv")
for cleaning_step in (
    users.preprocess_user_type,
    users.fillna,
    users.solve_duplicates,
    users.solve_gender,
):
    cleaning_step()

# Students: rows flagged student == 1, keeping only the first row per user_name.
students = (
    users.df
    .query("student == 1")
    .drop_duplicates('user_name', keep='first')
)

# Supervisors and teachers are taken as-is from their respective flags.
supervisors = users.df.query("supervisor == 1")
teachers = users.df.query("teacher == 1")

INFO:root:Recognizing gender for Severine Bailly: 
INFO:root:	F
INFO:root:Recognizing gender for Junio Balla Agabus: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Junio Balla Agabus: 
INFO:root:	Change detector type..
INFO:root:Recognizing gender for Junio Balla Agabus: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Agabus&country_id=IT HTTP/1.1" 200 77
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Balla&country_id=IT HTTP/1.1" 200 83
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Junio&country_id=IT HTTP/1.1" 200 82
INFO:root:	M
INFO:root:Recognizing gender for Robin Bessire: 
INFO:root:	M
INFO:root:Recognizing 

INFO:root:	M
INFO:root:Recognizing gender for Hanna De Farias: 
INFO:root:	F
INFO:root:Recognizing gender for Chandu Devegney: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Chandu Devegney: 
INFO:root:	Change detector type..
INFO:root:Recognizing gender for Chandu Devegney: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Devegney&country_id=IT HTTP/1.1" 200 79
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Chandu&country_id=IT HTTP/1.1" 200 79
INFO:root:	M
INFO:root:Recognizing gender for Manuel Esperanzate Antrix: 
INFO:root:	M
INFO:root:Recognizing gender for Kevin Janin : 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Kevin Janin : 
INFO:root:	M
INFO:root:Recognizing gender for Laetitia Japiot

INFO:root:	M
INFO:root:Recognizing gender for Mathilde Jan: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Mathilde Jan: 
INFO:root:	F
INFO:root:Recognizing gender for MAT JUNGE: 
INFO:root:	F
INFO:root:Recognizing gender for Yann Mantelli: 
INFO:root:	M
INFO:root:Recognizing gender for Magaly Mimbela: 
INFO:root:	F
INFO:root:Recognizing gender for Camille Schonenberger: 
INFO:root:	F
INFO:root:Recognizing gender for Thibaut Serrano: 
INFO:root:	M
INFO:root:Recognizing gender for Daniel Soares Canana: 
INFO:root:	M
INFO:root:Recognizing gender for Chabbey Christian: 
INFO:root:	M
INFO:root:Recognizing gender for Cristelle Teinturier: 
INFO:root:	F
INFO:root:Recognizing gender for Stéphane Jan: 
INFO:root:	M
INFO:root:Recognizing gender for Jessica Zacchi: 
INFO:root:	F
INFO:root:Recognizing gender for Ugo Lanz: 
INFO:root:	M
INFO:root:Recognizing gender for Vincenzo Bleve: 
INFO:root:	M
INFO:root:Recognizing gender for Guillas Pascal: 
INFO:root:	M
INFO:

INFO:root:	F
INFO:root:Recognizing gender for Chahboun Elias: 
INFO:root:	M
INFO:root:Recognizing gender for Steiner Daley: 
INFO:root:	M
INFO:root:Recognizing gender for Favre Benoît: 
INFO:root:	M
INFO:root:Recognizing gender for Gay Edouard: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Gay Edouard: 
INFO:root:	F
INFO:root:Recognizing gender for Fleurisse Frédéric: 
INFO:root:	M
INFO:root:Recognizing gender for Leandro Da Silva: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Leandro Da Silva: 
INFO:root:	M
INFO:root:Recognizing gender for Brett Solochek: 
INFO:root:	M
INFO:root:Recognizing gender for Soulié Salomé: 
INFO:root:	F
INFO:root:Recognizing gender for Bensalah Ahmed: 
INFO:root:	M
INFO:root:Recognizing gender for Depping Anastasia: 
INFO:root:	F
INFO:root:Recognizing gender for Volpi Davon: 
INFO:root:	M
INFO:root:Recognizing gender for Zapata Kevin Denaro: 
INFO:root:	M
INFO:root:Recognizing gender for Killi

DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Timothee&country_id=IT HTTP/1.1" 200 81
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Salamon&country_id=IT HTTP/1.1" 200 80
INFO:root:	M
INFO:root:Recognizing gender for Xhokli Lum: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Xhokli Lum: 
INFO:root:	Change detector type..
INFO:root:Recognizing gender for Xhokli Lum: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Lum&country_id=IT HTTP/1.1" 200 80
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Xhokli&country_id=IT HTTP/1.1" 200 77
INFO:root:	M
INFO:root:Recognizing gender for Vigen Han

INFO:root:	M
INFO:root:Recognizing gender for Atienza Athena: 
INFO:root:	F
INFO:root:Recognizing gender for Stagno Steven: 
INFO:root:	M
INFO:root:Recognizing gender for Rousset Sébastien: 
INFO:root:	M
INFO:root:Recognizing gender for Bouvard Murielle: 
INFO:root:	F
INFO:root:Recognizing gender for Delessert Thierry: 
INFO:root:	M
INFO:root:Recognizing gender for Baron Séverine: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Baron Séverine: 
INFO:root:	M
INFO:root:Recognizing gender for Godin Christophe: 
INFO:root:	M
INFO:root:Recognizing gender for Legros Bruno: 
INFO:root:	M
INFO:root:Recognizing gender for Borruel José: 
INFO:root:	M
INFO:root:Recognizing gender for Orpel Pablo: 
INFO:root:	M
INFO:root:Recognizing gender for Giacomini Loïc: 
INFO:root:	M
INFO:root:Recognizing gender for Oruezabal Julie: 
INFO:root:	F
INFO:root:Recognizing gender for Beillon Jean-Michel: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for 

INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Indré Dauksaite: 
INFO:root:	Change detector type..
INFO:root:Recognizing gender for Indré Dauksaite: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Dauksaite&country_id=IT HTTP/1.1" 200 80
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Indr%C3%A9&country_id=IT HTTP/1.1" 200 77
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for Indré Dauksaite: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Dauksaite&country_id=IT HTTP/1.1" 200 80
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Formateur&country_id=IT HTTP/1.1" 200 80
INFO:root:	M
INFO:root:Recognizing gender for Daniel Gallay: 
INFO:root:	M
INFO:root:Recognizing gender for CHAMA MOHAMMAD: 
INFO:root:	M
INFO:root:Recognizing gender for DIAS SOUZA WASHINGTON: 
INFO:root:	M
INFO:root:Recognizing gender for KROUG OCEANE: 
INFO:root:	Retrying with different weights..
INFO:root:Recognizing gender for KROUG OCEANE: 
INFO:root:	Change detector type..
INFO:root:Recognizing gender for KROUG OCEANE: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=OCEANE&country_id=IT HTTP/1.1" 200 84
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=KROU

INFO:root:Recognizing gender for Mattéo Giraud: 
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Giraud&country_id=IT HTTP/1.1" 200 77
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.genderize.io:443
DEBUG:urllib3.connectionpool:https://api.genderize.io:443 "GET /?name%5B0%5D=Matt%C3%A9o&country_id=IT HTTP/1.1" 200 78
INFO:root:Recognizing gender for Roxanne Djabali: 
INFO:root:	F
INFO:root:Recognizing gender for Michael Tutt: 
INFO:root:	M
INFO:root:Recognizing gender for aitok sidi moumounta: 
INFO:root:	M
INFO:root:Recognizing gender for Antonio Ventruto: 
INFO:root:	M
INFO:root:Recognizing gender for ALessandro Garcia: 
INFO:root:	M
INFO:root:Recognizing gender for Wallimann Stéphane: 
INFO:root:	M
INFO:root:Recognizing gender for BUCCI Victoria: 
INFO:root:	F
INFO:root:Recognizing gender for Penot Christophe: 
INFO:root:	M
INFO:root:Recognizing gend

INFO:root:Recognizing gender for Laetitia Japiot: 
INFO:root:	F
INFO:root:Recognizing gender for Quentin Vialatte: 
INFO:root:	M
INFO:root:Recognizing gender for Mathieu Croze: 
INFO:root:	M
INFO:root:Recognizing gender for Pedro Matos: 
INFO:root:	M
INFO:root:Recognizing gender for Miguel Beltran: 
INFO:root:	M
INFO:root:Recognizing gender for Jérôme Rauscher: 
INFO:root:	M
INFO:root:Recognizing gender for Fabien Bourdon: 
INFO:root:	M
INFO:root:Recognizing gender for Marc Teuscher : 
INFO:root:	M
INFO:root:Recognizing gender for Meystre Johan: 
INFO:root:	M
INFO:root:Recognizing gender for Jeremy Abad: 
INFO:root:	M
INFO:root:Recognizing gender for Marc Teuscher Formateur: 
INFO:root:	M
INFO:root:Recognizing gender for Sarah Brown Formatrice: 
INFO:root:	F


### Loading Grades (TICINO)

#### Teacher's grades

In [5]:
# One tab-separated file per school year; each is read with explicit column
# labels ("name", "grade") and paired with the student column it will fill.
grade_sources = [
    (f"{data_folder}grades_1st.csv", 'grade_1st'),
    (f"{data_folder}grades_2nd.csv", 'grade_2nd'),
    (f"{data_folder}grades_3rd.csv", 'grade_3rd'),
]

# Read every file and, when a name appears more than once, keep its last row.
grade_frames = [
    pd.read_csv(csv_path, sep="\t", names=["name", "grade"])
      .drop_duplicates('name', keep='last')
    for csv_path, _ in grade_sources
]
grades_1st, grades_2nd, grades_3rd = grade_frames

# Attach each year's grade to the students via the fuzzy name join
# (students.user_name against the grade file's name column).
for (_, target_column), grade_frame in zip(grade_sources, grade_frames):
    students = join_by_fuzzy(students, grade_frame, 'user_name', 'name', 'grade', target_column)

#### Final exam grades

In [6]:
# Columns of the final-exam file that are copied onto the students table.
col_names = ["final_grade","final_LP","final_CP","final_IP","contract_type"]

# Tab-separated file read with explicit labels ('name' + col_names);
# when a name appears more than once, only its last row is kept.
grades_exams = pd.read_csv(
    f"{data_folder}grades_exams.csv",
    sep="\t",
    names=['name'] + col_names,
).drop_duplicates('name', keep='last')

# Fuzzy-join each exam column onto students under the same column name.
for col_name in col_names:
    students = join_by_fuzzy(students, grades_exams, 'user_name', 'name', col_name, col_name)

### Info

In [7]:
# Report how many rows the students table holds at this point.
student_count_message = f'There are {len(students)} students'
print(student_count_message)

There are 543 students


In [8]:
# Students missing at least one of the three teacher grades.
missing_any_teacher_grade = students[['grade_1st','grade_2nd','grade_3rd']].isnull().any(axis=1)
with_null = int(missing_any_teacher_grade.sum())
print(f'There are {with_null}/{len(students)} students with at least a null grade ({round(with_null/len(students)*100,2)}%)')

# Students missing at least one of the final-exam columns.
# NOTE(review): col_names also contains 'contract_type', so a missing contract
# type counts as a missing FINAL grade here — confirm this is intended.
missing_any_final_column = students[col_names].isnull().any(axis=1)
with_null = int(missing_any_final_column.sum())
print(f'There are {with_null}/{len(students)} students with at least a null FINAL grade ({round(with_null/len(students)*100,2)}%)')

# Students missing the final_LP grade specifically.
missing_final_lp = students['final_LP'].isnull()
with_null = int(missing_final_lp.sum())
print(f'There are {with_null}/{len(students)} students missing the LP FINAL grade ({round(with_null/len(students)*100,2)}%)')

There are 389/543 students with at least a null grade (71.64%)
There are 363/543 students with at least a null FINAL grade (66.85%)
There are 298/543 students missing the LP FINAL grade (54.88%)


In [9]:
# Column groups used by the three counts below.
teacher_grade_columns = ['grade_1st','grade_2nd','grade_3rd']

# Students for whom all three teacher grades are missing.
all_null = len(students[students[teacher_grade_columns].isnull().all(axis=1)])
print(f'There are {all_null}/{len(students)} students without a grade ({round(all_null/len(students)*100,2)}%)')

# Students for whom every final-exam column (col_names) is missing.
all_null = len(students[students[col_names].isnull().all(axis=1)])
print(f'There are {all_null}/{len(students)} students without a FINAL grade ({round(all_null/len(students)*100,2)}%)')

# Students with neither a teacher grade nor any final-exam column.
# (Message typo "studnets" corrected to "students".)
all_null = len(students[students[teacher_grade_columns + col_names].isnull().all(axis=1)])
print(f'There are {all_null}/{len(students)} students without a grade at all ({round(all_null/len(students)*100,2)}%)')

There are 235/543 students without a grade (43.28%)
There are 298/543 students without a FINAL grade (54.88%)
There are 133/543 studnets without a grade at all (24.49%)


### Average grade

In [10]:
# Estimated grade per student: row-wise mean over the teacher grades and the
# final-exam columns. Missing values are skipped, so a student with no grade
# at all ends up with NaN.
#
# col_names also contains the text column 'contract_type'. numeric_only=True
# excludes non-numeric columns explicitly instead of relying on pandas
# silently dropping them, which recent pandas versions no longer do (the
# mean would raise a TypeError there).
grade_columns = ['grade_1st','grade_2nd','grade_3rd'] + col_names
students['E_grade'] = students[grade_columns].mean(axis=1, numeric_only=True)

In [11]:
# Summary statistics (count, mean, std, quartiles, ...) of the estimated grade.
e_grade_summary = students['E_grade'].describe()
e_grade_summary

count    410.000000
mean       4.673384
std        0.711358
min        0.000000
25%        4.372321
50%        4.766667
75%        5.128571
max        6.000000
Name: E_grade, dtype: float64

In [12]:
# Students whose estimated grade could not be computed (E_grade is null).
lacks_estimate = students['E_grade'].isnull()
students_wo_grades = students[lacks_estimate]
with_null = len(students_wo_grades)
print(f'There are {with_null}/{len(students)} students without estimate grade ({round(with_null/len(students)*100,2)}%)')

There are 133/543 students without estimate grade (24.49%)


# Export extracted data

In [5]:
# Write each table as a tab-separated file (no index column) into data_folder.
# NOTE(review): this cell's recorded execution count (5) is lower than that of
# the cells before it, so the files on disk may predate the grade columns —
# re-run the notebook top to bottom to confirm.
export_targets = [
    (students, f"{data_folder}students{suffix}.csv"),
    (users.df, f"{data_folder}users{suffix}.csv"),
    (supervisors, f"{data_folder}supervisors{suffix}.csv"),
    (teachers, f"{data_folder}teachers{suffix}.csv"),
]
for table, destination in export_targets:
    table.to_csv(destination, sep='\t', index=False)

In [14]:
# Preview the first five rows of the enriched students table.
# NOTE(review): the rendered preview includes user_name and user_email values;
# consider clearing this output before sharing the notebook.
students.head(n=5)

Unnamed: 0,us_user,user_name,user_email,start_semester,start_year,archived,user_type,HGF,convocatore,teacher,supervisor,ispettore,statista,student,classes,companies,n_activities,n_activities_school_year_1,n_activities_school_year_2,n_activities_school_year_3,n_recipes,n_experiences,n_reflections,n_recipe_reflections,n_experience_reflections,n_in_curriculum,n_recipes_in_curriculum,n_experiences_in_curriculum,n_in_curriculum_semester1,n_in_curriculum_semester2,n_in_curriculum_semester3,n_in_curriculum_semester4,n_in_curriculum_semester5,n_feedback_requests,n_received_feedback_responses,n_received_feedback_requests,n_feedback_responses,avg_activity_evaluations,avg_reflection_length,avg_specific_evaluations,avg_supervisor_evaluation,n_files,n_folders,us_canton,gender,grade_1st,grade_2nd,grade_3rd,final_grade,final_LP,final_CP,final_IP,contract_type,E_grade
0,21,Paolo Barenco,paolobarenco@gmail.com,2013-08-01,2013,1,studente,0,0,0,0,0,0,1,5,,29.0,29.0,0.0,0.0,14.0,15.0,0.0,0.0,0.0,18.0,4.0,14.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,14.0,0.0,TI,1.0,4.0,,,,,,,,4.0
1,22,Patrick Binaghi,patrick.tartaruga@hotmail.it,2013-08-01,2013,1,studente,0,0,0,0,0,0,1,5819,7137.0,124.0,41.0,29.0,54.0,66.0,58.0,67.0,13.0,54.0,82.0,25.0,57.0,26.0,11.0,14.0,21.0,10.0,0.0,1.0,0.0,0.0,4.6,35.0,,6.0,54.0,0.0,TI,1.0,5.5,6.0,5.5,5.0,4.7,5.1,5.5,Apprendista,5.328571
2,23,Andrew Bottinelli,andrewbottinelli@gmail.com,2013-08-01,2013,1,studente,0,0,0,0,0,0,1,5819,,109.0,51.0,15.0,43.0,40.0,69.0,31.0,21.0,10.0,63.0,14.0,49.0,20.0,9.0,10.0,16.0,8.0,22.0,8.0,0.0,0.0,4.603448,27.0,6.0,4.0,61.0,0.0,TI,1.0,5.0,5.0,5.0,4.8,4.7,5.1,5.0,Apprendista,4.942857
3,24,Jana Clarissa Burkhalter,burkhalter.jana@bluewin.ch,2013-08-01,2013,1,studente,0,0,0,0,0,0,1,5819,9.0,119.0,24.0,14.0,81.0,63.0,56.0,60.0,20.0,40.0,74.0,19.0,55.0,21.0,16.0,13.0,13.0,11.0,57.0,0.0,0.0,0.0,5.059322,41.0,4.724138,,280.0,0.0,TI,0.0,5.5,5.5,5.0,5.1,5.2,4.8,5.0,Apprendista,5.157143
4,25,Jonathan Coscia,jonny_coscia@yahoo.com,2013-08-01,2013,1,studente,0,0,0,0,0,0,1,5819,91.0,96.0,19.0,6.0,71.0,39.0,58.0,65.0,9.0,56.0,68.0,11.0,57.0,19.0,12.0,9.0,17.0,11.0,66.0,0.0,0.0,0.0,4.929688,6.0,,,125.0,0.0,TI,1.0,5.5,5.0,4.5,4.9,4.7,5.2,5.0,Apprendista,4.971429
