In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from fuzzywuzzy import fuzz 
from fuzzywuzzy import process
def fuzzy_checker(wrong_options,correct_options):
    names_array=[]
    ratio_array=[]    
    for wrong_option in wrong_options:
        if wrong_option in correct_options:
            names_array.append(wrong_option)
            ratio_array.append(100)
        else:   
            x=process.extractOne(wrong_option,correct_options,scorer=fuzz.token_set_ratio)
            names_array.append(x[0])
            ratio_array.append(x[1])
    return names_array,ratio_array


import info_utilities

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows    = None

In [3]:
data_folder = "../data/"

In [4]:
#plotly style:
line_traces = dict(mode='lines+markers',line_shape='spline',line_smoothing=1,marker_size=10,marker_opacity=0.9)

# Data loading and cleaning

## Users info

In [5]:
users      = pd.read_csv(f"{data_folder}users.csv")
users_info = pd.read_csv(f"{data_folder}users_info.csv",sep=";").set_index('attribute')

**Below the list of the columns and a short description**

In [6]:
users_info

Unnamed: 0_level_0,type,description
attribute,Unnamed: 1_level_1,Unnamed: 2_level_1
user_id,numeric,user id
archived,ordinal,"boolean: if a user is archived, is no more vis..."
user_type,mixed,types of the user in csv
classes,categorical,classes to which the user belongs
n_activities,interval,the number of user activities
n_recipes,interval,the number of user recipes
n_experiences,interval,the number of user experiences
n_reflections,interval,the number of user reflections
n_recipe_reflections,interval,the number of user's recipies reflections
n_experience_reflections,interval,the number of user's experiences reflections


In [7]:
info_utilities.check_columns(users,users_info)

There are 1 extra columns in info: ['user_id']
There are 7 extra columns in df: ['user_email', 'companies', 'avg_supervisor_evaluation', 'start_year', 'us_user', 'user_name', 'start_semester']


In the df are present those type of users:

We need only formatore (supervisor), docente (teacher) and studente (student)

In [8]:
# a little translation from italian
vals_to_replace = {'formatore':'supervisor', 'docente':'teacher', 'studente':'student'}
users.replace({"user_type":vals_to_replace}, inplace=True)

In [9]:
# One hot for user_type
users = pd.concat(
                [users.loc[:, :'user_type'], 
                 (users['user_type'].str.split('\s*,\s*', expand=True)
                   .stack()
                   .str.get_dummies()
                   .sum(level=0)), 
                 users.loc[:, 'classes':]], 
                axis=1)

In [10]:
interval_features = users_info[users_info['type']=='interval'].index.tolist()

In [11]:
# fill with 0 the interval features
users.loc[:,interval_features] = users[interval_features].fillna(0)

In [12]:
students    = users[users['student'] == 1]
supervisors = users[users['supervisor'] == 1]
teachers    = users[users['teacher'] == 1]

### Loading notes

In [13]:
def join_by_fuzzy(df,df_to_match,
                  original_column,to_match_column,
                  to_join_column, joined_column, 
                  no_match_value=np.nan,limit=90):
    
    tmp_joined_name = '__fuzzy_result'
    
    str2Match = df[original_column].tolist()
    strOptions = df_to_match[to_match_column].tolist()
    name_match,ratio_match=fuzzy_checker(str2Match,strOptions)
    
    df1 = pd.DataFrame()
    df1['old_names']=pd.Series(str2Match)
    df1['correct_names']=pd.Series(name_match)
    df1['correct_ratio']=pd.Series(ratio_match)
    equiv=df_to_match.set_index(to_match_column).to_dict()[to_join_column]
    df1.loc[:,tmp_joined_name] = df1["correct_names"].map(equiv)
    df1.loc[df1['correct_ratio']<limit,tmp_joined_name] = no_match_value
    
    return df.assign(__fuzzy_result=df1[tmp_joined_name].values).rename({'__fuzzy_result': joined_column}, axis=1)

In [14]:
grades_1st = pd.read_csv(f"{data_folder}grades_1st.csv",sep="\t",names=["name","grade"])
grades_2nd = pd.read_csv(f"{data_folder}grades_2nd.csv",sep="\t",names=["name","grade"])
grades_3rd = pd.read_csv(f"{data_folder}grades_3rd.csv",sep="\t",names=["name","grade"])

In [15]:
grades_1st[grades_1st.duplicated(['name'], keep=False)]

Unnamed: 0,name,grade
71,Poretti Victor,3.5
79,Dachille David,4.5
101,Dachille David,5.5
111,Poretti Victor,4.0
114,Stocker Nicolò,3.5
144,De Luca Carolyn,4.5
208,De Luca Carolyn,4.0
283,Stocker Nicolò,5.0


In [16]:
grades_2nd[grades_2nd.duplicated(['name'], keep=False)]

Unnamed: 0,name,grade
9,Pedrazzani Cesare,4.0
66,Fontana Loris,4.5
72,Pedrazzani Cesare,5.0
133,Poretti Victor,3.5
168,Poretti Victor,2.0
193,Fontana Loris,4.5
211,Carrozzo Samuele,3.5
225,Carrozzo Samuele,5.5


In [17]:
grades_3rd[grades_3rd.duplicated(['name'], keep=False)]

Unnamed: 0,name,grade
3,Fernandez-Bermudez Samantha Jessica,4.5
16,Steiner Alcea,4.0
32,Schönholzer Francisco,5.0
57,Steiner Alcea,4.0
65,Fernandez-Bermudez Samantha Jessica,4.5
77,Schönholzer Francisco,5.0
95,Spiegel Fabio,2.5
148,Amnuayporn Natthatida,2.5
179,Spiegel Fabio,5.0
181,Amnuayporn Natthatida,5.0


In [18]:
grades_1st.drop_duplicates('name',keep=False, inplace=True)
grades_2nd.drop_duplicates('name',keep=False, inplace=True)
grades_3rd.drop_duplicates('name',keep=False, inplace=True)

In [19]:
students = join_by_fuzzy(students,grades_1st,'user_name','name','grade','grade_1st')
students = join_by_fuzzy(students,grades_2nd,'user_name','name','grade','grade_2nd')
students = join_by_fuzzy(students,grades_3rd,'user_name','name','grade','grade_3rd')

In [20]:
students.drop_duplicates('user_name',keep=False, inplace=True)

In [31]:
filter_grades = (((students['grade_2nd'].notnull()) & (students['grade_1st'].isnull())) | (students['grade_3rd'].notnull()) & ((students['grade_1st'].isnull()) | (students['grade_2nd'].isnull())))

In [32]:
students.drop(students[filter_grades].index, axis=0, inplace=True)

In [33]:
with_null = len(students[students[['grade_1st','grade_2nd','grade_3rd']].isnull().any(axis=1)])
print(f'There are {with_null}/{len(students)} studnets with at least a null grade ({round(with_null/len(students)*100,2)}%)')

There are 372/523 studnets with at least a null grade (71.13%)


We need to calc a different metric to include the most of the users

In [34]:
print(list(students.columns))

['us_user', 'user_name', 'user_email', 'start_semester', 'start_year', 'archived', 'user_type', 'HGF', 'convocatore', 'docente', 'ispettore', 'statista', 'student', 'studente', 'supervisor', 'teacher', 'classes', 'companies', 'n_activities', 'n_recipes', 'n_experiences', 'n_reflections', 'n_recipe_reflections', 'n_experience_reflections', 'n_in_curriculum', 'n_recipes_in_curriculum', 'n_experiences_in_curriculum', 'n_in_curriculum_semester1', 'n_in_curriculum_semester2', 'n_in_curriculum_semester3', 'n_in_curriculum_semester4', 'n_in_curriculum_semester5', 'n_feedback_requests', 'n_received_feedback_responses', 'n_received_feedback_requests', 'n_feedback_responses', 'avg_activity_evaluations', 'avg_reflection_length', 'avg_specific_evaluations', 'avg_supervisor_evaluation', 'n_files', 'n_folders', 'grade_1st', 'grade_2nd', 'grade_3rd']


n_activities, n_reflections, n_in_curriculum, n_feedback_requests, avg_reflection_length, n_files, n_folders, avg_supervisor_evaluation could be good candidates

In [35]:
cols = ['n_activities', 'n_reflections', 'n_in_curriculum', 'n_feedback_requests', 'avg_reflection_length', 'n_files', 'n_folders']
students[cols].isnull().any(axis=1).sum()

0

In [36]:
# create a column the most representative for the grade
students_with_all_grades = students[['grade_1st','grade_2nd','grade_3rd']].notnull().all(axis=1)
students['E_grade'] = np.nan
students['E_grade'] = students[['grade_1st','grade_2nd','grade_3rd']].mean(axis=1)
students['E_grade'].fillna(students['avg_supervisor_evaluation'], inplace=True)

In [37]:
students_wo_grades = students[students[['E_grade']].isnull().any(axis=1)]
with_null = len(students_wo_grades)
print(f'There are {with_null}/{len(students)} students without estimate grade ({round(with_null/len(students)*100,2)}%)')

There are 202/523 students without estimate grade (38.62%)


In [38]:
students.drop(students_wo_grades.index, axis=0, inplace=True)

In [39]:
students_wo_grades = students[students[['E_grade']].isnull().any(axis=1)]
with_null = len(students_wo_grades)
print(f'There are {with_null}/{len(students)} students without estimate grade ({round(with_null/len(students)*100,2)}%)')

There are 0/321 students without estimate grade (0.0%)


In [40]:
scaler = MinMaxScaler()

In [41]:
reg = LinearRegression().fit(scaler.fit_transform(students[students['E_grade']>4][cols].values) , students[students['E_grade']>4]['E_grade'])

In [42]:
coeff = dict(zip(cols,reg.coef_))
coeff = pd.DataFrame.from_dict({'column':list(coeff.keys()),'coeff':list(coeff.values())})

In [43]:
fig = coeff.sort_values(by="coeff").plot.bar(x="column",y="coeff")
fig.show()

In [65]:
students.plot.hist(x="E_grade",y="n_feedback_requests", histfunc="avg",nbins=20)

In [66]:
students.plot.hist(x="E_grade",y="n_activities", histfunc="avg",nbins=20)

In [34]:
students.plot.scatter(x="n_feedback_requests",y="E_grade",color='E_grade',trendline="ols")