File paths and other constants to be used throughout the notebook


In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Relative path to the source Excel workbook.
excelOriginal = '../data/imd_student_blind.xlsx'
# Relative path of the CSV file that the workbook's first sheet is exported to.
csvOriginal = '../data/imd-student-blind.csv'

Libraries to import and use 

In [4]:
# Import pandas
import pandas as pd
#Import numpy
import numpy as np

Convert the original Microsoft Excel spreadsheet to .csv

In [4]:
# Load spreadsheet: xl
# The workbook path comes from the `excelOriginal` constant; the previous name
# `excelFilePath` was never defined and raised NameError on a fresh kernel.
xl = pd.ExcelFile(excelOriginal)

# Convert to .csv: parse the first sheet into a DataFrame and write it out.
df = xl.parse(0)
df.to_csv(csvOriginal)

Given a class, split the students into those who performed well in it and those who did not.

In [37]:
def getDataframeForEachStudent(df, studentColumn='a_ID'):
    """Split a DataFrame into one sub-DataFrame per student.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per (student, class) record.
    studentColumn : str
        Name of the column that identifies the student. It is now honoured;
        previously 'a_ID' was hard-coded and this argument was ignored.

    Returns
    -------
    dict
        Maps each distinct value of `studentColumn` to the rows of `df`
        that carry that value.
    """
    # Iterating a groupby yields (key, group) pairs directly, so there is no
    # need to go through .groups and .get_group.
    return {studentID: studentDF for studentID, studentDF in df.groupby(studentColumn)}

def eraseStudentsWhoDidNotParticipateOnClass(studentsDict, classID=5, classColumn='disciplina_ID'):
    """Remove, in place, every student that has no record for a class.

    Parameters
    ----------
    studentsDict : dict
        Maps a student key to that student's DataFrame. Modified in place.
    classID : int
        Class to look for; a row matches when its `classColumn` value equals
        either `classID` or `str(classID)`.
    classColumn : str
        Column holding the class identifier.

    Returns
    -------
    dict
        The same `studentsDict` object, after the removals. The number of
        removed students is printed.
    """
    classIDAsText = str(classID)
    initialCount = len(studentsDict)

    def tookClass(studentDF):
        # Stops at the first matching row.
        return any(
            row[classColumn] == classID or row[classColumn] == classIDAsText
            for _, row in studentDF.iterrows()
        )

    # Collect keys first: a dict must not change size while being iterated.
    notEnrolled = [studentID for studentID, studentDF in studentsDict.items()
                   if not tookClass(studentDF)]
    for studentID in notEnrolled:
        studentsDict.pop(studentID)

    print(str(initialCount - len(studentsDict)) + " students did not enrolled on " + str(classID))
    return studentsDict

def studentHadGoodPerformanceInDiscipline(studentDf, discipline=5, 
                                          standard='APPROVED WITH SCORE >= 7', 
                                          classColumn='disciplina_ID'):
    """Tell whether a student had a good performance in a discipline.

    Parameters
    ----------
    studentDf : pandas.DataFrame
        Records of a single student; reads `classColumn`,
        'status.disciplina' and, for the score criterion, 'nota'.
    discipline : int
        Discipline to evaluate. A row belongs to it when its `classColumn`
        value equals `discipline` or `str(discipline)`, the same rule used by
        eraseStudentsWhoDidNotParticipateOnClass.
    standard : str
        The criterion that decides whether the performance was good:
            'APPROVED ON FIRST TRY'     exactly one attempt, and it is 'Aprovado'
            'APPROVED WITH SCORE >= 7'  some 'Aprovado' attempt with nota >= 7.0
    classColumn : str
        Column holding the discipline identifier.

    Returns
    -------
    bool
        True if the student had a good performance. An unrecognised
        `standard` yields False.
    """
    if standard not in ('APPROVED ON FIRST TRY', 'APPROVED WITH SCORE >= 7'):
        return False

    classIDs = studentDf[classColumn]
    # Accept the ID stored either as a number or as text, so that a student
    # counted as enrolled is judged on those same rows.
    attempts = studentDf[(classIDs == discipline) | (classIDs == str(discipline))]
    approvedAttempts = attempts[attempts['status.disciplina'] == 'Aprovado']

    if standard == 'APPROVED ON FIRST TRY':
        return bool(len(attempts) == 1 and len(approvedAttempts) == 1)

    # 'APPROVED WITH SCORE >= 7': only approved attempts have their score checked.
    return bool((approvedAttempts['nota'] >= 7.0).any())

def getGoodAndBadStudentsInDiscipline(df, discipline=5, standard='APPROVED ON FIRST TRY'):
    """Classify the students enrolled in a discipline as good or bad performers.

    Parameters
    ----------
    df : pandas.DataFrame
        Full records table, split per student with getDataframeForEachStudent.
    discipline : int
        Discipline to evaluate; students with no record for it are left out.
    standard : str
        Criterion forwarded to studentHadGoodPerformanceInDiscipline.

    Returns
    -------
    tuple
        (goodStudents, badStudents), two lists of student keys. The count and
        percentage of good students are printed.
    """
    studentsRaw = getDataframeForEachStudent(df)
    students = eraseStudentsWhoDidNotParticipateOnClass(studentsRaw, classID=discipline)
    goodStudents = list()
    badStudents = list()
    for key, studentDF in students.items():
        if studentHadGoodPerformanceInDiscipline(studentDF, standard=standard, discipline=discipline):
            goodStudents.append(key)
        else:
            badStudents.append(key)
    print(str(len(goodStudents)) + " good students in discipline " + str(discipline))
    enrolledCount = len(students)
    # With nobody enrolled the ratio is undefined; report 0.0 rather than
    # raising ZeroDivisionError.
    goodPercentage = (len(goodStudents) / enrolledCount) * 100 if enrolledCount else 0.0
    print(str(goodPercentage) + "% good students.")
    return (goodStudents, badStudents)

# Split the students of discipline 5 under each criterion.
# t and t2 each hold a (goodStudents, badStudents) tuple.
print('Using APPROVED ON FIRST TRY')
t = getGoodAndBadStudentsInDiscipline(
    df, standard='APPROVED ON FIRST TRY', discipline=5)

print()
print('Using APPROVED WITH SCORE >= 7')
t2 = getGoodAndBadStudentsInDiscipline(
    df, standard='APPROVED WITH SCORE >= 7', discipline=5)

Using APPROVED ON FIRST TRY
377 students did not enrolled on 5
189 good students in discipline 5
36.137667304015295% good students.

Using APPROVED WITH SCORE >= 7
377 students did not enrolled on 5
91 good students in discipline 5
17.39961759082218% good students.


In [43]:
def getStudentMeanScore(df, studentID, studentColumn='a_ID', columnClasses='disciplina_ID', columnValue='nota'):
    """Return the mean of `columnValue` over all rows of one student.

    Parameters
    ----------
    df : pandas.DataFrame
        Records table.
    studentID
        Value of `studentColumn` that selects the student's rows.
    studentColumn : str
        Column identifying the student.
    columnClasses : str
        Not used by this function.
    columnValue : str
        Column whose mean is computed.
    """
    belongsToStudent = df[studentColumn] == studentID
    return df.loc[belongsToStudent, columnValue].mean()

# Mean score of each student, indexed by str(student key): t[0] holds the good
# students and t[1] the bad ones. Each Series is built in one step from a dict
# with an explicit dtype, instead of enlarging a dtype-less pd.Series() one
# item at a time.
goodStudentsMeanSeries = pd.Series(
    {str(student): getStudentMeanScore(df, student) for student in t[0]},
    dtype=float)
badStudentsMeanSeries = pd.Series(
    {str(student): getStudentMeanScore(df, student) for student in t[1]},
    dtype=float)

# Average of the per-student means for each group.
print(goodStudentsMeanSeries.mean())
print(badStudentsMeanSeries.mean())

7.07144872763921
3.7228059980604886


Find the classes with the highest or lowest scores for the students on a given dataset:

In [6]:
def getSortedDisciplines(df, columnClasses='disciplina_ID', columnValue='nota', highest=True):
    """Return the median of `columnValue` per class, sorted by that median.

    Parameters
    ----------
    df : pandas.DataFrame
        Records table.
    columnClasses : str
        Column used to group the rows.
    columnValue : str
        Column whose per-group median is computed.
    highest : bool
        True sorts from highest to lowest median; False from lowest to highest.

    Returns
    -------
    pandas.Series
        Medians indexed by the values of `columnClasses`.
    """
    medianPerClass = df[columnValue].groupby(df[columnClasses]).median()
    sortAscending = not highest
    return medianPerClass.sort_values(ascending=sortAscending)
    
# Rank the disciplines from lowest to highest median score over the full table.
# `t` is the (goodStudents, badStudents) tuple built earlier in the notebook,
# not a DataFrame, so the DataFrame `df` is passed here instead.
getSortedDisciplines(df, highest=False)

disciplina_ID
5    2.20
3    2.90
6    4.30
0    5.25
2    5.30
4    5.95
1    7.10
Name: nota, dtype: float64