<h3>Analyze Attendance</h3>
<p>
    In this project we are using attendance data from a Malaysian school. 
    Due to COVID-19, schools started holding their classes online. <br/>
    This particular school uses Google Forms / Google Sheets to 
    record student attendance.<br/>
    The attendance data collected here is from the lower-secondary level (Forms 1 – 3).<br/>
    Each level has multiple classes and each class has multiple subjects.
</p>
<p>
    <b>Level</b> - FORM 1<br/>
    <b>Classes</b> - ERAT, ELIT, DEDIKASI, IKHLAS, KREATIF, MULIA, RAJIN, RASIONAL,
    SABAR, SAYANG, SETIA<br/>
    <b>Subjects</b> - BM, BI, MATE, SN, SEJ, GEO, PI, PM, PJPK, BC, BT, PSV, RBT
</p>
<h4>About the Excel Files</h4>
<p>
    Form 1 has multiple Excel files and all of them have PDPR_1 in the file name. <br/>
    There is one Excel file for every class (DEDIKASI, ELIT, ERAT....)<br/>
    Each Excel file has multiple sheets<br/>
    Each sheet represents a subject (BM, BI, MATE, SN.....)<br/>
    Every sheet has students in the rows and days in the columns<br/>
    If a student is present it is recorded using the '/' character<br/>
</p>

In [None]:
!pip install xlrd
!pip install openpyxl

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os # provides functions for interacting with operating system.
import math # provides mathematical functions 
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
inputfolder = '/kaggle/input/attendance/'
workingfolder = '/kaggle/working/'
month = 8

In [None]:
# Before we start, we must clean the files inside the working folder
# Let us create a function that deletes all files under working folder

def clearworkingfolder():
    """Delete every file found under ``workingfolder``, nested folders included.

    Only files are removed; the folders themselves are left in place.
    """
    for folder, _subfolders, entries in os.walk(workingfolder):
        for entry in entries:
            target = os.path.join(folder, entry)
            os.remove(target)

clearworkingfolder()

<h2>Massage / Process original Excel Files</h2>

In [None]:
# The column captions are day/month (string in python)
# In some cases the teacher used a real Excel date (datetime in python)
# Let us create a python function 
# that converts the datetime column captions to string captions
# This function takes a list of column captions as parameter
# and returns a list of new column captions

def fixcolumncaption(columns):
    """Return a new list in which date captions are rendered as text.

    Every caption that is a ``datetime.date`` (which includes ``datetime``
    objects) becomes the string ``"<month>/<day>"``; any other caption is
    passed through untouched. The input list is not modified.

    NOTE(review): the text is built as month/day, while the month filters in
    this notebook look for captions ending in ``'/<month>'``. Confirm which
    order the date cells in the workbooks really carry.
    """
    return [
        f"{caption.month}/{caption.day}"
        if isinstance(caption, datetime.date)
        else caption
        for caption in columns
    ]

In [None]:
# Let us create a function that massage/process the data in the excel file 
# This function takes excel file name as parameter
# Excel file has the attendance data for a specific class
# Excel file has multiple sheets for multiple subjects
# This function returns class attendance as dictionary with subject as key
# Note: The character '/' in the excel file represent "Present" status

# Step 1: Since we need subjects, load the Excel file using the ExcelFile class
# Step 2: Get the list of subjects from the sheet names
# Step 3: Iterate through the subjects and get attendance data for every subject
# Step 4: The column captions sit at row position 7 of the parsed sheet and the
#         student rows start at position 8; the rows above are dropped
# Step 5: Fix the column captions
# Step 6: Drop the columns without a caption and turn empty cells into NaN
# Step 7: Replace the NaN values with 0 and replace the character '/' with 1
# Step 8: Drop the rows that do not have a Nama (empty or 0)
# Step 9: Add this data frame into a dictionary with subject as key

def getclassattendance(fileName):
    """Load one class workbook and return its cleaned attendance per subject.

    Parameters
    ----------
    fileName : str
        Path of the Excel workbook; every sheet holds one subject.

    Returns
    -------
    dict
        Maps each sheet name (subject) to a DataFrame whose first two columns
        are named "Bill" and "Nama", followed by the captioned columns of the
        sheet. Blank cells are 0, a '/' mark (with optional surrounding
        whitespace) is 1, and rows whose "Nama" is empty are removed.
        Sheets whose name starts with "sheet" (any case) are skipped.
    """
    print("Processing the class:", fileName)
    attendance = dict()
    # The context manager closes the workbook handle even if a sheet fails.
    with pd.ExcelFile(fileName) as excelFile:
        for subject in excelFile.sheet_names:
            if subject.lower().startswith('sheet'):
                continue
            subjectAttendance = excelFile.parse(subject)
            # Row position 7 of the parsed sheet carries the column captions;
            # the student rows start right below it.
            columns = fixcolumncaption(subjectAttendance.iloc[7].tolist())
            columns[0] = "Bill"
            columns[1] = "Nama"
            data = subjectAttendance.iloc[8:].copy()
            data.columns = columns
            # Columns without a caption carry no attendance information.
            data = data.loc[:, data.columns.notnull()]
            # np.nan instead of np.NaN: the capitalised alias was removed in
            # NumPy 2.0.
            data = data.replace(r'^\s*$', np.nan, regex=True)
            data = data.replace(np.nan, 0)
            # One pattern covers '/', ' /', '/ ' and ' / '.
            data = data.replace(r'^\s*/\s*$', 1, regex=True)
            data = data[data['Nama'] != 0]
            attendance[subject] = data
    return attendance

In [None]:
# Every class attendance processed by the function getclassattendance
# must be stored in another dictionary where the key is the "class name"
# Let us create a function that generates this class name
# For example: when the file name "KEHADIRAN PDPR_1 ELIT 2021.xlsx" is passed
# as parameter it must be converted to PDPR_1_ELIT_2021

def getclassname(name):
    """Derive the class key from a workbook file name.

    Strips the "KEHADIRAN " prefix text and the ".xlsx" extension text
    wherever they occur, then turns every remaining space into an underscore.
    """
    substitutions = (
        ("KEHADIRAN ", ''),
        (".xlsx", ''),
        (" ", "_"),
    )
    for fragment, substitute in substitutions:
        name = name.replace(fragment, substitute)
    return name

In [None]:
# This is the main function that takes all the excel files
# and pass them to getclassattendance function.
# The getclassattendance returns class attendance and processattendance
# function add class attendance inside another dictionary using class as key

def processattendance():
    """Process every .xlsx workbook found under ``inputfolder``.

    Returns
    -------
    dict
        Maps the class name produced by ``getclassname`` to the per-subject
        attendance dictionary produced by ``getclassattendance``.
    """
    attendance = dict()
    for dirname, _, filenames in os.walk(inputfolder):
        for name in filenames:
            if name.endswith(".xlsx"):
                # os.walk also descends into sub-folders, so the path must be
                # built from the folder being visited, not from inputfolder.
                data = getclassattendance(os.path.join(dirname, name))
                attendance[getclassname(name)] = data
    return attendance

In [None]:
# Now we have enough functions let us process the excel files
# and get the attendance in a dictionary format

schoolattendance = processattendance()

<h2>Calculate the Attendance by month</h2>

In [None]:
# From the school attendance we need to create summary for every class

# Let us create a function that takes classname, subject, month as
# parameters and retrieves the relevant data from the schoolattendance dictionary
# Since we are interested only in certain columns,
# let us create a list of columns that consists of Nama and the dates
# Retrieve only those columns and calculate the total attendance
# Let us add the total as a new column with the subject as column name

def summary(classname, subject, month):
    """Return one subject's attendance for a month with a per-student total.

    Parameters
    ----------
    classname : str
        Key into ``schoolattendance``.
    subject : str
        Key into that class's dictionary; also the name of the total column.
    month : int or str
        Month whose day columns are kept (captions ending in ``'/<month>'``).

    Returns
    -------
    pandas.DataFrame
        A copy holding 'Nama', the matching day columns and a column named
        after ``subject`` with the row-wise sum of the numeric columns.
    """
    attendance = schoolattendance[classname][subject]
    suffix = '/' + str(month)
    # Captions that are not text cannot be day/month labels; checking the type
    # first avoids an AttributeError from calling .endswith on them.
    daycolumns = [caption for caption in attendance.columns.tolist()
                  if isinstance(caption, str) and caption.endswith(suffix)]
    attendance = attendance[['Nama'] + daycolumns].copy()
    # numeric_only=True leaves 'Nama' out of the sum; a day column stored as
    # object dtype is left out as well.
    attendance[subject] = attendance.sum(numeric_only=True, axis=1)
    return attendance

In [None]:
def generatesummary():
    """Build one summary frame per class with a total column per subject.

    For every class in ``schoolattendance`` the per-subject totals returned by
    ``summary`` for the configured ``month`` are joined on 'Nama'.

    Returns
    -------
    dict
        Maps class name to a DataFrame with 'Nama' plus one column per subject.
    """
    attendances = dict()
    for classname, value in schoolattendance.items():
        classDataFrame = pd.DataFrame()
        for subject in value:
            # Every subject uses the configured month; the merge branch
            # previously hard-coded 8.
            current = summary(classname, subject, month)[['Nama', subject]]
            if classDataFrame.empty:
                classDataFrame = current
            else:
                # Left join: only names already present in the frame are kept.
                classDataFrame = pd.merge(left=classDataFrame,
                                          right=current,
                                          how='left', on=['Nama'])
        attendances[classname] = classDataFrame
    return attendances

In [None]:
summaryattendance = generatesummary()

In [None]:
for key in summaryattendance.keys():
    summaryattendance[key].to_excel(workingfolder + key + '.xlsx', index=False)

<h3>Checking the Data</h3>

In [None]:
# This helps you find columns that are supposed to be numeric
# but are stored as object, which makes the calculation go wrong

def checkdata():
    """Print the name and dtype of every non-'Nama' summary column stored as object."""
    for frame in summaryattendance.values():
        for column in frame.columns:
            if column == 'Nama':
                continue
            dtype = frame[column].dtype
            if dtype == 'object':
                print(column, dtype)

checkdata()

<h3>Analyze</h3>

In [None]:
summaryattendance.keys()

In [None]:
# By default pandas displays floats with 6 digits of precision
# This can be changed using the display.float_format key of the pandas set_option function
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [None]:
pdproneelit = summaryattendance['PDPR_1_ELIT_2021']
pdproneelit

In [None]:
# The head() function is used to get the first n rows. 
# By default value of n is 5
# It is useful for quickly testing the dataset.
pdproneelit.head()

In [None]:
# The tail() function is used to get the last n rows. 
# By default value of n is 5
# It is useful for quickly testing the dataset.
pdproneelit.tail()

In [None]:
# The info() function is used to print a concise summary of a DataFrame. 
# This method prints information about a DataFrame including
# 1. total number of rows (start index, end index)
# 2. total number of columns
# 3. data type of each column 
# 4. how many non-null count for each column
# 5. in this case the total entry is 17 and non-null count of each field is also 17
# 6. the dataset do not have null values
# 7. summary of data type => you have 4 integer columns and 1 non-number column
# 8. how much memory is being used in bytes
pdproneelit.info()

In [None]:
# The memory_usage() function return memory usage of each column in bytes.
pdproneelit.memory_usage()

In [None]:
# The describe() method is used for calculating some statistical data like count, 
# mean, std, min, max and percentiles of the numerical values. 
# On a frame with mixed data types it summarises only the numeric columns by default;
# pass include='all' to cover the object columns as well.
pdproneelit.describe()

<h3>Selecting rows and columns</h3>

Let's learn the various methods to grab data from a DataFrame

In [None]:
# How to retrieve a specific column
# Pass the column names as index
pdproneelit['Nama']

In [None]:
# How to retrieve more than one column
# Pass multiple column names as index list 
pdproneelit[['Nama', 'BM', 'BI']]

In [None]:
# Since the rows only have the default integer index, we pull out rows by position
# How to retrieve a specific row using row index (based on position)
# Row positions start at 0
pdproneelit.iloc[0]

In [None]:
# How to retrieve more than one row using row index (based on position)
# Here we pick the rows at positions 1, 3, 5, 7 and 9
pdproneelit.iloc[[1, 3, 5, 7, 9]]

<h3>Add, Drop, Rearrange Columns and Rows</h3>

<p>Let's learn how to do the following items</p>
<ol>
    <li>Drop a row from a DataFrame</li>
    <li>Add a row to the DataFrame</li>
</ol>


In [None]:
# According to the original author, three rows of this frame hold the total
# and percentage figures rather than students; they are removed here.
# The positions 34, 35 and 36 are hard-coded - verify them against the data
# if the class list changes.
# pdproneelit.index[[34, 35, 36]] looks up the index labels at those
# positions and drop(..., axis=0) removes the rows carrying those labels
# (axis=0, the default, means "rows").
# inplace=True modifies pdproneelit itself instead of returning a new frame.
# NOTE(review): pdproneelit is the same object that is stored in
# summaryattendance['PDPR_1_ELIT_2021'], so that entry loses the rows too.
# Running this cell a second time drops three more rows, or raises
# IndexError once fewer than 37 rows remain.
pdproneelit.drop(pdproneelit.index[[34, 35, 36]], axis=0, inplace=True)
pdproneelit

In [None]:
pdproneelit.set_index('Nama', inplace=True)

In [None]:
# Let us plot the graph using Dataframe

ax = pdproneelit[pdproneelit['BM'] != 0]['BM'].plot(kind="barh", figsize=(16, 24))
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(16)
plt.title("Elit Attendance for BM", fontsize=26)
plt.xlabel("Number of Days", fontsize=24)
plt.ylabel("Students", fontsize=24)
plt.grid()
plt.show()

In [None]:
# Let us plot the graph using Dataframe

ax = pdproneelit['PI'].plot(kind="barh", figsize=(16, 24))
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(16)
plt.title("Elit Attendance for PI", fontsize=26)
plt.xlabel("Number of Days", fontsize=24)
plt.ylabel("Students", fontsize=24)
plt.grid()
plt.show()

In [None]:
# Let us plot the graph using Dataframe

ax = pdproneelit[['BM', 'BI']].plot(kind="barh", figsize=(16, 24))
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(16)
plt.title("Elit Attendance for BM and BI", fontsize=26)
plt.xlabel("Number of Days", fontsize=24)
plt.ylabel("Students", fontsize=24)
plt.grid()
plt.show()

In [None]:
# Let us plot the graph using Dataframe

def getcolumns(dataframe):
    """Return ['Nama'] followed by the day captions of the configured month.

    A caption counts as a day of the month when it is a string ending in
    ``'/<month>'``, where ``month`` is the notebook-level setting.
    """
    # Captions that are not text cannot be day/month labels; checking the type
    # first avoids an AttributeError from calling .endswith on them.
    days = [caption for caption in dataframe.columns.tolist()
            if isinstance(caption, str) and caption.endswith('/' + str(month))]
    return ['Nama'] + days

def getattendancecs(classname, student):
    """Collect one student's daily attendance across all subjects of a class.

    For every subject of ``classname`` the rows whose 'Nama' equals ``student``
    are reduced to the day columns chosen by ``getcolumns``, indexed by 'Nama'
    and given (subject, day) MultiIndex columns. Non-empty results are joined
    on 'Nama' with a left merge, starting from the first non-empty subject.

    Returns an empty DataFrame when the student is found in no subject.
    """
    combined = pd.DataFrame()
    for subject, attendance in schoolattendance[classname].items():
        studentrows = attendance[attendance["Nama"] == student].copy()
        columns = getcolumns(studentrows)
        studentrows = studentrows[columns].set_index('Nama')
        # columns[0] is 'Nama' (now the index); the rest are the day captions.
        studentrows.columns = pd.MultiIndex.from_tuples(
            [(subject, day) for day in columns[1:]])

        if studentrows.empty:
            # Student not listed for this subject: nothing to add.
            continue
        if combined.empty:
            combined = studentrows
        else:
            combined = pd.merge(left=combined,
                                right=studentrows,
                                how='left', on=['Nama'])
    return combined

allsubjectattendance = getattendancecs('PDPR_1_ELIT_2021', 'SUSHI LOW SHU XIN')
ax = allsubjectattendance.unstack().plot(kind="bar", figsize=(16, 4))
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(12)
plt.title("Attendance for SUSHI LOW SHU XIN", fontsize=22)
plt.xlabel("Days", fontsize=20)
plt.ylabel("Attendance", fontsize=20)
labels = [item.get_text() for item in ax.get_xticklabels()]
for index in range(0, len(labels)):
    names = labels[index][1:len(labels[index])-1].split(",")
    labels[index] = '-'.join(names[0:2])
ax.set_xticklabels(labels)
ax.set_yticklabels([])
plt.grid()
plt.show()

In [None]:
# Let us plot the graph using Dataframe

def getcolumns(dataframe):
    """Return ['Nama'] followed by the day captions of the configured month.

    A caption counts as a day of the month when it is a string ending in
    ``'/<month>'``, where ``month`` is the notebook-level setting.

    NOTE(review): the same helper is also defined in the preceding plotting
    cell; consider keeping a single definition.
    """
    # Captions that are not text cannot be day/month labels; checking the type
    # first avoids an AttributeError from calling .endswith on them.
    days = [caption for caption in dataframe.columns.tolist()
            if isinstance(caption, str) and caption.endswith('/' + str(month))]
    return ['Nama'] + days

def getattendancecss(classname, subject, student):
    """Return one student's daily attendance in one subject, days as rows.

    The rows of ``schoolattendance[classname][subject]`` whose 'Nama' equals
    ``student`` are reduced to the columns chosen by ``getcolumns``, indexed
    by 'Nama' and transposed, so the result has one row per day caption and
    one column per matching student row.
    """
    sheet = schoolattendance[classname][subject]
    studentrows = sheet[sheet["Nama"] == student]
    selected = studentrows[getcolumns(studentrows)].copy()
    return selected.set_index('Nama').T

subjectDataFrame = getattendancecss('PDPR_1_ELIT_2021', 'BM', 'PHANINDRA A/L PAUL SARAVANAN')
ax = subjectDataFrame.plot(kind="bar", figsize=(16, 4))
for index, p in enumerate(ax.patches):
    if (subjectDataFrame.iloc[index]['PHANINDRA A/L PAUL SARAVANAN']):
        value = f"Present"
        ax.annotate(value, xy =(p.get_x() + 0.06, p.get_height() + 0.1), fontsize=14)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(16)
plt.title("BM Attendance for PHANINDRA A/L PAUL SARAVANAN", fontsize=22)
plt.xlabel("Days", fontsize=20)
plt.ylabel("Attendance", fontsize=20)
plt.grid()
plt.ylim(0, 1.5)
ax.set_yticklabels([])
plt.show()

In [None]:
# Per-subject day counts of one student; each bar is labelled
# "attended/maximum", where maximum is the highest value found in that
# subject's column of pdproneelit.
studentattendance = pdproneelit.loc['PHANINDRA A/L PAUL SARAVANAN']
maxdays = pdproneelit.describe().loc['max']
ax = studentattendance.plot(kind="bar", figsize=(16, 4))
for index, p in enumerate(ax.patches):
    # Both Series are labelled by subject name, so positional access goes
    # through .iloc; plain [] with an integer is deprecated for label indexes.
    value = f"{studentattendance.iloc[index]:.0f}/{maxdays.iloc[index]:.0f}"
    ax.annotate(value, xy =(p.get_x() + 0.12, p.get_height() + 0.25), fontsize=14)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
plt.title("Summary Attendance for PHANINDRA A/L PAUL SARAVANAN", fontsize=22)
plt.xlabel("Subjects", fontsize=20)
plt.ylabel("Attendance", fontsize=20)
plt.grid()
plt.ylim(0, 8)
plt.show()

In [None]:
# Same summary as the pandas bar chart, drawn with seaborn: per-subject day
# counts of one student, each bar labelled "attended/maximum".
studentattendance = pdproneelit.loc['PHANINDRA A/L PAUL SARAVANAN']
maxdays = pdproneelit.describe().loc['max']

plt.figure(figsize=(16, 4))
ax = sns.barplot(x=studentattendance.index, y=studentattendance)
for index, p in enumerate(ax.patches):
    # Both Series are labelled by subject name, so positional access goes
    # through .iloc; plain [] with an integer is deprecated for label indexes.
    value = f"{studentattendance.iloc[index]:.0f}/{maxdays.iloc[index]:.0f}"
    ax.annotate(value, xy =(p.get_x() + 0.2, p.get_height() + 0.25), fontsize=14)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(14)
plt.title("Summary Attendance for PHANINDRA A/L PAUL SARAVANAN", fontsize=22)
plt.xlabel("Subjects", fontsize=20)
plt.ylabel("Attendance", fontsize=20)
plt.ylim(0, 8)
plt.grid()
# Render explicitly, as the other plotting cells do.
plt.show()

In [None]:
nan