In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory containing the CSV tables read below
oPath = 'database/tables'
# Load each table from its CSV file; row 0 of every file is used as the header
students      = pd.read_csv(oPath + '/students.csv', header=0)
classes       = pd.read_csv(oPath + '/classes.csv', header=0)
tests         = pd.read_csv(oPath + '/tests.csv', header=0)
test_results  = pd.read_csv(oPath + '/test_results.csv', header=0)
attendance    = pd.read_csv(oPath + '/attendance.csv', header=0)
hw            = pd.read_csv(oPath + '/hw_assignments.csv', header=0)
hw_results    = pd.read_csv(oPath + '/hw_grades.csv', header=0)
participation = pd.read_csv(oPath + '/participation.csv', header=0)

In [3]:
# Define a function to show values
def show_values_on_bars(axs):
    """Write each bar's height, rounded to an integer, just above the bar.

    Parameters
    ----------
    axs : Axes-like object or numpy.ndarray of such objects
        Anything exposing ``.patches`` (objects with ``get_x``, ``get_y``,
        ``get_width`` and ``get_height``) and a ``.text(x, y, s, ha=...)``
        method. An ndarray of any shape is walked element by element.

    Returns
    -------
    None
        The labels are added to the axes as a side effect.
    """
    def _show_on_single_plot(ax):
        for p in ax.patches:
            height = p.get_height()
            # A bar without a finite height would be labelled "nan" and
            # positioned at a non-finite y coordinate, so it is skipped.
            if not np.isfinite(height):
                continue
            # Centre the label horizontally on the bar ...
            _x = p.get_x() + p.get_width() / 2
            # ... and place it slightly above the bar's top edge.
            _y = p.get_y() + height + 0.05
            ax.text(_x, _y, '{:.0f}'.format(height), ha="center")

    if isinstance(axs, np.ndarray):
        # .flat visits every element regardless of the array's shape
        for ax in axs.flat:
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

In [4]:
# Plain Python lists of every class ID and every student ID, in table order
lClasses = classes['CLASS_ID'].tolist()
lStudents = students['STUDENT_ID'].tolist()

# Test Scores

In [194]:
# Display the test_results table that was read from test_results.csv
test_results

Unnamed: 0,TEST_ID,STUDENT_ID,TEST_SCORE,TEST_SCORE_MAX,TEST_SCORE_PERCENTAGE,TEST_DATE,CLASS_ID
0,0,20,32,50,0.64,2013-03-27,0
1,0,29,45,50,0.90,2013-03-27,0
2,0,32,47,50,0.94,2013-03-27,0
3,0,56,35,50,0.70,2013-03-27,0
4,0,58,43,50,0.86,2013-03-27,0
...,...,...,...,...,...,...,...
14495,144,319,29,50,0.58,2015-07-26,26
14496,144,342,25,50,0.50,2015-07-26,26
14497,144,354,3,50,0.06,2015-07-26,26
14498,144,375,42,50,0.84,2015-07-26,26


In [6]:
# Create a DataFrame of test results for each class, keyed by CLASS_ID
dfTests = {}
# Loop through the class list
for i in lClasses:
    dfTests[i] = (
        # A boolean mask keeps only this class's rows directly, instead of
        # blanking every other row to NaN and dropping it afterwards.
        test_results[test_results['CLASS_ID'] == i]
        # Renumber the rows from 0 within the class
        .reset_index(drop = True)
        # Set the types of the ID and score columns
        .astype({'STUDENT_ID': 'int32', 'TEST_ID': 'int32', 'TEST_SCORE': 'int32', 'TEST_SCORE_MAX': 'int32', 'CLASS_ID': 'int32'})
    )

In [13]:
# Display the per-class test results frame stored under class ID 1
dfTests[1]

Unnamed: 0,TEST_ID,STUDENT_ID,TEST_SCORE,TEST_SCORE_MAX,TEST_SCORE_PERCENTAGE,TEST_DATE,CLASS_ID
0,0,24,11,50,0.22,2013-03-27,1
1,0,70,17,50,0.34,2013-03-27,1
2,0,88,34,50,0.68,2013-03-27,1
3,0,149,35,50,0.70,2013-03-27,1
4,0,166,15,50,0.30,2013-03-27,1
...,...,...,...,...,...,...,...
430,28,393,0,50,0.00,2015-07-29,1
431,28,411,0,50,0.00,2015-07-29,1
432,28,440,7,50,0.14,2015-07-29,1
433,28,458,18,50,0.36,2015-07-29,1


In [12]:
# Class ID to summarise
i = 1

# Distinct test IDs taken by this class
lTests = dfTests[i]['TEST_ID'].unique()
# Mean TEST_SCORE for every (CLASS_ID, TEST_ID) pair
dfGroup = pd.DataFrame(dfTests[i].groupby(['CLASS_ID', 'TEST_ID'])['TEST_SCORE'].mean())
# rename() returns a new frame, so the result must be assigned back;
# without the assignment the column keeps the name TEST_SCORE.
dfGroup = dfGroup.rename(columns = {'TEST_SCORE':'MEAN_SCORE'})
dfGroup

Unnamed: 0_level_0,Unnamed: 1_level_0,TEST_SCORE
CLASS_ID,TEST_ID,Unnamed: 2_level_1
1,0,22.266667
1,1,24.533333
1,2,22.933333
1,3,27.066667
1,4,26.0
1,5,22.933333
1,6,25.4
1,7,25.0
1,8,22.733333
1,9,25.666667


# Students

In [None]:
# Number of distinct students sharing each English name, most common name first
dfTopNames = (
    students
    .groupby(['STUDENT_NAME_ENGLISH'])['STUDENT_ID']
    .nunique()
    .sort_values(ascending = False)
)

In [None]:
# Plot Top 20 Names
# Set the title (also used as the output file name)
tTitle = "Top 20 Names"
# Take the first 20 entries of dfTopNames
dfTop20Names = dfTopNames.head(20)
# Plot the new DF
pTop20Names = dfTop20Names.plot.bar(x = 'STUDENT_NAME_ENGLISH', figsize=(12,6))
# Format the graph
pTop20Names.set_title(tTitle, size = 30)
pTop20Names.set_xlabel("Name", size = 18)
# dfTopNames holds the number of unique STUDENT_IDs per name, so the axis
# shows a student count rather than a participation count.
pTop20Names.set_ylabel("Student Count", size = 18)
# Show values at the tops of the bars
show_values_on_bars(pTop20Names)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pTop20Names.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Bottom 20 Names
# Set the title
tTitle = "Bottom 20 Names"
# Take the last 20 entries of dfTopNames
dfBottom20Names = dfTopNames.tail(20)
# Plot the new DF
pBottom20Names = dfBottom20Names.plot.bar(x = 'STUDENT_NAME_ENGLISH', figsize=(12, 6))
# Format the graph
pBottom20Names.set_title(tTitle, size = 30)
pBottom20Names.set_xlabel("Name", size = 18)
# dfTopNames holds the number of unique STUDENT_IDs per name, so the axis
# shows a student count rather than a participation count.
pBottom20Names.set_ylabel("Student Count", size = 18)
# Show values at the tops of the bars
show_values_on_bars(pBottom20Names)

In [None]:
# Plot Student Sex Distribution
# Set the title (also used as the output file name)
tTitle = "Student Sex Distribution"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10, 6))
# One bar per STUDENT_SEX value, drawn on the axes created above
pStudentSexDistribution = sns.countplot(x = 'STUDENT_SEX', data = students, ax = ax)
# Format the graph
pStudentSexDistribution.set_title(tTitle, size = 30)
pStudentSexDistribution.set_xlabel("Sex", size = 18)
pStudentSexDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentSexDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentSexDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Sex Distribution by Enrolled Status
# Set the title (also used as the output file name)
tTitle = "Student Sex Distribution by Enrolled Status"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10, 6))
# Bars per STUDENT_SEX value, split by STUDENT_ENROLLED
pStudentSexDistribution = sns.countplot(x = 'STUDENT_SEX', data = students, ax = ax, hue = 'STUDENT_ENROLLED')
# Format the graph
pStudentSexDistribution.set_title(tTitle, size = 30)
pStudentSexDistribution.set_xlabel("Sex", size = 18)
pStudentSexDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentSexDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentSexDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Age Distribution
# Set the title (also used as the output file name)
tTitle = "Student Age Distribution"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10, 6))
# One bar per STUDENT_AGE value, drawn on the axes created above
pStudentAgeDistribution = sns.countplot(x = 'STUDENT_AGE', data = students, ax = ax)
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Age", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Age Distribution by Sex
# Set the title (also used as the output file name)
tTitle = "Student Age Distribution by Sex"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10, 6))
# Bars per STUDENT_AGE value, split by STUDENT_SEX
pStudentAgeDistribution = sns.countplot(x = 'STUDENT_AGE', data = students, ax = ax, hue = 'STUDENT_SEX')
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Age", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Age Distribution by Enrolled Status
# Set the title (also used as the output file name)
tTitle = "Student Age Distribution by Enrolled Status"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(10, 6))
# Bars per STUDENT_AGE value, split by STUDENT_ENROLLED
pStudentAgeDistribution = sns.countplot(x = 'STUDENT_AGE', data = students, ax = ax, hue = 'STUDENT_ENROLLED')
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Age", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Class Distribution
# Set the title (also used as the output file name)
tTitle = "Student Class Distribution"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(12, 6))
# One bar per CLASS_ID value.
# NOTE(review): the variable name is carried over from the age plots even
# though this chart is grouped by class.
pStudentAgeDistribution = sns.countplot(x = 'CLASS_ID', data = students, ax = ax)
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Class", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Class Distribution grouped by sex
# Set the title (also used as the output file name)
tTitle = "Student Class Distribution by Sex"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(15, 6))
# Bars per CLASS_ID value, split by STUDENT_SEX.
# NOTE(review): the variable name is carried over from the age plots even
# though this chart is grouped by class.
pStudentAgeDistribution = sns.countplot(x = 'CLASS_ID', data = students, ax = ax, hue = 'STUDENT_SEX')
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Class", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

In [None]:
# Plot Student Class Distribution grouped by enrolled
# Set the title (also used as the output file name)
tTitle = "Student Class Distribution by Enrolled Status"
# Create figure and axis objects
fig, ax = plt.subplots(figsize=(15, 4))
# Bars per CLASS_ID value, split by STUDENT_ENROLLED.
# NOTE(review): the variable name is carried over from the age plots even
# though this chart is grouped by class.
pStudentAgeDistribution = sns.countplot(x = 'CLASS_ID', data = students, ax = ax, hue = 'STUDENT_ENROLLED')
# Format the graph
pStudentAgeDistribution.set_title(tTitle, size = 30)
pStudentAgeDistribution.set_xlabel("Class", size = 18)
pStudentAgeDistribution.set_ylabel("Count", size = 18)
# Show bar values on the tops of the bars
show_values_on_bars(pStudentAgeDistribution)
# Save the figure. A forward slash works on every OS; "images\{}" contains an
# invalid escape sequence and is not treated as a directory outside Windows.
# Assumes the images directory already exists.
pStudentAgeDistribution.figure.savefig("images/{}.png".format(tTitle))

# Participation

## Average Attempts per Instance

### Prepare DataFrames

In [26]:
# Participation rows for a single student, stored under that student's ID
student_id = 0
dfPartStudent = {}
# where() blanks every non-matching row to NaN; dropna(how='all') then removes
# those fully-empty rows, leaving only this student's records.
dfPartStudent[student_id] = (
    participation
    .where(participation['STUDENT_ID'] == student_id)
    .dropna(how = 'all')
)
dfPartStudent[student_id]

Unnamed: 0,STUDENT_ID,PARTICIPATION_DATETIME,PARTICIPATION_TYPE,PARTICIPATION_ATTEMPTS,PARTICIPATION_HINTS,CLASS_ID,CLASS_TIME,CLASS_DAY
9,0.0,4/26/2014,volunteer,1.0,0.0,17.0,17:30:00,Saturday
416,0.0,1/31/2015,cold call,3.0,0.0,17.0,17:30:00,Saturday
844,0.0,5/9/2015,cold call,2.0,1.0,17.0,17:30:00,Saturday
2493,0.0,4/5/2014,volunteer,2.0,1.0,17.0,17:30:00,Saturday
3729,0.0,5/18/2013,cold call,1.0,0.0,17.0,17:30:00,Saturday
...,...,...,...,...,...,...,...,...
298811,0.0,11/15/2014,volunteer,1.0,0.0,17.0,17:30:00,Saturday
299102,0.0,6/8/2013,cold call,2.0,0.0,17.0,17:30:00,Saturday
299288,0.0,7/27/2013,volunteer,1.0,0.0,17.0,17:30:00,Saturday
299541,0.0,4/12/2014,cold call,1.0,0.0,17.0,17:30:00,Saturday


In [35]:
# Participation rows for a single class, stored under that class's ID
# (this rebinds dfPartStudent to a fresh dict)
class_id = 0
dfPartStudent = {}
# where() blanks every non-matching row to NaN; dropna(how='all') removes those
# rows, and STUDENT_ID is cast back to an integer type afterwards.
dfPartStudent[class_id] = (
    participation
    .where(participation['CLASS_ID'] == class_id)
    .dropna(how = 'all')
    .astype({'STUDENT_ID': 'int32'})
)
dfPartStudent[class_id]

Unnamed: 0,STUDENT_ID,PARTICIPATION_DATETIME,PARTICIPATION_TYPE,PARTICIPATION_ATTEMPTS,PARTICIPATION_HINTS,CLASS_ID,CLASS_TIME,CLASS_DAY
5,262,6/19/2013,volunteer,5.0,3.0,0.0,16:30:00,Wednesday
33,133,5/29/2013,cold call,5.0,4.0,0.0,16:30:00,Wednesday
36,29,6/11/2014,cold call,1.0,0.0,0.0,16:30:00,Wednesday
80,163,6/10/2015,cold call,2.0,0.0,0.0,16:30:00,Wednesday
128,262,4/1/2015,volunteer,1.0,0.0,0.0,16:30:00,Wednesday
...,...,...,...,...,...,...,...,...
299888,476,8/6/2014,volunteer,1.0,0.0,0.0,16:30:00,Wednesday
299915,75,4/2/2014,cold call,3.0,1.0,0.0,16:30:00,Wednesday
299931,499,8/13/2014,volunteer,2.0,1.0,0.0,16:30:00,Wednesday
299934,499,10/8/2014,volunteer,2.0,0.0,0.0,16:30:00,Wednesday


In [38]:
# Distinct STUDENT_IDs appearing in the participation rows stored under class_id
lClass0Students = dfPartStudent[class_id]['STUDENT_ID'].unique()
# Display the resulting array
lClass0Students

array([262, 133,  29, 163,  56, 309, 428,  75,  82,  32, 395, 343,  72,
       476, 406, 117,  69, 368,  20, 332, 232, 421,  58, 499])

## Average Hints per Instance

### Prepare DataFrames

## Participations Count by Class

### Prepare DataFrames

In [None]:
# Join every participation row to its student record
dfVOL = pd.merge(students, participation, on = 'STUDENT_ID')
# Number of joined rows per student
value_counts = dfVOL['STUDENT_ID'].value_counts()
# Turn the counts into a two-column frame with readable column names
dfPart = pd.DataFrame(value_counts).reset_index()
dfPart.columns = ['STUDENT_ID', 'PARTICIPATIONS COUNT']
# Attach the student details; the outer join keeps students with no counted rows
dfPart = pd.merge(dfPart, students, how = 'outer', on = 'STUDENT_ID')

In [None]:
# Mean participation count per class, as a two-column frame
class_mean = pd.DataFrame(
    dfPart.groupby('CLASS_ID')['PARTICIPATIONS COUNT'].mean()
).reset_index()
class_mean.columns = ['CLASS_ID', 'MEAN # OF PARTICIPATIONS']
# Attach each class's mean to every student row of that class
dfNoPall = pd.merge(dfPart, class_mean, how = 'outer', on = 'CLASS_ID')

In [18]:
# Copy of dfNoPall ordered by CLASS_ID
dfSorted = dfNoPall.sort_values(by = 'CLASS_ID')
# One DataFrame per class, keyed by CLASS_ID
dfNoP = {}
for i in lClasses:
    dfNoP[i] = (
        # Blank every row that does not belong to this class ...
        dfNoPall.where(dfNoPall['CLASS_ID'] == i)
        .drop(columns = ['STUDENT_FIRST_DAY', 'STUDENT_LAST_DAY'])
        # ... then discard the rows that are now entirely NaN
        .dropna(how = 'all')
        # Highest participation count first, renumbered from 0
        .sort_values(by = 'PARTICIPATIONS COUNT', ascending = False)
        .reset_index(drop = True)
        .astype({'STUDENT_ID': 'int32'})
    )

### Graphs

In [None]:
# Generate a graph of the count of participations for each student within each class
for i in lClasses:
    # NOTE(review): "Particpation" is misspelled; it is left unchanged because
    # the title is also the saved file name.
    tTitle = "Particpation Count by Student for Class {}".format(i)
    # Plot participations count as bars
    pBlah = dfNoP[i].plot.bar(x = "STUDENT_NAME_ENGLISH", y = 'PARTICIPATIONS COUNT', color = 'darkgreen', figsize=(20,12), legend = False)
    # Draw the class mean as a line on the same axes (ax = pBlah)
    dfNoP[i].plot(x = "STUDENT_NAME_ENGLISH", y = 'MEAN # OF PARTICIPATIONS', ax = pBlah, color = 'lightblue', figsize=(20,12), lw = 4, legend = True)
    # Format the graph
    pBlah.set_title(tTitle, size = 30)
    pBlah.set_xlabel("Name", size = 18)
    pBlah.set_ylabel("Participation Count", size = 18)
    # The mean is the same on every row, so read the first row by position
    pBlah.legend(["Class Mean: " + str(round(dfNoP[i]['MEAN # OF PARTICIPATIONS'].iloc[0]))])
    # Label the bars
    show_values_on_bars(pBlah)
    # Save the figure. A forward slash works on every OS; "images\{}" contains
    # an invalid escape sequence and is not treated as a directory outside
    # Windows. Assumes the images directory already exists.
    pBlah.figure.savefig("images/{}.png".format(tTitle))

## Participations Count Average Per Day

### Prepare DataFrames

In [23]:
# Count the participation rows for every (STUDENT_ID, PARTICIPATION_DATETIME) pair
dfByDate = dfVOL.groupby(['STUDENT_ID', 'PARTICIPATION_DATETIME'])['PARTICIPATION_DATETIME'].count()
dfByDate = pd.DataFrame(dfByDate)
# Only the single data column needs a new name. The previous mapping listed
# 'PARTICIPATION_DATETIME' twice, so its 'DATETIME' entry was silently
# discarded, and rename(columns=...) does not touch the index levels anyway.
dfByDate = dfByDate.rename(columns = {'PARTICIPATION_DATETIME': 'COUNT'})

In [None]:
# Define a dictionary to store each student's DF in
dfPC = {}
# Define a list of Means to store in the new dataframe
lMeans = []
# Loop through the STUDENT_IDs and calculate their mean number of participations
# NOTE(review): dfByDate.loc[i] raises KeyError for a student ID that has no
# rows in dfByDate.
for i in lStudents:
    # dfa is only another name for dfByDate, not a copy
    dfa = dfByDate
    # Take an explicit copy of this student's rows so the column assignments
    # below neither write into dfByDate nor trigger SettingWithCopyWarning
    dfPC[i] = dfa.loc[i].copy()
    # Set their ID to their ID from the list
    dfPC[i]['STUDENT_ID'] = i
    # Mean of the student's per-date counts, repeated on every row
    dfPC[i]['MEAN'] = dfPC[i]['COUNT'].mean()
    # Sort from highest to lowest count
    dfPC[i] = dfPC[i].sort_values(by = 'COUNT', ascending = False)
    # Append the mean to lMeans. The frame is indexed by PARTICIPATION_DATETIME,
    # so the first row is read by position with .iloc rather than by label 0.
    lMeans.append(dfPC[i]['MEAN'].iloc[0])

In [None]:
# Pair every student ID with its mean from lMeans (both lists are in lStudents order)
dfMeanPart = pd.DataFrame({
    'STUDENT_ID': lStudents,
    'MEAN_PARTICIPATIONS_PER_CLASS': lMeans,
})
# Attach the student details, then discard the columns that are not needed
dfMeanPart = pd.merge(dfMeanPart, students, how = 'inner', on = 'STUDENT_ID')
dfMeanPart = dfMeanPart.drop(columns = ['STUDENT_ENROLLED', 'STUDENT_FIRST_DAY', 'STUDENT_LAST_DAY'])

### Graphs

In [None]:
# Plot the students' means by class
for i in lClasses:
    # Set the title (also used as the output file name).
    # NOTE(review): "Particpations" is misspelled; it is left unchanged because
    # the title is also the saved file name.
    tTitle = "Mean Particpations for Class {}".format(i)
    # Keep only this class's rows: where() blanks the others to NaN and
    # dropna(how='all') removes the rows that are entirely NaN
    df = dfMeanPart.where(dfMeanPart['CLASS_ID'] == i).dropna(how = 'all')
    # Sort values from highest to lowest mean participation
    df = df.sort_values(by = 'MEAN_PARTICIPATIONS_PER_CLASS', ascending = False)
    # Create class mean column (same value on every row)
    df['CLASS_MEAN'] = df['MEAN_PARTICIPATIONS_PER_CLASS'].mean()
    # Plot each student's mean as bars
    pBlah = df.plot.bar(x = "STUDENT_NAME_ENGLISH", y = 'MEAN_PARTICIPATIONS_PER_CLASS', color = 'darkgreen', figsize=(20,12), legend = False)
    # Draw the class mean as a line on the same axes (ax = pBlah)
    df.plot(x = "STUDENT_NAME_ENGLISH", y = 'CLASS_MEAN', ax = pBlah, color = 'lightblue', figsize=(20,12), lw = 4, legend = True)
    # Format the graph
    pBlah.set_title(tTitle, size = 30)
    pBlah.set_xlabel("Name", size = 18)
    pBlah.set_ylabel("Mean Participations per Class", size = 18)
    pBlah.legend(["Class Mean: " + str(round(df['MEAN_PARTICIPATIONS_PER_CLASS'].mean(), 2))])
    # Label the bars
    show_values_on_bars(pBlah)
    # Save the figure. A forward slash works on every OS; "images\{}" contains
    # an invalid escape sequence and is not treated as a directory outside
    # Windows. Assumes the images directory already exists.
    pBlah.figure.savefig("images/{}.png".format(tTitle))