# AP Scores and Demographics: A Data Visualization Project

For my project, I will be working with the 2016 AP Test data, where for each test, information about student demographics and scores is given. I will produce different data visualizations and then perform a chi-squared test for categorical variables to test the null hypothesis: "All demographics score in the same manner on [insert AP test name]"

In [None]:
from __future__ import print_function, division
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.optimize as opt
from matplotlib.backends.backend_pdf import PdfPages
import random

### Loading the Data:

In [None]:
# Read the exam table from a CSV file located in the working directory.
examdata = pd.read_csv("new_exams.csv")
# Bare expression so the notebook renders the DataFrame as the cell output.
examdata

In [None]:
# Layout of `examdata` as this cell reads it: every exam occupies a group of
# 7 consecutive rows. The first 5 rows of a group are sliced out as that
# exam's per-score counts, and the row at offset 5 is read as its totals row.
# Column 0 supplies the exam name; columns 2..8 are the seven demographics.
# NOTE(review): presumably the 5 score rows are ordered 5, 4, 3, 2, 1 --
# confirm against the CSV.
classnames = []
all_classes = []
for i in range(0, 37):
    start = 7 * i
    stop = start + 5
    classnames.append(examdata.iloc[start, 0])
    # One 5-element column slice per demographic.
    all_classes.append([examdata.iloc[start:stop, j] for j in range(2, 9)])

# Stacks to (37 exams, 7 demographics, 5 score rows) when every slice is full.
all_classes = np.asarray(all_classes)

# Getting the total for each demographic for each test
totals = []
for i in range(0, 37):
    start = 5 + 7 * i
    # Address row and column in one .iloc call. The chained form
    # `examdata.iloc[start][j]` indexes a label-indexed Series with a bare
    # integer, which recent pandas versions deprecate.
    totals.append([examdata.iloc[start, j] for j in range(2, 9)])

Now that I have created a numpy array of all 37 classes, I can create the first visual: stacked bar charts (not normalized) for every class (including every demographic). All of the charts are drawn on a single page, which is saved as "StackedGraphsNewColors.pdf".

In [None]:
# Hand-picked palette: seven [r, g, b] lists with components between 0 and 1.
# NOTE: the next cell reassigns colors_arr to a freshly generated random palette.
colors_arr = [[75/255, 145/255, 194/255], [226/255, 80/255, 77/255], [255/255, 150/255, 70/255], [86/255, 180/255, 86/255], [216/255, 152/255, 197/255], [.2, 0.4, 0.8], [0.4, 0.1, 0.7]]              

In [None]:
def getNRandomColors(n):
    """Return ``n`` random colors, each as an ``[r, g, b, a]`` list.

    The three color channels are drawn with ``random.random()``; the fourth
    entry (alpha) is always 0.7.
    """
    opacity = 0.7
    palette = []
    for _ in range(n):
        rgb = [random.random() for _channel in range(3)]
        palette.append(rgb + [opacity])
    return palette


colors_arr = getNRandomColors(7)

In [None]:
# Self-assignment: rebinds colors_arr to itself, so the palette is unchanged.
colors_arr = colors_arr

In [None]:
# Draws a stacked bar chart on an existing axes object
def stacked_bar(xvalues, yvalues, legend_names, ax):
    """Stack one bar series per entry of ``yvalues`` on ``ax`` and return ``ax``.

    Series ``k`` is labelled ``legend_names[k]``, colored with the
    module-level ``colors_arr[k]`` at alpha 0.8, and drawn on top of the
    running sum of the series before it.
    """
    running_base = 0
    for layer, heights in enumerate(yvalues):
        ax.bar(
            xvalues,
            heights,
            label=legend_names[layer],
            bottom=running_base,
            color=colors_arr[layer],
            alpha=0.8,
        )
        # The next series starts where this one ended.
        running_base += heights
    return ax

def save_stacked(scores, this_class, classnames, i, ax=None):
    """Draw the stacked score chart for exam ``i`` and title it.

    Despite the name, nothing is written to disk here; the caller is
    responsible for saving the figure.

    Parameters:
        scores: x positions handed to ``stacked_bar``.
        this_class: one series of bar heights per demographic, in the order
            of the legend names below.
        classnames: sequence of exam names; ``classnames[i]`` is the title.
        i: index of the exam within ``classnames``.
        ax: axes to draw on. When omitted, a new single-axes figure is made.

    Returns:
        None.
    """
    legend_names = np.array(["White", "Black", "Latinx", "Asian",
                             "American Indian/Alaska Native", "Native Hawaiian/Pacific Islander",
                             "2+ Races"])
    # Identity test: `ax == None` would delegate to the object's own __eq__,
    # which is not guaranteed to yield a plain bool.
    if ax is None:
        _, ax = plt.subplots(1, 1)
    stacked_bar(scores, this_class, legend_names, ax)
    ax.set_title(classnames[i], fontsize=6)
    ax.tick_params(labelsize=3)

In [None]:
# Values used as the x positions of the bars in the score charts below.
scores = np.array([5, 4, 3, 2, 1])

In [None]:
# One 8x5 grid of small axes: the first 37 panels hold one exam each and the
# remaining 3 are switched off.
f, ax = plt.subplots(8, 5)
f.suptitle("2016 AP Test Scores by Student-Identified Race", y=0.92, weight='semibold')
f.set_size_inches(8, 11)
# Flatten the 2-D grid of axes so panels can be addressed by exam index.
# (The original line carried a stray trailing `0`, which is a syntax error.)
axarr = ax.flatten()

for i, panel in enumerate(axarr):
    if i < 37:
        save_stacked(scores, all_classes[i].astype("float64"), classnames, i, panel)
    else:
        panel.axis("off")

#plt.tight_layout()
f.subplots_adjust(hspace=0.5, wspace=0.5)
f.savefig("StackedGraphsNewColors.pdf", dpi=500)

### Gauging Popularity

In [None]:
# First, I will be looking at the total number of students taking each test.

In [None]:
# Build a list with one value per exam: column 9 of the row at offset 5
# inside each 7-row group (presumably the all-students total for that exam).
all_students = [examdata.iloc[7 * exam + 5, 9] for exam in range(37)]

print(all_students)
print(classnames)

In [None]:
# Shade each bar by how popular its exam is relative to the most-taken exam.
alpha = 0.75
# all_students is a plain list, and a list cannot be divided by a scalar;
# convert it to a float array before normalising.
student_counts = np.asarray(all_students, dtype="float64")
# Normalise by the largest count so every share lies in [0, 1]. The original
# divided by all_students[9]; that only yields valid color channels if entry
# 9 is the maximum, in which case this is the same value.
r_comp = student_counts / student_counts.max()
# [red, green, blue, alpha]: blue follows the share, red is a tenth of it.
colors = [[share / 10, 0, share, alpha] for share in r_comp]

f, ax = plt.subplots(1, 1)
f.set_size_inches(8, 8)
ax.barh(classnames, all_students, color=colors, capstyle='projecting')
ax.set_xlabel("Number of Students")
ax.set_title("2016 AP Test Popularity", weight='semibold')
f.tight_layout()
#plt.savefig("testbarh.pdf", dpi = 500)

### Chi-Square Tests: Is AP Score on a given AP test independent of demographic?

Null hypothesis: The two (categorical) variables are independent.
Alternative hypothesis: The two (categorical) variables are dependent.

We employ a chi-square test because the outcomes are categorical and we are focused on proportions between groups.

In [None]:
def expectedValue(rowTotal, colTotal, sampleSize):
    """Return ``rowTotal * colTotal / sampleSize``.

    This is the expected count of a contingency-table cell when the row and
    column variables are independent.
    """
    product = rowTotal * colTotal
    return product / sampleSize

def getRowTotal(table):
    """Return a list holding the entry at index 5 of every row in ``table``.

    Callers in this notebook place each row's total at index 5.
    """
    return [row[5] for row in table]

def makeExpectedTable(rowTotals, colTotals, sampleSize):
    """Return the table of expected counts as a list of 8 rows.

    Each row holds 5 expected values (one per column total, computed with
    ``expectedValue``) followed by that row's own total, i.e. 6 entries.
    Only the first 8 row totals and the first 5 column totals are read.
    """
    return [
        [expectedValue(rowTotals[r], colTotals[c], sampleSize) for c in range(5)]
        + [rowTotals[r]]
        for r in range(8)
    ]

def chisquare(o, e, sampleSize):
    """Return one cell's chi-square addend, ``(o - e)**2 / e``.

    A cell whose expected count ``e`` is 0 contributes 0 rather than
    dividing by zero. ``sampleSize`` is accepted but not used.
    """
    if e != 0:
        deviation = o - e
        return (deviation ** 2) / e
    return 0

def chisquare_table(table, etable, sampleSize):
    """Return the 7 x 5 grid of chi-square addends as a list of lists.

    Cell ``[r][c]`` is ``chisquare(table[r][c], etable[r][c], sampleSize)``;
    only the first 7 rows and first 5 columns of the inputs are read.
    """
    return [
        [chisquare(table[r][c], etable[r][c], sampleSize) for c in range(5)]
        for r in range(7)
    ]

def getchisum(chitable):
    """Return the sum of every entry in ``chitable``, added up row by row."""
    return sum(np.sum(np.array(row)) for row in chitable)
        
        

In [None]:
# Chi-square (contingency) table for one exam, built directly from examdata.
# The helpers read each row's total at index 5 and the per-column totals from
# row 7. all_classes cannot supply those: it keeps only the 5 score rows of
# each series, so all_classes[0][7][5] is out of range. Here the slice also
# takes the row at offset 5 of the exam's 7-row group (the totals row) and
# column 9 (the all-students column), then transposes, so that
#   table[r][c]  r = 0..6 demographics, c = 0..4 score rows
#   table[r][5]  total of series r
#   table[7][c]  total of score row c
#   table[7][5]  overall sample size
class_index = 0  ## change this
first_row = 7 * class_index
table = examdata.iloc[first_row:first_row + 6, 2:10].to_numpy().astype("float64").T
sampleSize = table[7][5]
colTotals = table[7]
rowTotals = getRowTotal(table)

etable = np.array(makeExpectedTable(rowTotals, colTotals, sampleSize))


print("------")
print("original")
print("------")
print(table.astype('int'))
print("------")
print("expected")
print("------")
print(etable.astype('int'))
print("------")
print("chisquare addends")
print("------")
chitable = chisquare_table(table, etable, sampleSize)
print(np.array(chitable).astype('int'))
print("------")
print("chisum = ", getchisum(chitable))
print("------")

In [None]:
def build_chi_array():
    """Return the chi-square statistic of each of the 37 exams as a 1-D array.

    Every exam's contingency table is sliced straight from ``examdata``: the
    6 rows starting at ``7 * i`` (5 score rows plus the totals row at offset
    5) and columns 2..9 (7 demographics plus the all-students column),
    transposed so that index 5 of each row is that row's total and row 7
    holds the per-score totals. ``all_classes`` is not used because it keeps
    only the 5 score rows of each series, which leaves the ``[7][5]`` totals
    lookup out of range.

    Returns:
        numpy array of length 37, in the same exam order as ``classnames``.
    """
    chi_square_array = []
    for i in range(0, 37):
        first_row = 7 * i
        table = (
            examdata.iloc[first_row:first_row + 6, 2:10]
            .to_numpy()
            .astype("float64")
            .T
        )
        sampleSize = table[7][5]
        colTotals = table[7]
        rowTotals = getRowTotal(table)
        etable = np.array(makeExpectedTable(rowTotals, colTotals, sampleSize))
        chitable = chisquare_table(table, etable, sampleSize)
        chi_square_array.append(getchisum(chitable))
    return np.array(chi_square_array)

In [None]:
# One chi-square statistic per exam, as returned by build_chi_array().
arr = build_chi_array()
# Bare expression so the notebook shows the array as the cell output.
arr

In [None]:
# Two-column table pairing every exam name with its chi-square statistic.
af = pd.DataFrame({"AP Test": classnames, "Chi-Squared Values": arr})
af

### Frequency Distributions for Each Test

In [None]:
# Rebuild classnames and all_classes, this time keeping eight series per exam
# (columns 2..9 rather than 2..8). Each series is still the 5 rows that start
# at the first row of the exam's 7-row group.
classnames = [examdata.iloc[first, 0] for first in range(0, 7 * 37, 7)]
all_classes = np.asarray(
    [
        [examdata.iloc[first:first + 5, col] for col in range(2, 10)]
        for first in range(0, 7 * 37, 7)
    ]
)

all_classes[2].shape

In [None]:
legend_names = np.array([
    "White",
    "Black",
    "Latinx",
    "Asian",
    "American Indian/Alaska Native",
    "Native Hawaiian/Pacific Islander",
    "2+ Races",
])


def freq_dist_all():
    """Save one page of per-demographic score bar charts for every exam.

    For each of the 37 exams a 4x2 grid is drawn: panels 0..6 show the score
    distribution of one demographic (titled from ``legend_names``) and the
    eighth panel is hidden. The page is titled with the exam name, captioned
    with the exam's chi-square statistic from ``arr``, and written to
    ``<exam name>_freq.pdf`` in the working directory.

    Reads the module-level ``scores``, ``all_classes``, ``colors_arr``,
    ``classnames``, ``arr`` and ``legend_names``. Returns None.

    NOTE(review): the 37 figures are left open after saving -- confirm
    whether they should be closed once written.
    """
    for j in range(0, 37):
        # For each demographic, graph the scores for THE CLASS
        ff, axx = plt.subplots(4, 2)
        ff.set_size_inches(8, 11)
        axarr = axx.flatten()

        for i in range(0, 8):
            if i == 7:
                # Only seven demographics: hide the spare eighth panel.
                axarr[i].axis("off")
            else:
                axarr[i].bar(scores, all_classes[j][i], color=colors_arr[i])
                axarr[i].set_title(legend_names[i])
                axarr[i].set_xlabel("score")
                axarr[i].set_ylabel("# of students")

        ff.suptitle(classnames[j], y=0.94, weight="bold")
        #ff.tight_layout()
        ff.subplots_adjust(hspace=0.5, wspace=0.5)
        # Raw string: "\c" is not a recognised escape sequence, so the plain
        # literal triggers an invalid-escape warning. The text is unchanged.
        caption = r"$\chi^{2}$ statistic = " + str(arr[j])
        ff.text(0.5, 0.05, caption, horizontalalignment='center', weight="bold")
        ff.savefig(classnames[j] + "_freq.pdf")


freq_dist_all()