In [18]:
# Import key librarys

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [19]:
# Define level
level = 'Advanced_Higher'
level_label = level.replace('_', ' ')

# Import data
df = pd.read_excel("./more_data/attainment-statistics-(august)-2022-provisional-main.xlsx", 
                        sheet_name = level, skiprows = 2)

df17 = pd.read_excel("./more_data/ASR2017_" + level + ".xls", 
                        sheet_name = "Table 4", skiprows = 7)

df16 = pd.read_excel("./more_data/ASR2016_" + level + ".xls", 
                        sheet_name = "Table 4", skiprows = 7)

In [20]:
years = ['2021', '2020', '2019', '2018', '2017', '2016']
totals_students = {}

for year in years:
    
    dfs = pd.read_excel('./even_more_data/ASR' + year + '_' + level + '.xls', 
                        sheet_name = "Tables 6a, 6b & 6c", skiprows = 6, nrows = 17, 
                           usecols=[0,1, 3, 5])
    
    totals_students[year] = dfs.loc[dfs['NUMBER OF COURSES ENTERED'] == 'Total learners'].values[0][1]

In [21]:
def tidy_up_1(df):
    
    # Remove spaces and replace with '_'
    df.columns = df.columns.str.replace(' ','_')

    #df.drop(df[df.Grade_A_Count_2022 == '[z]'].index, inplace=True)
    df = df.replace('[z]', np.nan)
    df = df.replace('[c]', np.nan)
    df = df.replace('[low]', np.nan)

    # Remove last row
    df = df.drop(df.index[-1])

    # Remove columns with Percentage in the heading
    df = df[df.columns.drop(list(df.filter(regex='Percentage')))]
    
    return df
    
df = tidy_up_1(df)

In [22]:
def tidy_up_1a(df, year):
    
    # Sort out column titles
    df = df.rename({'TITLE': 'Subject'}, axis=1)
    df = df.rename({'ENTRIES': 'Entries_' + year}, axis=1)
    
    df = df.rename({'A': 'As_' + year}, axis=1)
    df = df.rename({'B': 'Bs_' + year}, axis=1)
    df = df.rename({'C': 'Cs_' + year}, axis=1)
    df = df.rename({'D': 'Ds_' + year}, axis=1)
    
    #df = df.rename({'NAs' + year: 'NAs_' + year}, axis=1)

    # Remove last row
    df = df.drop(df.index[-1])
 
    return df
    
df16 = tidy_up_1a(df17, '2016')
df17 = tidy_up_1a(df17, '2017')

In [23]:
def count(df, year):

    # Calculate number of Bs, Cs and Ds
    df['As_' + year] = round(df['Entries_' + year] * df['As_' + year] / 100)
    df['Bs_' + year] = round(df['Entries_' + year] * df['Bs_' + year] / 100)
    df['Cs_' + year] = round(df['Entries_' + year] * df['Cs_' + year] / 100)
    df['Ds_' + year] = round(df['Entries_' + year] * df['Ds_' + year] / 100)
    
    # Calculate number of NAs
    df['NAs_' + year] = round(df['Entries_' + year] 
                              - df['As_' + year]
                              - df['Bs_' + year]
                              - df['Cs_' + year]
                              - df['Ds_' + year])
        
    return df

df16 = count(df16, '2016')
df17 = count(df17, '2017')

In [24]:
df16.tail()

Unnamed: 0,Subject,Entries_2016,As_2016,Bs_2016,Cs_2016,Ds_2016,NAs_2016
30,Physical Education,350.0,100.0,87.0,93.0,24.0,46.0
31,Physics,1860.0,564.0,487.0,414.0,148.0,247.0
32,"Religious, Moral and Philosophical Studies",172.0,61.0,30.0,35.0,15.0,31.0
33,Spanish,433.0,149.0,109.0,71.0,37.0,67.0
34,Statistics,189.0,71.0,42.0,33.0,12.0,31.0


In [25]:
def tidy_up_2(df):

    # Create a list of years
    years = ['2022', '2021', '2020', '2019', '2018']

    # Sort out column titles
    for year in years:
        df = df.rename({'Grade_A_Count_' + year: 'As_' + year}, axis=1)
        df = df.rename({'Grades_A-B_Count_' + year: 'A-Bs_' + year}, axis=1)
        df = df.rename({'Grades_A-C_Count_' + year: 'A-Cs_' + year}, axis=1)
        df = df.rename({'Grades_A-D_Count_' + year: 'A-Ds_' + year}, axis=1)
        df = df.rename({'No_Award_Count_' + year: 'NAs_' + year}, axis=1)

    # Calculate number of Bs, Cs and Ds
    for year in years:
        df['Bs_' + year] = df['A-Bs_' + year] - df['As_' + year]
        df['Cs_' + year] = df['A-Cs_' + year] - df['A-Bs_' + year]
        df['Ds_' + year] = df['A-Ds_' + year] - df['A-Cs_' + year]

    # Remove original columns
    for year in years:
        df.drop('A-Bs_' + year, axis=1, inplace=True)
        df.drop('A-Cs_' + year, axis=1, inplace=True)
        df.drop('A-Ds_' + year, axis=1, inplace=True)
        
    return df
        
df = tidy_up_2(df)

In [26]:
# Sort out subjects names in 2016 and 2017 datasets

def tidy_up_2a(df):

    # Update names
    di = {'Hospitality: Practical Cake Craft': 'Practical Cake Craft',
          'Hospitality: Practical Cookery': 'Practical Cookery',
          'Cantonese': 'Chinese Languages', 'Gàidhlig': 'Gaidhlig'}
    
    df = df.replace({"Subject": di})
    
    # Remove duplicate names
    values = ['Cruinn-eolas (Geography)', 'Nuadh-Eolas (Modern Studies)', 'Eachdraidh (History)', 
        'Matamataig (Mathematics)', 'Matamataig Fad-bheatha (Lifeskills Mathematics)',
            'Mandarin (Simplified)', 'Mandarin (Traditional)']

    df = df.loc[~df['Subject'].isin(values)]
    
    return df

df16 = tidy_up_2a(df16)
df17 = tidy_up_2a(df17)

In [27]:
df.head()

Unnamed: 0,Subject,As_2022,NAs_2022,Entries_2022,As_2021,NAs_2021,Entries_2021,As_2020,NAs_2020,Entries_2020,...,Ds_2021,Bs_2020,Cs_2020,Ds_2020,Bs_2019,Cs_2019,Ds_2019,Bs_2018,Cs_2018,Ds_2018
0,Accounting,25,5.0,65,45,5.0,75,40,,60,...,5,10,10,0,15.0,15.0,5.0,15.0,10.0,0.0
1,Art and Design (Design),220,10.0,665,295,15.0,635,240,10.0,540,...,15,190,85,15,140.0,140.0,25.0,150.0,190.0,35.0
2,Art and Design (Expressive),475,10.0,1280,550,20.0,1165,475,10.0,1015,...,35,340,170,20,290.0,280.0,60.0,275.0,300.0,60.0
3,Biology,775,265.0,3165,1240,195.0,2960,910,85.0,2510,...,280,720,610,190,570.0,575.0,285.0,585.0,595.0,280.0
4,Business Management,205,65.0,790,340,10.0,590,285,5.0,560,...,20,165,85,20,115.0,115.0,40.0,135.0,115.0,45.0


In [28]:
# Concat together three dfs (on Subject as index)

dfc = pd.concat([df.set_index('Subject'), df17.set_index('Subject'), df16.set_index('Subject')], 
                  axis=1).reset_index()

In [29]:
def as_per(df):

    # Create a list of years
    years = ['2021', '2020', '2019', '2018', '2017', '2016']

    # Calculate perc of As
    for year in years:
        
        df['perAs_' + year] = round(df['As_' + year] / totals_students[year], 4)
        
    return df

dfc = as_per(dfc)
        
dfc.tail()

Unnamed: 0,Subject,As_2022,NAs_2022,Entries_2022,As_2021,NAs_2021,Entries_2021,As_2020,NAs_2020,Entries_2020,...,Bs_2016,Cs_2016,Ds_2016,NAs_2016,perAs_2021,perAs_2020,perAs_2019,perAs_2018,perAs_2017,perAs_2016
31,Physics,770.0,150.0,2130.0,1005.0,100.0,1945.0,865.0,40.0,1710.0,...,487.0,414.0,148.0,247.0,0.0641,0.0623,0.0373,0.0402,0.0392,0.0396
32,"Religious, Moral and Philosophical Studies",85.0,20.0,250.0,155.0,,235.0,95.0,0.0,180.0,...,30.0,35.0,15.0,31.0,0.0099,0.0068,0.004,0.0024,0.0042,0.0043
33,Spanish,200.0,35.0,470.0,280.0,15.0,465.0,250.0,,465.0,...,109.0,71.0,37.0,67.0,0.0179,0.018,0.0126,0.0107,0.0104,0.0105
34,Statistics,55.0,15.0,140.0,140.0,10.0,205.0,100.0,5.0,185.0,...,42.0,33.0,12.0,31.0,0.0089,0.0072,0.005,0.0048,0.0049,0.005
35,,,,,,,,,,,...,,,,,,,,,,


In [30]:
# Convert into long format

def tolong(df):
    ldf = pd.melt(df, id_vars = ['Subject'], var_name = 'abc', value_name ='Count')
    ldf['Grades'] = ldf['abc'].apply(lambda x: (x.rsplit('_', 1))[0])
    ldf['Year'] = ldf.pop('abc').apply(lambda x: (x.split('_'))[-1])
    
    return ldf
    
ldf = tolong(dfc)

ldf.tail()

Unnamed: 0,Subject,Count,Grades,Year
1723,Physics,0.0396,perAs,2016
1724,"Religious, Moral and Philosophical Studies",0.0043,perAs,2016
1725,Spanish,0.0105,perAs,2016
1726,Statistics,0.005,perAs,2016
1727,,,perAs,2016


In [31]:
ldf = ldf[(ldf.Grades == 'perAs')]
ldf.dropna(inplace=True)

ldf.tail()

Unnamed: 0,Subject,Count,Grades,Year
1722,Physical Education,0.007,perAs,2016
1723,Physics,0.0396,perAs,2016
1724,"Religious, Moral and Philosophical Studies",0.0043,perAs,2016
1725,Spanish,0.0105,perAs,2016
1726,Statistics,0.005,perAs,2016


In [32]:
ldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 1512 to 1726
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Subject  203 non-null    object 
 1   Count    203 non-null    float64
 2   Grades   203 non-null    object 
 3   Year     203 non-null    object 
dtypes: float64(1), object(3)
memory usage: 7.9+ KB


In [33]:
# Plot graph

# Create a list of subjects in alphabetical order
subjects = sorted(ldf['Subject'].unique())
years = sorted(ldf['Year'].unique())

fig = px.line(ldf, x = "Year", y = "Count", 
                color = "Subject",
                facet_row = 'Grades',
                width = 800,
                height = 800,
                category_orders = {'Subject' : subjects, 'Year' : years},
                labels = {"Grades": "Category"},
                title = 'Total As per total students subject from 2016 - 2021 at ' + level_label
                )

fig.update_traces(mode = "markers+lines")

fig.update_yaxes(matches=None, rangemode = "tozero")

fig.update_traces(mode = "markers+lines")

filename = 'As_per_cohort_' + level + '.html'

fig.write_html('./graphs/' + filename)

fig.show()

In [34]:
# Setup Chart Studio
username = '###' # your username
api_key = '###' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Upload to Chart Studio
#py.plot(fig, filename = filename, auto_open=True)

'https://plotly.com/~mrcrookes/97/'