In [35]:
# Import key libraries

import os

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [36]:
# Setup Chart Studio
# Credentials are read from the environment so that the API key is never stored
# in (or committed with) the notebook. Any key that was previously pasted here
# should be treated as leaked and regenerated (profile > settings > regenerate key).
username = os.environ.get('CHART_STUDIO_USERNAME', 'mrcrookes') # your username
api_key = os.environ.get('CHART_STUDIO_API_KEY') # your api key - never paste it into the notebook

# Only (re)write the local credentials file when a key was supplied; otherwise
# Chart Studio falls back to whatever credentials file already exists.
if api_key:
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [37]:
# Qualification level to analyse; it is also used as the worksheet name below
level = 'National_5'
level_label = ' '.join(level.split('_'))

# Load the sheet for this level from the independent and education-authority
# workbooks, skipping the first two rows of each sheet
sheet_options = {'engine': "odf", 'sheet_name': level, 'skiprows': 2}

ind_df = pd.read_excel("attainment-statistics-(august)-2022-provisional-centre-independent.ods",
                       **sheet_options)

state_df = pd.read_excel("attainment-statistics-(august)-2022-provisional-centre-education-authority.ods",
                         **sheet_options)

In [38]:
def tidy_up_1(df):
    """First cleaning pass over a raw attainment sheet.

    Returns a new DataFrame; the frame passed in is left unmodified.

    Steps:
      1. Replace spaces in column names with underscores.
      2. Turn the placeholder markers '[z]', '[c]' and '[low]' into NaN.
      3. Drop the last row (presumably a footer/total row in the sheet - confirm
         against the source workbook).
      4. Drop every column whose name contains 'Percentage'.

    Parameters
    ----------
    df : pandas.DataFrame
        Sheet as loaded by ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.
    """
    # Rename on a copy rather than assigning to df.columns, which would
    # silently change the caller's DataFrame as a side effect
    tidy = df.rename(
        columns=lambda name: name.replace(' ', '_') if isinstance(name, str) else name
    )

    # Placeholder markers become missing values (exact cell matches only)
    tidy = tidy.replace(['[z]', '[c]', '[low]'], np.nan)

    # Remove last row (positional, so an empty frame does not raise)
    tidy = tidy.iloc[:-1]

    # Remove columns with Percentage in the heading
    tidy = tidy.drop(columns=tidy.filter(regex='Percentage').columns)

    return tidy
    
# Run the first cleaning pass on both datasets
ind_df, state_df = (tidy_up_1(frame) for frame in (ind_df, state_df))

In [39]:
def tidy_up_2(df, years=('2022', '2021', '2020', '2019', '2018')):
    """Convert cumulative grade counts into per-grade counts.

    For every year the cumulative columns (A, A-B, A-C, A-D) are shortened to
    'As_', 'A-Bs_', 'A-Cs_', 'A-Ds_' (and 'No_Award_Count_' to 'NAs_'), the
    individual B, C and D counts are derived by differencing neighbouring
    cumulative columns, and the cumulative A-B / A-C / A-D columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns such as 'Grade_A_Count_<year>' and
        'Grades_A-B_Count_<year>'. It is not modified.
    years : iterable of str, optional
        Year suffixes to process. Defaults to 2022 back to 2018.

    Returns
    -------
    pandas.DataFrame
        New frame with 'As_', 'NAs_', 'Bs_', 'Cs_' and 'Ds_' columns per year;
        the derived B/C/D columns are appended at the end, year by year.

    Raises
    ------
    KeyError
        If a cumulative column for one of ``years`` is missing.
    """
    years = list(years)

    # Long source heading prefix -> short label prefix
    short_prefixes = {
        'Grade_A_Count_': 'As_',
        'Grades_A-B_Count_': 'A-Bs_',
        'Grades_A-C_Count_': 'A-Cs_',
        'Grades_A-D_Count_': 'A-Ds_',
        'No_Award_Count_': 'NAs_',
    }

    # Sort out column titles in a single rename (also yields a new frame, so
    # the column assignments below never touch the caller's data)
    renames = {
        old + year: new + year
        for year in years
        for old, new in short_prefixes.items()
    }
    df = df.rename(columns=renames)

    # Calculate number of Bs, Cs and Ds from the cumulative counts
    for year in years:
        df['Bs_' + year] = df['A-Bs_' + year] - df['As_' + year]
        df['Cs_' + year] = df['A-Cs_' + year] - df['A-Bs_' + year]
        df['Ds_' + year] = df['A-Ds_' + year] - df['A-Cs_' + year]

    # Remove the cumulative columns now that the per-grade counts exist
    cumulative = [
        prefix + year
        for year in years
        for prefix in ('A-Bs_', 'A-Cs_', 'A-Ds_')
    ]
    return df.drop(columns=cumulative)
        
# Run the second cleaning pass (per-grade counts) on both datasets
ind_df, state_df = (tidy_up_2(frame) for frame in (ind_df, state_df))

In [40]:
# Preview the first rows of the tidied independent-school table
ind_df.head()

Unnamed: 0,Subject,As_2022,NAs_2022,Entries_2022,As_2021,NAs_2021,Entries_2021,As_2020,NAs_2020,Entries_2020,...,Ds_2021,Bs_2020,Cs_2020,Ds_2020,Bs_2019,Cs_2019,Ds_2019,Bs_2018,Cs_2018,Ds_2018
0,Accounting,15.0,,20.0,25.0,0.0,35,30.0,0.0,35.0,...,0.0,5.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,0.0
1,Administration and IT,10.0,,30.0,20.0,0.0,35,10.0,0.0,25.0,...,0.0,10.0,5.0,0.0,10.0,5.0,5.0,10.0,5.0,5.0
2,Applications of Mathematics,75.0,5.0,145.0,75.0,15.0,155,75.0,5.0,135.0,...,5.0,20.0,25.0,10.0,20.0,5.0,10.0,5.0,15.0,5.0
3,Art and Design,405.0,,535.0,370.0,,460,395.0,,520.0,...,0.0,95.0,25.0,5.0,85.0,30.0,10.0,105.0,35.0,10.0
4,Biology,1110.0,35.0,1630.0,1100.0,40.0,1555,1030.0,10.0,1560.0,...,55.0,295.0,190.0,35.0,265.0,150.0,80.0,280.0,145.0,70.0


In [41]:
# Convert into long format

def tolong(df):
    """Reshape a wide table into one row per (Subject, Grades, Year).

    Every column other than 'Subject' is expected to be named
    '<grade label>_<year>'. The part before the last underscore becomes
    'Grades', the part after it becomes 'Year' (kept as a string), and the
    cell value becomes 'Count'.

    Returns a new DataFrame with columns Subject, Count, Grades, Year.
    """
    long_df = df.melt(id_vars=['Subject'], var_name='abc', value_name='Count')

    # Take the combined '<grade>_<year>' labels out of the frame and split them
    labels = long_df.pop('abc')
    long_df['Grades'] = [label.rsplit('_', 1)[0] for label in labels]
    long_df['Year'] = [label.split('_')[-1] for label in labels]

    return long_df
    
# Reshape both tidied tables into long format
ind_ldf, state_ldf = (tolong(frame) for frame in (ind_df, state_df))

# Preview the long-format state table
state_ldf.head(n=5)

Unnamed: 0,Subject,Count,Grades,Year
0,Accounting,425.0,As,2022
1,Administration and IT,1665.0,As,2022
2,Applications of Mathematics,3350.0,As,2022
3,Art and Design,4075.0,As,2022
4,Biology,6220.0,As,2022


In [42]:
# Merge the state and independent long tables on their shared keys.
# The default inner join keeps only Subject/Grades/Year combinations that
# appear in both tables.
dfc = pd.merge(state_ldf, ind_ldf, on = ['Subject', 'Grades', 'Year'])

# Select and reorder columns (Count_x = state count, Count_y = independent count).
# .copy() makes dfc an independent frame so the column assignment below cannot
# trigger a SettingWithCopyWarning.
cols = ['Subject', 'Year', 'Grades', 'Count_x', 'Count_y']
dfc = dfc[cols].copy()

# Calculate the share of each grade awarded in independent schools.
# NOTE: 'perc' is a fraction between 0 and 1, not a value out of 100.
dfc['perc'] = (dfc['Count_y'] / (dfc['Count_x'] + dfc['Count_y'])).round(3)

# Tidy up data: keep A grades only, drop zero shares, then drop rows with
# any missing value
dfc = dfc[(dfc['Grades'] == 'As') & (dfc['perc'] != 0)].dropna()

dfc.head(10)

Unnamed: 0,Subject,Year,Grades,Count_x,Count_y,perc
0,Accounting,2022,As,425.0,15.0,0.034
1,Administration and IT,2022,As,1665.0,10.0,0.006
2,Applications of Mathematics,2022,As,3350.0,75.0,0.022
3,Art and Design,2022,As,4075.0,405.0,0.09
4,Biology,2022,As,6220.0,1110.0,0.151
5,Business Management,2022,As,3165.0,485.0,0.133
6,Chemistry,2022,As,5545.0,1075.0,0.162
7,Chinese Languages,2022,As,95.0,95.0,0.5
8,Classical Studies,2022,As,50.0,45.0,0.474
9,Computing Science,2022,As,2195.0,430.0,0.164


In [43]:
# Summarise the merged table: row count, column dtypes and non-null counts
dfc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 596
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Subject  192 non-null    object 
 1   Year     192 non-null    object 
 2   Grades   192 non-null    object 
 3   Count_x  192 non-null    float64
 4   Count_y  192 non-null    float64
 5   perc     192 non-null    float64
dtypes: float64(3), object(3)
memory usage: 10.5+ KB


In [44]:
# Plot graph

# Category orders: subjects alphabetically, years ascending
subjects = sorted(dfc['Subject'].unique())
years = sorted(dfc['Year'].unique())

fig = px.line(dfc, x = "Subject", y = "perc",
                color = "Year",
                width = 900,
                height = 800,
                category_orders = {'Subject' : subjects, 'Year' : years},
                labels = {"perc": "Percentage"},
                title = 'Percentage of As from independent Schools per subject at ' + level_label
                )

# 'perc' holds fractions (0-1); format the axis as percentages so the
# "Percentage" label matches the values shown
fig.update_yaxes(rangemode = "tozero", tickformat = ".1%")

# Make sure the output folder exists before writing the standalone HTML file
os.makedirs('./graphs', exist_ok=True)
fig.write_html('./graphs/ind_perc_' + level + '.html')

fig.show()

In [45]:
# Publish the figure to Chart Studio under a level-specific name and open it
# in the browser; the call is the cell's last expression so its result is displayed
upload_name = 'Ind_As_percentage_line_' + level
py.plot(fig, filename=upload_name, auto_open=True)

# To get HTML embed code for a published chart:
#tls.get_embed('https://plotly.com/~mrcrookes/14/') #change to your url

'https://plotly.com/~mrcrookes/53/'