In [1]:
# Import key libraries

import os

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [2]:
# Start from an empty frame; the loading cells below add their rows to it
dft = pd.DataFrame(data = None)

In [3]:
# Define levels and years to load
levels = ['Higher', 'Advanced_Higher']
years = ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014',
            '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006']

# Substrings that identify the sheet to read, and the regex that marks its header row
sheets = ['table 4','4b']
header = 'title|subject'

# Columns that are converted from percentages to counts
grade_cols = ['a', 'b', 'c', 'd']

# One long-format frame per workbook; they are concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies every earlier row on each iteration)
frames = []

for level in levels:

    for year in years:

        path = './even_more_data/ASR' + year + '_' + level + '.xls'

        # Open the workbook once; the context manager closes the file handle
        with pd.ExcelFile(path) as xls:
            # Find the first sheet whose lower-cased name contains one of the substrings
            matches = [s for s in xls.sheet_names
                       if any(key in s.lower() for key in sheets)]
            if not matches:
                raise ValueError('No sheet matching ' + str(sheets) + ' in ' + path)
            name = matches[0]

            # read in xls
            dfs = xls.parse(sheet_name = name)

        # Search the first column for the header text and get that row's index
        header_index = dfs[dfs.iloc[:, 0].str.match(header, na = False, case = False)].index[0]
        # Grab that row to use as the header
        new_header = dfs.iloc[header_index]
        # Take the data below the header row
        dfs = dfs[header_index + 1:]
        # Set the header row as the df header
        dfs.columns = new_header.values

        # Replace placeholder values with NaN
        dfs = dfs.replace('***', np.nan)
        dfs = dfs.replace('-', np.nan)
        dfs = dfs.replace(' -', np.nan)
        # Change columns to lower case
        dfs.columns = dfs.columns.str.lower()
        # Rename first column to subject (returns a new frame instead of mutating a slice)
        dfs = dfs.rename(columns = {dfs.columns[0]: 'subject'})
        # Drop rows with NaN in subject column
        dfs = dfs.dropna(subset = ['subject'])
        # Select most useful columns
        dfs = dfs[['subject', 'entries'] + grade_cols]
        # Remove last row
        dfs = dfs.drop(dfs.index[-1])

        # Update to recognise datatypes
        dfs = dfs.infer_objects()

        # Change columns from percentages to counts (floor division, as whole entries)
        for grade in grade_cols:
            dfs[grade] = dfs[grade] * dfs['entries'] // 100

        # Add column with na count: entries not accounted for by grades a-d
        dfs['na'] = dfs['entries'] - dfs['a'] - dfs['b'] - dfs['c'] - dfs['d']

        # Change into long format
        dfl = pd.melt(dfs, id_vars = ['subject'],
                      value_vars = ['entries'] + grade_cols + ['na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append everything to the main df in one go; empty frames are skipped so
# they cannot influence the dtypes of the result
pieces = [frame for frame in [dft, *frames] if not frame.empty]
if pieces:
    dft = pd.concat(pieces, ignore_index = True)

In [4]:
# Define levels and years to load
levels = ['National_5']
years = ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014']

# Substrings that identify the sheet to read, and the regex that marks its header row
sheets = ['table 4']
header = 'title|subject'

# Columns that are converted from percentages to counts
grade_cols = ['a', 'b', 'c', 'd']

# One long-format frame per workbook; they are concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies every earlier row on each iteration)
frames = []

for level in levels:

    for year in years:

        path = './even_more_data/ASR' + year + '_' + level + '.xls'

        # Open the workbook once; the context manager closes the file handle
        with pd.ExcelFile(path) as xls:
            # Find the first sheet whose lower-cased name contains one of the substrings
            matches = [s for s in xls.sheet_names
                       if any(key in s.lower() for key in sheets)]
            if not matches:
                raise ValueError('No sheet matching ' + str(sheets) + ' in ' + path)
            name = matches[0]

            # read in xls
            dfs = xls.parse(sheet_name = name)

        # Search the first column for the header text and get that row's index
        header_index = dfs[dfs.iloc[:, 0].str.match(header, na = False, case = False)].index[0]
        # Grab that row to use as the header
        new_header = dfs.iloc[header_index]
        # Take the data below the header row
        dfs = dfs[header_index + 1:]
        # Set the header row as the df header
        dfs.columns = new_header.values

        # Replace placeholder values with NaN
        dfs = dfs.replace('***', np.nan)
        dfs = dfs.replace('-', np.nan)
        dfs = dfs.replace(' -', np.nan)
        # Change columns to lower case
        dfs.columns = dfs.columns.str.lower()
        # Rename first column to subject (returns a new frame instead of mutating a slice)
        dfs = dfs.rename(columns = {dfs.columns[0]: 'subject'})
        # Drop rows with NaN in subject column
        dfs = dfs.dropna(subset = ['subject'])
        # Select most useful columns
        dfs = dfs[['subject', 'entries'] + grade_cols]
        # Remove last row
        dfs = dfs.drop(dfs.index[-1])

        # Update to recognise datatypes
        dfs = dfs.infer_objects()

        # Change columns from percentages to counts (floor division, as whole entries)
        for grade in grade_cols:
            dfs[grade] = dfs[grade] * dfs['entries'] // 100

        # Add column with na count: entries not accounted for by grades a-d
        dfs['na'] = dfs['entries'] - dfs['a'] - dfs['b'] - dfs['c'] - dfs['d']

        # Change into long format
        dfl = pd.melt(dfs, id_vars = ['subject'],
                      value_vars = ['entries'] + grade_cols + ['na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append everything to the main df in one go; empty frames are skipped so
# they cannot influence the dtypes of the result
pieces = [frame for frame in [dft, *frames] if not frame.empty]
if pieces:
    dft = pd.concat(pieces, ignore_index = True)

In [4]:
# Preview the first five rows of the combined long-format frame
dft.head(n = 5)

Unnamed: 0,subject,grade,count,year,level
0,Accounting,entries,1338.0,2021,Higher
1,Administration and IT,entries,4361.0,2021,Higher
2,Art and Design,entries,5714.0,2021,Higher
3,Biology,entries,7393.0,2021,Higher
4,Business Management,entries,8781.0,2021,Higher


In [5]:
def tidy_up(df):
    """Harmonise subject names and drop subjects that are not plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'subject' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the old or variant subject names listed below
        are replaced by a single canonical name, and rows for the excluded
        subjects are removed. Row order and the original index labels of the
        remaining rows are kept.
    """

    # Update names: old/variant name -> canonical name.
    # Replacement is a single pass, so every key must map directly to its
    # final name; a value that is itself a key is not replaced again.
    di = {'Home Economics: Fashion and Textile Technology': 'Fashion and Textile Technology',
            'Home Economics: Health and Food Technology': 'Health and Food Technology',
            'Home Economics: Lifestyle and Consumer Technology': 'Lifestyle and Consumer Technology',
            'Administration': 'Administration and IT',
            'Computing': 'Computing Science',
            'Dance Practice': 'Dance',
            'Managing Environmental Resources': 'Environmental Science',
            'Media': 'Media Studies',
            'Product Design': 'Design and Manufacture',
            'Art and Design: Design': 'Art and Design (Design)',
            'Art and Design: Expressive': 'Art and Design (Expressive)',
            'Art and Design Enquiry: Design': 'Art and Design (Design)',
            'Art and Design Enquiry: Expressive': 'Art and Design (Expressive)',
            'Gaidhlig': 'Gàidhlig',
            'Accounting*': 'Accounting & Finance',
            'Accounting': 'Accounting & Finance',
            'Art & Design': 'Art and Design',
            'Religious Studies': 'Religious, Moral and Philosophical Studies',
            'Computing Studies': 'Computing Science',
            'Home Economics': 'Health and Food Technology',
            'Photography for the Media': 'Photography',
            'Psychology (New)': 'Psychology', 
            'Computing (New)': 'Computing Science',
            'Computing Studies (New)': 'Computing Science',
            'Mental Health Care (New)': 'Mental Health Care',
            'Retail Travel (New)': 'Retail Travel',
            'Travel and Tourism*': 'Travel and Tourism',
            'Social and Vocational Skills' : 'Social & Vocational Skills',
            'Practical Cake Craft': 'Hospitality: Practical Cake Craft',
            'Practical Cookery': 'Hospitality: Practical Cookery',
            'Craft and Design': 'Craft & Design',
            'Care Issues for Society : Child Care': 'Care Issues for Society: Child Care',
            'Geography (New)': 'Geography',
            'Hospitality - Practical Cookery': 'Hospitality: Practical Cookery',
            'Hospitality - Reception and Accommodation Operations': 
                          'Hospitality: Reception and Accommodation Operation',
            'Hospitality - Professional Cookery': 'Hospitality: Professional Cookery',
            'Hospitality - General Operations': 'Hospitality: General Operations',
            'Woodworking Skills': 'Practical Woodworking',
            'Applied Practical Electronics': 'Practical Electronics',
            # Footnoted variant goes straight to the same canonical name as
            # plain 'Practical Cookery' so the subject is not split in two
            'Practical Cookery**': 'Hospitality: Practical Cookery',
            'Applications of Mathematics*': 'Applications of Mathematics',
            'Creative Cake Production': 'Hospitality: Practical Cake Craft'}
    
    df = df.replace({"subject": di})
    
    # Remove revised science courses
    revised = ['Biology (Revised)', 'Chemistry (Revised)', 'Physics (Revised)', 'Human Biology (Revised)']
    
    # Remove other courses that don't fit into graphs too well
    other = ['Religious, Moral and Philosophical Studies (New)', 'Selling Scheduled Air Travel',
                 'Building Construction', 'Construction', 'Art and Design: Research and Appreciation',
                    'Architectural Technology', 'Beauty', 'Health and Social Care',
                    'Personal Development', 'Professional Patisserie',
                    'Fitness and Exercise', 'Sports Coaching Studies', 'Design',
                    'Fabrication and Welding Engineering', 'Visual Arts',
                    'Hospitality - Food and Drink Service',
                    'Personal and Social Education', 'Sports Organisation',
                    'Building and Architectural Technology', 'Building Services',
                    'Civil Engineering', 'Early Years Care and Education',
                    'Food Production Supervision', 'Structural Engineering',
                    'Advertising, Marketing and Public Relations',
                    'Interviewing, Writing and Publishing',
                    'Construction Craft Skills',
                    'Construction Industry Practice']
    
    # TO DO - Add these up into Chinese languages category, drop for now
    chinese = ['Mandarin (Simplified)', 'Mandarin (Traditional)', 'Cantonese']
    
    # One membership test over the combined list drops the same rows as
    # filtering on each list in turn
    excluded = revised + other + chinese
    df = df.loc[~df['subject'].isin(excluded)]
    
    return df

# Run the subject clean-up over the combined frame
dft = dft.pipe(tidy_up)

In [6]:
#dft['subject'].unique()

In [16]:
# Plot graph

# Only these levels belong on this figure. dft also holds National 5 rows and
# category_orders only orders categories (it does not filter), so without this
# filter an extra facet row would be drawn and the per-row y-axis matching
# below would no longer line up with the grid.
levels = ['Higher', 'Advanced_Higher']
dfp = dft[dft['level'].isin(levels)]

# Create a list of subjects and year in correct order
subjects = sorted(dfp['subject'].unique())
years = sorted(dfp['year'].unique())

# Year range in the title is taken from the data so it cannot drift out of date
title = ('Total entries and grades per subject for Higher and Advanced Higher '
         + str(years[0]) + ' - ' + str(years[-1]))

fig = px.line(dfp, x = "year", y = ["count"], 
                color = "subject",
                facet_col = 'grade',
                facet_row = 'level',
                facet_row_spacing = 0.02, 
                facet_col_spacing = 0.01, 
                width = 1500,
                height = 800,
                category_orders = {'subject' : subjects, 
                                       'year' : years,
                                          'level' : levels},
                labels = {"value": "Count",
                             "subject": "Subject",
                                  "level": "Level",
                                     "grade": "Grade",
                                         "year": "Year"},
                title = title
                )

fig.update_yaxes(rangemode = "tozero")

fig.update_xaxes(tickangle = 60)

# Share the y-axis within each facet row but not across rows, so each level
# keeps its own scale. Axes are numbered consecutively through the grid, one
# row after another: with 6 grade columns, row 1 is yaxis..yaxis6 (all matched
# to 'y') and row 2 is yaxis7..yaxis12 (all matched to 'y7').
n_cols = dfp['grade'].nunique()
n_rows = dfp['level'].nunique()

for row in range(n_rows):
    first = row * n_cols + 1
    target = 'y' if first == 1 else 'y' + str(first)
    for number in range(first, first + n_cols):
        axis_name = 'yaxis' if number == 1 else 'yaxis' + str(number)
        getattr(fig.layout, axis_name).matches = target


fig.update_traces(mode = "markers+lines")

filename = 'total_students_details_higher_only'

fig.write_html('./graphs/' + filename + '.html')

fig.show()

In [17]:
# Setup Chart Studio
# Credentials are read from the environment so that no secret is stored in the
# notebook: set CHART_STUDIO_USERNAME and CHART_STUDIO_API_KEY before starting
# the kernel.
username = os.environ.get('CHART_STUDIO_USERNAME') # your username
api_key = os.environ.get('CHART_STUDIO_API_KEY') # your api key - go to profile > settings > regenerate key

# Only write the credentials file when both values are supplied; writing
# placeholder values would replace any working credentials already saved
if username and api_key:
    chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Upload to Chart Studio
#py.plot(fig, filename = filename, auto_open=True)