In [59]:
# Import key libraries

from pathlib import Path

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [60]:
# Create the empty accumulator frame; the loading cells below add every
# level/year table to it in long format (subject, grade, count, year, level)
dft = pd.DataFrame()

In [61]:
def read_sheet(year, level, sheets):
    """Read one table from a results workbook, starting at its header row.

    The workbook path is built as './even_more_data/ASR<year>_<level>.xls'.
    The first sheet whose lower-cased name contains any of the substrings in
    `sheets` is used. The header row is the first row whose first-column text
    matches the regular expression held in the module-level variable
    `header` (case-insensitive); everything up to and including that row's
    position in the initial read is skipped on the second read.

    NOTE: `header` is read from the notebook's global namespace, so the
    calling cell must define it before calling this function.

    Parameters
    ----------
    year : str
        Year as it appears in the file name, e.g. '2021'.
    level : str
        Level as it appears in the file name, e.g. 'Higher'.
    sheets : list of str
        Lower-case substrings to look for in the sheet names.

    Returns
    -------
    pandas.DataFrame
        The sheet contents read with the detected header row as column names.

    Raises
    ------
    IndexError
        If no sheet name matches `sheets`, or no row matches `header`.
    """
    path = './even_more_data/ASR' + year + '_' + level + '.xls'

    # Open the workbook once and reuse it for the sheet list and both reads
    with pd.ExcelFile(path) as workbook:

        # Find correct sheetname, case insensitive search
        matches = [tab for tab in workbook.sheet_names
                   if any(key in tab.lower() for key in sheets)]
        if not matches:
            raise IndexError('No sheet matching ' + repr(sheets) + ' in ' + path
                             + ' (sheets: ' + repr(workbook.sheet_names) + ')')
        name = matches[0]

        # First pass: read the raw sheet to locate the header row
        preview = workbook.parse(sheet_name = name)
        hits = preview[preview.iloc[:, 0].str.match(header, na = False, case = False)].index
        if len(hits) == 0:
            raise IndexError('No header row matching ' + repr(header)
                             + ' in sheet ' + repr(name) + ' of ' + path)
        header_index = hits[0]

        # Second pass: read again, skipping the rows above the header
        df = workbook.parse(sheet_name = name, skiprows = header_index + 1)

    return df

def tidy_df(df, use_columns, footer):
    """Clean a raw results table and keep only the requested columns.

    Steps, in order: the placeholder values '***', '-' and ' -' become NaN;
    column labels are converted to lower-case strings; the first column is
    renamed to 'subject'; rows with no subject are dropped and the index is
    renumbered; the first row whose subject matches the `footer` regular
    expression (case-insensitive), and every row after it, is removed;
    dtypes are re-inferred; finally only `use_columns` are returned.

    The frame passed in is not modified.

    Raises IndexError when no row matches `footer`.
    """
    # Placeholder markers become NaN, one marker at a time
    for placeholder in ('***', '-', ' -'):
        df = df.replace(placeholder, np.nan)

    # Lower-case string labels, with the first column always called 'subject'
    df.columns = df.columns.astype(str).str.lower()
    df = df.rename(columns = {df.columns[0]: 'subject'})

    # Keep only rows that have a subject, renumbered from zero
    df = df.dropna(subset = ['subject']).reset_index(drop = True)

    # Locate the footer row and cut it off together with everything below it
    is_footer = df.iloc[:, 0].str.match(footer, na = False, case = False)
    footer_index = df[is_footer].index[0]
    df = df.drop(df.index[footer_index:])

    # Re-infer dtypes, then select the most useful columns
    return df.infer_objects()[use_columns]

def grade_count(df):
    """Convert the grade columns from percentages of entries to counts.

    For each of the columns 'a', 'b', 'c' and 'd' the value is replaced by
    value * entries // 100 (floor division, so counts are rounded down).
    A new column 'na' holds the entries left over after subtracting the four
    grade counts.

    The input frame is left untouched; a modified copy is returned. (The
    previous version wrote into the caller's frame, which is a column subset
    produced by `tidy_df` and so raised pandas' SettingWithCopyWarning.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'entries', 'a', 'b', 'c' and 'd'.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with converted grade columns and an extra 'na' column.
    """
    counts = df.copy()

    # Change columns from percentages to counts
    for grade in ('a', 'b', 'c', 'd'):
        counts[grade] = counts[grade] * counts['entries'] // 100

    # Add column with na count (chained subtraction keeps NaN propagation)
    counts['na'] = (counts['entries'] - counts['a'] - counts['b']
                    - counts['c'] - counts['d'])

    return counts

In [62]:
# Load National 5, Higher and Advanced Higher results.
# Each level/year workbook is read, tidied, converted from percentages to
# counts, reshaped to long format and added to the combined frame `dft`.

# Define level and years
levels = ['National_5', 'Higher', 'Advanced_Higher']

years_di = {'Higher' : ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014',
                            '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006'],
            
            'Advanced_Higher' : ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014',
                                    '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006'],
           
            'National_5': ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014']}


# Define sheetnames to grab and header, columns and footer to search for
# (`header` is read as a global by read_sheet)
sheets = ['table 4','4b']
header = 'title|subject'
use_columns = ['subject', 'entries', 'a', 'b', 'c', 'd']
footer = 'totals|total|subtotals'

# Collect the per-year tables and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every call.
frames = []

for level in levels:
    
    years = years_di[level]
    
    for year in years:
        
        df = read_sheet(year, level, sheets)
        
        df = tidy_df(df, use_columns, footer)
        
        df = grade_count(df)
        
        # Change into long format
        dfl = pd.melt(df, id_vars =['subject'], 
                value_vars = ['entries', 'a', 'b', 'c', 'd', 'na'],
                        var_name ='grade', value_name ='count')
        
        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level
        
        frames.append(dfl)

# Add to main df
dft = pd.concat([dft, *frames], ignore_index = True)

In [63]:
# Load Intermediate 1 and Intermediate 2 results.
# Rows are split on whether 'passes in ungraded courses' is filled in:
# graded courses keep the a-d columns, ungraded courses keep a single
# 'passes' column. Both are reshaped to long format and added to `dft`.

# Define level and years
levels = ['Intermediate2', 'Intermediate1']
years = ['2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006']

# Define sheetnames to grab and header to search for
# (`header` is read as a global by read_sheet)
sheets = ['4a']
header = 'title|subject'
use_columns = ['subject', 'entries', 'a', 'b', 'c', 'd', 'passes in ungraded courses']
footer = 'totals|total|subtotals'

# Collect the per-year tables and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0.
frames = []

for level in levels:
    
    for year in years:
        
        df = read_sheet(year, level, sheets)
        
        df = tidy_df(df, use_columns, footer)
        
        is_ungraded = pd.notnull(df['passes in ungraded courses'])
        
        ### Look at graded courses
        
        # Keep rows without an ungraded-pass figure
        graded = df[~is_ungraded].copy()
        
        # Add column with na count
        # NOTE(review): unlike the National 5 / Higher cell, grade_count is not
        # applied here, so a-d are assumed to already be counts - confirm
        graded['na'] = (graded['entries'] - graded['a'] - graded['b']
                        - graded['c'] - graded['d'])
        
        # Change into long format
        dfl = pd.melt(graded, id_vars =['subject'], 
                value_vars = ['entries', 'a', 'b', 'c', 'd', 'na'],
                        var_name ='grade', value_name ='count')
        
        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level
        
        frames.append(dfl)
        
        
        ### Look at ungraded courses
        
        # Keep rows with an ungraded-pass figure and shorten the column name
        ungraded = (df[is_ungraded].copy()
                    .rename(columns = {'passes in ungraded courses': 'passes'}))
        
        # Add column with na count (both operands from the same subset)
        ungraded['na'] = ungraded['entries'] - ungraded['passes']
        
        # Change into long format
        dfl = pd.melt(ungraded, id_vars =['subject'], 
                value_vars = ['entries', 'passes', 'na'],
                        var_name ='grade', value_name ='count')
        
        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level
        
        frames.append(dfl)

# Add to main df
dft = pd.concat([dft, *frames], ignore_index = True)

In [64]:
# Load National 4 results.
# Entries come from sheet 'TABLE 1' and passes from sheet 'TABLE 2'; in both
# only the column named after the workbook's own year is kept. A third set of
# rows with grade 'na' holds entries minus passes.

# Define level and years
levels = ['National_4']
years = ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014']

# Define sheetnames to grab and header to search for
sheet1 = 'TABLE 1'
sheet2 = 'TABLE 2'
header = 'TITLE'
footer = 'Total'

# Collect the per-year tables and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0.
frames = []

for level in levels:
    
    for year in years:
        
        use_columns = ['subject', year]
        path = './even_more_data/ASR' + year + '_' + level + '.xls'
        
        # Read both sheets the same way, keyed by the grade label they supply
        tables = {}
        for sheet, grade in ((sheet1, 'entries'), (sheet2, 'passes')):
            
            # read in xls
            raw = pd.read_excel(path, sheet_name = sheet)
            
            # Look for header index
            header_index = raw[raw.iloc[:, 0].str.match(header, na = False, case = False)].index[0]
            
            # Read in sheet skipping the correct number of rows
            table = pd.read_excel(path, sheet_name = sheet, skiprows = header_index + 1)
            
            # Tidy up
            table = tidy_df(table, use_columns, footer)
            
            # Sort headers, then add in year, level and grade columns
            tables[grade] = (table
                             .set_axis(['subject', 'count'], axis = 1)
                             .assign(year = int(year), level = level, grade = grade))
        
        entries = tables['entries']
        passes = tables['passes']
        
        ### Calculate NAs
        
        # New frame (not an alias of `entries`) with grade 'na' and count set
        # to the difference between entries and passes.
        # NOTE(review): the subtraction aligns rows by position (both indexes
        # were reset in tidy_df), not by subject - assumes both sheets list
        # the same subjects in the same order; confirm.
        missing = entries.assign(grade = 'na',
                                 count = entries['count'] - passes['count'])
        
        frames.extend([entries, passes, missing])

# Add to main df
dft = pd.concat([dft, *frames], ignore_index = True)

In [65]:
# Load Standard Grade results.
# Grades are the numbered columns '1'-'6'; column '7' is re-labelled 'na'
# before the table is reshaped to long format and added to `dft`.

# Define level and years
levels = ['Standard_Grade']
years = ['2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006']

# Define sheetnames to grab and header to search for
# (`header` is read as a global by read_sheet)
sheets = ['sg3']
use_columns = ['subject', 'entries', '1', '2', '3', '4', '5', '6', '7']
header = 'title|subject'
footer = 'subtotal'

# Collect the per-year tables and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0.
frames = []

for level in levels:
    
    for year in years:
        
        df = read_sheet(year, level, sheets)
        
        df = tidy_df(df, use_columns, footer)
        
        # Grade 7 is reported as 'na'; assign() returns a new frame, so the
        # column subset returned by tidy_df is not written to in place
        df = df.assign(na = df['7'])
        
        # Change into long format
        dfl = pd.melt(df, id_vars =['subject'], 
                value_vars = ['entries', '1', '2', '3', '4', '5', '6', 'na'],
                        var_name ='grade', value_name ='count')
        
        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level
        
        frames.append(dfl)

# Add to main df
dft = pd.concat([dft, *frames], ignore_index = True)

# Idea kept for later (not applied): a rough conversion to N5 grades would be
#   a = '1' + '2',  b = '3' + '4',  c = '5',  d = '6',  na = '7'

In [68]:
def tidy_up(df, drop_excluded = False):
    """Normalise subject names and optionally drop awkward subjects.

    Values in the 'subject' column are replaced according to a fixed mapping
    from old or variant course names to one current name. Replacement is a
    single pass, so every variant has to map straight to its final name;
    'Practical Cookery**' therefore maps to 'Hospitality: Practical Cookery'
    like the other Practical Cookery variants (it used to map to
    'Practical Cookery', which left it as a separate subject).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'subject' column. It is not modified.
    drop_excluded : bool, default False
        When True, also remove the revised science courses, the courses
        listed as not fitting the graphs well, and the individual Chinese
        language courses. The default keeps every row, as before.

    Returns
    -------
    pandas.DataFrame
        New frame with cleaned subject names.
    """

    # Update names
    di = {'Home Economics: Fashion and Textile Technology': 'Fashion and Textile Technology',
            'Home Economics: Health and Food Technology': 'Health and Food Technology',
            'Home Economics: Lifestyle and Consumer Technology': 'Lifestyle and Consumer Technology',
            'Administration': 'Administration and IT',
            'Computing': 'Computing Science',
            'Dance Practice': 'Dance',
            'Managing Environmental Resources': 'Environmental Science',
            'Media': 'Media Studies',
            'Product Design': 'Design and Manufacture',
            'Art and Design: Design': 'Art and Design (Design)',
            'Art and Design: Expressive': 'Art and Design (Expressive)',
            'Art and Design Enquiry: Design': 'Art and Design (Design)',
            'Art and Design Enquiry: Expressive': 'Art and Design (Expressive)',
            'Gaidhlig': 'Gàidhlig',
            'Accounting*': 'Accounting & Finance',
            'Accounting': 'Accounting & Finance',
            'Art & Design': 'Art and Design',
            'Religious Studies': 'Religious, Moral and Philosophical Studies',
            'Computing Studies': 'Computing Science',
            'Home Economics': 'Health and Food Technology',
            'Photography for the Media': 'Photography',
            'Psychology (New)': 'Psychology', 
            'Computing (New)': 'Computing Science',
            'Computing Studies (New)': 'Computing Science',
            'Mental Health Care (New)': 'Mental Health Care',
            'Retail Travel (New)': 'Retail Travel',
            'Travel and Tourism*': 'Travel and Tourism',
            'Social and Vocational Skills' : 'Social & Vocational Skills',
            'Practical Cake Craft': 'Hospitality: Practical Cake Craft',
            'Practical Cookery': 'Hospitality: Practical Cookery',
            'Craft and Design': 'Craft & Design',
            'Care Issues for Society : Child Care': 'Care Issues for Society: Child Care',
            'Geography (New)': 'Geography',
            'Hospitality - Practical Cookery': 'Hospitality: Practical Cookery',
            'Hospitality - Reception and Accommodation Operations': 
            'Hospitality: Reception and Accommodation Operation',
            'Hospitality - Professional Cookery': 'Hospitality: Professional Cookery',
            'Hospitality - General Operations': 'Hospitality: General Operations',
            'Woodworking Skills': 'Practical Woodworking',
            'Applied Practical Electronics': 'Practical Electronics',
            'Practical Cookery**': 'Hospitality: Practical Cookery',
            'Applications of Mathematics*': 'Applications of Mathematics',
            'Creative Cake Production': 'Hospitality: Practical Cake Craft',
            'Accounting and Finance': 'Accounting & Finance'}
    
    df = df.replace({"subject": di})
    
    # Revised science courses
    revised_sciences = ['Biology (Revised)', 'Chemistry (Revised)', 'Physics (Revised)',
                        'Human Biology (Revised)']
    
    # Other courses that don't fit into graphs too well
    poorly_fitting = ['Religious, Moral and Philosophical Studies (New)', 'Selling Scheduled Air Travel',
                 'Building Construction', 'Construction', 'Art and Design: Research and Appreciation',
                    'Architectural Technology', 'Beauty', 'Health and Social Care',
                    'Personal Development', 'Professional Patisserie',
                    'Fitness and Exercise', 'Sports Coaching Studies', 'Design',
                    'Fabrication and Welding Engineering', 'Visual Arts',
                    'Hospitality - Food and Drink Service',
                    'Personal and Social Education', 'Sports Organisation',
                    'Building and Architectural Technology', 'Building Services',
                    'Civil Engineering', 'Early Years Care and Education',
                    'Food Production Supervision', 'Structural Engineering',
                    'Advertising, Marketing and Public Relations',
                    'Interviewing, Writing and Publishing',
                    'Construction Craft Skills',
                    'Construction Industry Practice']
    
    # TO DO - Add Religious, Moral and Philosophical Studies (New) to Religious, 
                # Moral and Philosophical Studies in 2006
    
    # TO DO - Add these up into Chinese languages category, drop for now
    chinese_languages = ['Mandarin (Simplified)', 'Mandarin (Traditional)', 'Cantonese']
    
    # Filtering is opt-in so the default output still contains every subject
    if drop_excluded:
        excluded = revised_sciences + poorly_fitting + chinese_languages
        df = df.loc[~df['subject'].isin(excluded)]
    
    return df

# Apply the subject-name clean-up to the combined table
dft = tidy_up(dft)

In [69]:
# Build sorted lists of the distinct subjects and years in the combined table
subjects = list(dft['subject'].unique())
subjects.sort()

years = list(dft['year'].unique())
years.sort()

print(subjects)

['Accounting & Finance', 'Administration and IT', 'Advertising, Marketing and Public Relations', 'Applications of Mathematics', 'Applied Mathematics', 'Architectural Technology', 'Art and Design', 'Art and Design (Design)', 'Art and Design (Expressive)', 'Art and Design: Research and Appreciation', 'Automotive Skills', 'Beauty', 'Biology', 'Biology (Revised)', 'Biotechnology', 'Building Construction', 'Building Services', 'Building and Architectural Technology', 'Business', 'Business Management', 'Cantonese', 'Care', 'Care Issues for Society: Child Care', 'Care Issues for Society: Older People', 'Care Practice', 'Chemistry', 'Chemistry (Revised)', 'Childcare and Development', 'Chinese Languages', 'Civil Engineering', 'Classical Greek', 'Classical Studies', 'Computing Science', 'Construction', 'Construction Craft Skills', 'Construction Crafts', 'Construction Industry Practice', 'Contemporary Social Studies', 'Craft & Design', 'Creative Digital Media', 'Creative Industries', 'Dance', 'De

In [70]:
# Summarise column dtypes and non-null counts of the combined table
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20083 entries, 0 to 20082
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   subject  20083 non-null  object 
 1   grade    20083 non-null  object 
 2   count    19832 non-null  float64
 3   year     20083 non-null  int64  
 4   level    20083 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 784.6+ KB


In [71]:
# Plot graph: one line per subject, faceted by grade (columns) and level (rows),
# saved as a standalone HTML file and shown inline.

# Order of the level facet rows.
# NOTE(review): the loading cells also add 'National_4' and 'Standard_Grade'
# rows, which are not listed here - confirm where those rows should appear.
levels = ['Advanced_Higher', 'Higher', 'National_5', 'Intermediate2', 'Intermediate1']

fig = px.line(dft, x = "year", y = ["count"], 
                color = "subject",
                facet_col = 'grade',
                facet_row = 'level',
                facet_row_spacing = 0.02, 
                facet_col_spacing = 0.01, 
                width = 2000,
                height = 1500,
                category_orders = {'subject' : subjects, 
                                       'year' : years,
                                          'level' : levels},
                labels = {"value": "Count",
                             "subject": "Subject",
                                  "level": "Level",
                                     "grade": "Grade",
                                         "year": "Year"},
                title = 'Total entries and grades per subject  2006 - 2021'
                )

fig.update_yaxes(rangemode = "tozero")

fig.update_xaxes(tickangle = 60)

# Optional (disabled): give each facet row its own shared y-axis by pointing
# every axis in a row at that row's first axis, e.g.
#     fig.layout.yaxis2.matches = 'y'     # yaxis  .. yaxis6  -> 'y'
#     fig.layout.yaxis8.matches = 'y7'    # yaxis7 .. yaxis12 -> 'y7'
# and so on in steps of six ('y13', 'y19', 'y25', 'y31', 'y37').

fig.update_traces(mode = "markers+lines")

filename = 'Total_students_details'

# Create the output folder if needed so write_html does not fail on a fresh checkout
graph_dir = Path('./graphs')
graph_dir.mkdir(parents = True, exist_ok = True)

fig.write_html(str(graph_dir / (filename + '.html')))

fig.show()

In [42]:
# Graph too big to upload!!! :)

In [34]:
# Write the combined table to CSV, creating the output folder if needed so
# the export does not fail on a fresh checkout
csv_dir = Path('./csvs')
csv_dir.mkdir(parents = True, exist_ok = True)

dft.to_csv(csv_dir / '2006_2021_data.csv')