In [1]:
# Import key libraries

import pandas as pd
import numpy as np

import plotly.express as px

import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [2]:
def read_single_sheet(year, level, header, sheet_to_find):
    """Load one worksheet of an ASR workbook, skipping the rows above its header.

    The workbook read is './even_more_data/ASR<year>_<level>.xls'. The sheet is
    picked by exact comparison of ``sheet_to_find`` with the lower-cased sheet
    names, so ``sheet_to_find`` has to be given in lower case.

    Parameters
    ----------
    year, level : str
        Pieces of the workbook file name.
    header : str
        Regular expression tested (case-insensitively, from the start of each
        cell) against the first column to locate the header row.
    sheet_to_find : str
        Lower-case name of the worksheet.

    Returns
    -------
    pandas.DataFrame
        The sheet re-read with ``header_index + 1`` leading rows skipped.

    Raises
    ------
    ValueError
        If no sheet has that (lower-cased) name.
    IndexError
        If no first-column cell matches ``header``.
    """
    path = './even_more_data/ASR' + year + '_' + level + '.xls'

    # Position of the wanted sheet among the lower-cased sheet names
    lowered = [tab.lower() for tab in pd.ExcelFile(path).sheet_names]
    position = lowered.index(sheet_to_find)

    # First pass: read everything so the header row can be located
    preview = pd.read_excel(path, sheet_name = position)
    is_header = preview.iloc[:, 0].str.match(header, na = False, case = False)
    header_index = preview[is_header].index[0]

    # Second pass: skip everything up to and including the preview's header row
    return pd.read_excel(path, sheet_name = position, skiprows = header_index + 1)

def read_sheet(year, level, header, sheets_to_find):
    """Load the first worksheet whose name contains any of the given fragments.

    The workbook read is './even_more_data/ASR<year>_<level>.xls'. Sheet names
    are compared case-insensitively, and the first sheet (in workbook order)
    containing any fragment is used.

    Parameters
    ----------
    year, level : str
        Pieces of the workbook file name.
    header : str
        Regular expression tested (case-insensitively, from the start of each
        cell) against the first column to locate the header row.
    sheets_to_find : str or iterable of str
        Substring(s) to look for in the sheet names.

    Returns
    -------
    pandas.DataFrame
        The sheet re-read with ``header_index + 1`` leading rows skipped.

    Raises
    ------
    IndexError
        If no sheet name contains any fragment, or no first-column cell
        matches ``header``.
    """
    path = './even_more_data/ASR' + year + '_' + level + '.xls'

    # Close the workbook handle as soon as the sheet names have been read
    with pd.ExcelFile(path) as workbook:
        tabs = list(workbook.sheet_names)

    # A bare string would otherwise be iterated character by character
    if isinstance(sheets_to_find, str):
        sheets_to_find = [sheets_to_find]

    # Search with the caller's fragments (not a notebook-level global)
    wanted = [fragment.lower() for fragment in sheets_to_find]
    candidates = [tab for tab in tabs
                  if any(fragment in tab.lower() for fragment in wanted)]
    if not candidates:
        raise IndexError('no sheet in ' + path + ' matches any of ' + repr(wanted))
    name = candidates[0]

    # First pass: read everything so the header row can be located
    df = pd.read_excel(path, sheet_name = name)
    header_index = df[df.iloc[:, 0].str.match(header, na = False, case = False)].index[0]

    # Second pass: skip everything up to and including the preview's header row
    df = pd.read_excel(path, sheet_name = name, skiprows = header_index + 1)

    return df

def read_old_single_sheet(year, header, sheet_to_find):
    """Load one worksheet of an older 'Stats' workbook, skipping rows above the data header.

    The workbook read is './even_older_data/Stats<year>.xls'. The sheet is
    picked by exact comparison of ``sheet_to_find`` with the lower-cased sheet
    names.

    Parameters
    ----------
    year : str
        Year used in the workbook file name.
    header : str
        Regular expression tested (case-insensitively, from the start of each
        cell) against the first column to locate the header row.
    sheet_to_find : str
        Lower-case name of the worksheet.

    Returns
    -------
    pandas.DataFrame
        The sheet re-read with ``header_index + 2`` leading rows skipped, i.e.
        one row more than the newer-format reader skips.

    Raises
    ------
    ValueError
        If no sheet has that (lower-cased) name.
    IndexError
        If no first-column cell matches ``header``.
    """
    path = './even_older_data/Stats' + year + '.xls'

    # Position of the wanted sheet among the lower-cased sheet names
    lowered = [tab.lower() for tab in pd.ExcelFile(path).sheet_names]
    position = lowered.index(sheet_to_find)

    # First pass: read everything so the header row can be located
    preview = pd.read_excel(path, sheet_name = position)
    is_header = preview.iloc[:, 0].str.match(header, na = False, case = False)
    header_index = preview[is_header].index[0]

    # Second pass: start one row below the matched header row
    return pd.read_excel(path, sheet_name = position, skiprows = header_index + 2)

def select_old_columns(df):
    """Keep 'subject' plus the '.2'-suffixed entries/grade columns, dropping the suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'subject', 'entries.2' and '1.2' ... '7.2'.

    Returns
    -------
    pandas.DataFrame
        Columns 'subject', 'entries', '1' ... '7', in that order.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    # Final column name -> suffixed name it is taken from
    targets = ['entries', '1', '2', '3', '4', '5', '6', '7']
    di = {target + '.2': target for target in targets}

    ls = ['subject'] + list(di)
    return df[ls].rename(columns = di)

def tidy_df(df, footer):
    """Clean a freshly read sheet: normalise columns and drop blank and footer rows.

    Steps, in order: placeholder strings become NaN; column labels become
    lower-case strings; the first column is renamed 'subject'; rows without a
    subject are dropped and the index renumbered; the first row whose subject
    matches ``footer`` and every row after it are removed; a 'comp' column,
    if present, is renamed 'd'; dtypes are re-inferred.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet. It is not modified.
    footer : str
        Regular expression tested (case-insensitively, from the start of each
        cell) against the subject column to find the footer row.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    IndexError
        If no subject matches ``footer``.
    """
    # Placeholder strings used in the sheets are treated as missing values
    for placeholder in ('***', '-', ' -'):
        df = df.replace(placeholder, np.nan)

    # Lower-case string labels, with the first column always called 'subject'
    df.columns = df.columns.astype(str).str.lower()
    df = df.rename(columns = {df.columns[0]: 'subject'})

    # Discard rows without a subject and renumber from zero
    df = df.dropna(subset = ['subject']).reset_index(drop = True)

    # The index was just renumbered, so the footer's label is also its position
    is_footer = df.iloc[:, 0].str.match(footer, na = False, case = False)
    footer_index = df[is_footer].index[0]
    df = df.iloc[:footer_index]

    # rename() ignores labels that are absent, so no membership check is needed
    df = df.rename(columns = {'comp': 'd'})

    return df.infer_objects()

def tidy_old_df(df, footer):
    """Clean a sheet from the older workbooks: normalise columns, drop blank and footer rows.

    Steps, in order: placeholder strings become NaN; column labels become
    lower-case strings; the first column is renamed 'subject'; rows without a
    subject are dropped and the index renumbered; the first row whose subject
    matches ``footer`` and every row after it are removed; surrounding
    whitespace is stripped from the subjects; dtypes are re-inferred.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet. It is not modified.
    footer : str
        Regular expression tested (case-insensitively, from the start of each
        cell) against the not-yet-stripped subject column.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    IndexError
        If no subject matches ``footer``.
    """
    # Placeholder strings used in the sheets are treated as missing values
    for placeholder in ('***', '-', ' -'):
        df = df.replace(placeholder, np.nan)

    # Lower-case string labels, with the first column always called 'subject'
    df.columns = df.columns.astype(str).str.lower()
    df = df.rename(columns = {df.columns[0]: 'subject'})

    # Discard rows without a subject and renumber from zero
    df = df.dropna(subset = ['subject']).reset_index(drop = True)

    # Cut the table at the first footer row
    is_footer = df.iloc[:, 0].str.match(footer, na = False, case = False)
    footer_index = df[is_footer].index[0]
    df = df.drop(df.index[footer_index:])

    # Whitespace is stripped only after the footer search, as the search uses the raw text
    df = df.assign(subject = df['subject'].str.strip())

    return df.infer_objects()

def grade_count(df):
    """Turn the percentage columns 'a'-'d' into whole counts of entries.

    Each grade column is replaced by ``grade * entries // 100`` (floor
    division). The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'entries', 'a', 'b', 'c' and 'd'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for grade in ('a', 'b', 'c', 'd'):
        # Multiply first, then floor-divide, to keep the original rounding
        df[grade] = (df[grade] * df['entries']) // 100

    return df

def pass_fail(df):
    """Add 'na' (no award) and 'pass' counts derived from entries and grades a-d.

    Missing values are first replaced by zero throughout. 'na' is the entries
    left after subtracting a, b, c and d; 'pass' is entries minus 'na'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'entries', 'a', 'b', 'c' and 'd'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A zero-filled copy with the two extra columns.
    """
    filled = df.fillna(0)

    # Subtract the grades one at a time, in order a, b, c, d
    unawarded = filled['entries']
    for grade in ('a', 'b', 'c', 'd'):
        unawarded = unawarded - filled[grade]

    filled['na'] = unawarded
    filled['pass'] = filled['entries'] - filled['na']

    return filled

def pass_fail_no_d(df):
    """Add 'na' (no award) and 'pass' counts for data that has no 'd' grade.

    Missing values are first replaced by zero throughout. 'na' is the entries
    left after subtracting a, b and c; 'pass' is entries minus 'na'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'entries', 'a', 'b' and 'c'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A zero-filled copy with the two extra columns.
    """
    filled = df.fillna(0)

    # Subtract the grades one at a time, in order a, b, c
    unawarded = filled['entries']
    for grade in ('a', 'b', 'c'):
        unawarded = unawarded - filled[grade]

    filled['na'] = unawarded
    filled['pass'] = filled['entries'] - filled['na']

    return filled

# Function to combine different versions of the same subject (level)

def combine_subs_level(df, mainlevel, subs):
    """Fold counts recorded under alternative level labels into ``mainlevel``.

    Rows whose 'level' is one of ``subs`` are relabelled ``mainlevel``, summed
    per (subject, grade, year, level) and added to any existing ``mainlevel``
    row with the same key. Rows still carrying an alternative label are then
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with columns 'subject', 'grade', 'year', 'level'
        and 'count'. It is not modified.
    mainlevel : str
        Level label that receives the combined counts.
    subs : iterable of str
        Alternative level labels to merge into ``mainlevel``.

    Returns
    -------
    pandas.DataFrame
        Combined frame, with every missing value replaced by zero. When
        ``subs`` is empty a plain copy of ``df`` is returned.
    """
    keys = ['subject', 'grade', 'year', 'level']
    subs = list(subs)

    # Nothing to merge: merging against a column-less frame would raise KeyError
    if not subs:
        return df.copy()

    # Collect the rows of every alternative level (DataFrame.append no longer exists)
    dfs = pd.concat([df[df['level'] == alt] for alt in subs], ignore_index = True)
    dfs['level'] = mainlevel

    # Sum per key so two alternatives sharing a key yield one row; otherwise the
    # merge duplicates the main row and its own count is added twice
    dfs = dfs.groupby(keys, as_index = False)['count'].sum()

    # Outer merge keeps keys present on only one side; their missing count becomes 0
    dfu = pd.merge(df, dfs, how = 'outer', on = keys)
    dfu = dfu.fillna(0)

    # count_x is the frame's own count, count_y the relabelled total
    dfu['count'] = dfu['count_x'] + dfu['count_y']
    dfu = dfu.drop(['count_x', 'count_y'], axis = 1)

    # The alternative-level rows have been folded in, so drop them
    dfu = dfu.loc[~dfu['level'].isin(subs)]

    return dfu

# Function to combine different versions of the same subject (subject name)

def combine_subs_name(df, mainname, subs):
    """Fold counts recorded under alternative subject names into ``mainname``.

    Rows whose 'subject' is one of ``subs`` are renamed ``mainname``, summed
    per (subject, grade, year, level) and added to any existing ``mainname``
    row with the same key. Rows still carrying an alternative name are then
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with columns 'subject', 'grade', 'year', 'level'
        and 'count'. It is not modified.
    mainname : str
        Subject name that receives the combined counts.
    subs : iterable of str
        Alternative subject names to merge into ``mainname``.

    Returns
    -------
    pandas.DataFrame
        Combined frame, with every missing value replaced by zero. When
        ``subs`` is empty a plain copy of ``df`` is returned.
    """
    keys = ['subject', 'grade', 'year', 'level']
    subs = list(subs)

    # Nothing to merge: merging against a column-less frame would raise KeyError
    if not subs:
        return df.copy()

    # Collect the rows of every alternative name (DataFrame.append no longer exists)
    dfs = pd.concat([df[df['subject'] == alt] for alt in subs], ignore_index = True)
    dfs['subject'] = mainname

    # One total per key; a single groupby after collecting gives the same sums
    # as regrouping on every loop iteration
    dfs = dfs.groupby(keys, as_index = False)['count'].sum()

    # Outer merge keeps keys present on only one side; their missing count becomes 0
    dfu = pd.merge(df, dfs, how = 'outer', on = keys)
    dfu = dfu.fillna(0)

    # count_x is the frame's own count, count_y the renamed total
    dfu['count'] = dfu['count_x'] + dfu['count_y']
    dfu = dfu.drop(['count_x', 'count_y'], axis = 1)

    # The alternative-name rows have been folded in, so drop them
    dfu = dfu.loc[~dfu['subject'].isin(subs)]

    return dfu

In [3]:
# Create the empty master dataframe; the loading cells below each add their rows to it
dft = pd.DataFrame()

In [4]:
# Define level and years
levels = ['National_5', 'Higher', 'Advanced_Higher']

years = ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014']

# Define sheetnames to grab and header, columns and footer to search for
sheets = ['table 4','4b']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate grade counts from percentages
        df = grade_count(df)

        # Calculate pass and fails
        df = pass_fail(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [5]:
level = 'New_Higher'
year = '2015'

sheet = 'table 4'
header = 'title'
footer = 'total'

# Read and tidy data
df = read_single_sheet(year, level, header, sheet)
df = tidy_df(df, footer)

# Calculate grade counts from percentages
df = grade_count(df)

# Calculate pass and fails
df = pass_fail(df)

# Change into long format
dfl = pd.melt(df, id_vars = ['subject'],
              value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
              var_name = 'grade', value_name = 'count')

# Add in year and level columns
dfl['year'] = int(year)
dfl['level'] = level

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, dfl], ignore_index = True)

In [6]:
# Define level and years
levels = ['Intermediate2', 'Intermediate1']

years = ['2015', '2014']

# Define sheetnames to grab and header, columns and footer to search for
sheets = ['table 4','4b']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate grade counts from percentages
        df = grade_count(df)

        # Calculate pass and fails
        df = pass_fail(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [7]:
# Define level and years
levels = ['Intermediate2', 'Intermediate1', 'Higher', 'Advanced_Higher']
years = ['2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004']

# Define sheetnames to grab and header to search for
sheets = ['4a']
header = 'title|subject'
footer = 'totals|total|subtotals'

# Column that is only filled in for ungraded courses
ungraded_col = 'passes in ungraded courses'

# Long-format frames in the order they are produced; joined onto dft once at the end
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        has_ungraded = ungraded_col in df.columns.values

        ### Look at graded courses
        if has_ungraded:
            # Graded courses are the rows with nothing in the ungraded-passes column
            df1 = df[pd.isnull(df[ungraded_col])].copy()
        else:
            df1 = df

        # Calculate pass and fails (no grade_count call: the grade columns are used as read)
        df1 = pass_fail(df1)

        # Change into long format
        dfl = pd.melt(df1, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)


        ### Look at ungraded courses
        if has_ungraded:

            # Drop rows with grades
            df2 = df[pd.notnull(df[ungraded_col])].copy()

            # Sort column name
            df2 = df2.rename(columns = {ungraded_col: 'pass'})

            # Add column with na count (use df2's own entries rather than
            # relying on index alignment with the full frame)
            df2['na'] = df2['entries'] - df2['pass']

            # Change into long format
            dfl = pd.melt(df2, id_vars = ['subject'],
                          value_vars = ['entries', 'pass', 'na'],
                          var_name = 'grade', value_name = 'count')

            # Add in year and level columns
            dfl['year'] = int(year)
            dfl['level'] = level

            frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [8]:
# Define level and years
levels = ['Advanced_Higher']
years = ['2003', '2002', '2001']

# Define sheetnames to grab and header to search for
sheets = ['4a']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate pass and fails
        # NOTE(review): pass_fail subtracts a 'd' column, yet 'd' is left out of
        # the melt below - confirm whether 'd' should be kept or pass_fail_no_d used
        df = pass_fail(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [9]:
# Define level and years
levels = ['Intermediate1']
years = ['2003', '2002', '2001', '2000']

# Define sheetnames to grab and header to search for
sheets = ['4a']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate pass and fails (these sheets are processed without a 'd' grade)
        df = pass_fail_no_d(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [10]:
# Define level and years
levels = ['Intermediate2']
years = ['2003', '2002', '2001', '2000']

# Define sheetnames to grab and header to search for
sheets = ['4a']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate pass and fails
        df = pass_fail(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [11]:
# Define level and years
levels = ['Higher']
years = ['2003', '2002', '2001', '2000']

# Define sheetnames to grab and header to search for
sheets = ['3a']
header = 'title|subject'
footer = 'totals|total|subtotals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Calculate pass and fails
        df = pass_fail(df)

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [12]:
real_level = 'Old_Higher'
level = 'Standard_Grade' # Stats hidden away in SG document
years = ['2001', '2000']

sheet = 'hg3a'
header = 'subject'
footer = 'total|subtotals'

# One long-format frame per year; joined onto dft in a single concat below
frames = []

for year in years:

    # Read and tidy data (file is chosen by `level`, rows are labelled with `real_level`)
    df = read_single_sheet(year, level, header, sheet)
    df = tidy_old_df(df, footer)

    # Calculate pass and fails
    df = pass_fail(df)

    # Change into long format
    dfl = pd.melt(df, id_vars = ['subject'],
                  value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                  var_name = 'grade', value_name = 'count')

    # Add in year and level columns
    dfl['year'] = int(year)
    dfl['level'] = real_level

    frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [13]:
# Define level and years
levels = ['National_4']
years = ['2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014']

# Define sheetnames to grab and header to search for
sheet1 = 'table 1'
sheet2 = 'table 2'
header = 'TITLE'
footer = 'Total'

# Frames in the order entries, pass, na for each year; joined onto dft once at the end
frames = []

for level in levels:

    for year in years:

        ### read in first sheet (stored with grade 'entries')

        # Read and tidy data
        df1 = read_single_sheet(year, level, header, sheet1)
        df1 = tidy_df(df1, footer)

        # Select correct columns; copy so the assignments below act on an
        # independent frame instead of a slice
        df1 = df1[['subject', year]].copy()

        # Sort headers
        df1.columns = ['subject', 'count']

        # Add in year and level columns
        df1['year'] = int(year)
        df1['level'] = level
        df1['grade'] = 'entries'

        frames.append(df1)


        ### read in second sheet (stored with grade 'pass')

        # Read and tidy data
        df2 = read_single_sheet(year, level, header, sheet2)
        df2 = tidy_df(df2, footer)

        # Select correct columns
        df2 = df2[['subject', year]].copy()

        # Sort headers
        df2.columns = ['subject', 'count']

        # Add in year and level columns
        df2['year'] = int(year)
        df2['level'] = level
        df2['grade'] = 'pass'

        frames.append(df2)


        ### Calculate NAs

        # Work on a real copy so the queued 'entries' frame is left untouched
        df3 = df1.copy()

        # Change grade to na - represents no awards
        df3['grade'] = 'na'

        # Change count to difference between entries and passes
        # NOTE(review): rows are paired by index, which assumes both sheets list
        # the same subjects in the same order - confirm
        df3['count'] = df1['count'] - df2['count']

        frames.append(df3)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [14]:
# Define level and years
levels = ['Standard_Grade']
years = ['2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006',
                 '2005', '2004', '2003']

# Define sheetnames to grab and header to search for
sheets = ['sg3']
header = 'title|subject'
footer = 'subtotal|totals'

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        # Read and tidy data
        df = read_sheet(year, level, header, sheets)
        df = tidy_df(df, footer)

        # Replace nans with zeros
        df = df.fillna(0)

        # Replace grade 7 with no award
        df['na'] = df['7']

        # Add column with pass count
        df['pass'] = df['entries'] - df['na']

        # Change into long format
        dfl = pd.melt(df, id_vars = ['subject'],
                      value_vars = ['entries', '1', '2', '3', '4', '5', '6', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [15]:
# Define level and years
levels = ['Standard_Grade']
years = ['2002', '2001', '2000']

# Define sheetnames to grab and header to search for
sheet1 = 'sg4a'
sheet2 = 'sg4b'
header = 'subject'
footer = 'subtotals'

# Columns that are added across the two sheets
summed_cols = ['entries', 'pass', 'na', '1', '2', '3', '4', '5', '6']

# One long-format frame per (level, year); joined onto dft in a single concat below
frames = []

for level in levels:

    for year in years:

        ### read in first sheet (males)

        # Read and tidy data
        df1 = read_single_sheet(year, level, header, sheet1)
        df1 = tidy_df(df1, footer)

        # Replace nans with zeros (on this sheet's frame, not a leftover `df`)
        df1 = df1.fillna(0)

        # Replace grade 7 with no award
        df1['na'] = df1['7']

        # Add column with pass count
        df1['pass'] = df1['entries'] - df1['na']


        ### read in second sheet (females)

        # Read and tidy data
        df2 = read_single_sheet(year, level, header, sheet2)
        df2 = tidy_df(df2, footer)

        # Replace nans with zeros so a gap in one sheet does not turn the sum into NaN
        df2 = df2.fillna(0)

        # Replace grade 7 with no award
        df2['na'] = df2['7']

        # Add column with pass count (from this sheet's own no-award column)
        df2['pass'] = df2['entries'] - df2['na']


        ### Add together values
        # NOTE(review): rows are paired by index, which assumes both sheets list
        # the same subjects in the same order - confirm
        df3 = df1.copy()

        for col in summed_cols:
            df3[col] = df1[col] + df2[col]

        # Change into long format
        dfl = pd.melt(df3, id_vars = ['subject'],
                      value_vars = ['entries', '1', '2', '3', '4', '5', '6', 'pass', 'na'],
                      var_name = 'grade', value_name = 'count')

        # Add in year and level columns
        dfl['year'] = int(year)
        dfl['level'] = level

        frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [16]:
#         # Rough conversion to N5 grades idea - 1,2 > a, 3,4 > b, 5 > c, 6 > d, 7 > na
#         dfs['a'] = dfs['1'] + dfs['2']
#         dfs['b'] = dfs['3'] + dfs['4']
#         dfs['c'] = dfs['5']
#         dfs['d'] = dfs['6']
#         dfs['na'] = dfs['7']

In [17]:
years = ['1999', '1998', '1997', '1996', '1995', '1994', '1993',
                    '1992', '1991', '1990', '1989', '1988', '1987', '1986']

level = 'Standard_Grade'

# Define sheetnames to grab and header to search for
sheet = 'std grade'
header = 'subject'
footer = 'totals'

# One long-format frame per year; joined onto dft in a single concat below
frames = []

for year in years:

    # Read and tidy data, keeping only the '.2'-suffixed entries/grade columns
    df = read_old_single_sheet(year, header, sheet)
    df = tidy_old_df(df, footer)
    df = select_old_columns(df)

    # Replace nans with zeros
    df = df.fillna(0)

    # Replace grade 7 with no award
    df['na'] = df['7']

    # Add column with pass count
    df['pass'] = df['entries'] - df['na']

    # Change into long format
    dfl = pd.melt(df, id_vars = ['subject'],
                  value_vars = ['entries', '1', '2', '3', '4', '5', '6', 'pass', 'na'],
                  var_name = 'grade', value_name = 'count')

    # Add in year and level columns
    dfl['year'] = int(year)
    dfl['level'] = level

    frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [18]:
def read_old_higher_single_sheet(year, header, sheet_to_find):
    """Load a Higher worksheet from an older 'Stats' workbook, without its first column.

    The workbook read is './even_older_data/Stats<year>.xls'. The sheet is
    picked by exact comparison of ``sheet_to_find`` with the lower-cased sheet
    names. The first column is discarded, so the header search runs on what
    was originally the second column.

    Parameters
    ----------
    year : str
        Year used in the workbook file name.
    header : str
        Regular expression tested (case-insensitively, from the start of each
        cell) to locate the header row.
    sheet_to_find : str
        Lower-case name of the worksheet.

    Returns
    -------
    pandas.DataFrame
        The sheet re-read with ``header_index + 3`` leading rows skipped and
        its first column removed.

    Raises
    ------
    ValueError
        If no sheet has that (lower-cased) name.
    IndexError
        If no cell in the searched column matches ``header``.
    """
    path = './even_older_data/Stats' + year + '.xls'

    # Position of the wanted sheet among the lower-cased sheet names
    lowered = [tab.lower() for tab in pd.ExcelFile(path).sheet_names]
    position = lowered.index(sheet_to_find)

    # First pass: read everything and discard the leading column
    preview = pd.read_excel(path, sheet_name = position)
    preview = preview.drop(columns = preview.columns[0])

    # Locate the header row in the new first column
    is_header = preview.iloc[:, 0].str.match(header, na = False, case = False)
    header_index = preview[is_header].index[0]

    # Second pass: start two rows below the matched header row
    table = pd.read_excel(path, sheet_name = position, skiprows = header_index + 3)

    # Discard the leading column of the final table as well
    return table.drop(columns = table.columns[0])

def select_old_higher_columns(df):
    """Keep 'subject' plus the '.2'-suffixed entries/grade columns, dropping the suffix.

    The 'entries' column is additionally converted to numbers; values that
    cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'subject', 'entries.2', 'a.2', 'b.2', 'c.2' and 'd.2'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'subject', 'entries', 'a', 'b', 'c', 'd', in that order.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    # Final column name -> suffixed name it is taken from
    targets = ['entries', 'a', 'b', 'c', 'd']
    di = {target + '.2': target for target in targets}

    selected = df[['subject'] + list(di)].rename(columns = di)

    # Convert strs in entries column to NaN
    return selected.assign(entries = pd.to_numeric(selected['entries'], errors='coerce'))

In [19]:
years = ['1999', '1998', '1997', '1996', '1995', '1994', '1993',
                    '1992', '1991', '1990', '1989', '1988', '1987', '1986']

level = 'Higher'

# Define sheetnames to grab and header to search for
sheet = 'higher'
header = 'subject'
footer = 'totals'

# One long-format frame per year; joined onto dft in a single concat below
frames = []

for year in years:

    # Read and tidy data, keeping only the '.2'-suffixed entries/grade columns
    df = read_old_higher_single_sheet(year, header, sheet)
    df = tidy_old_df(df, footer)
    df = select_old_higher_columns(df)

    # Calculate pass and fails
    df = pass_fail(df)

    # Change into long format
    dfl = pd.melt(df, id_vars = ['subject'],
                  value_vars = ['entries', 'a', 'b', 'c', 'd', 'pass', 'na'],
                  var_name = 'grade', value_name = 'count')

    # Add in year and level columns
    dfl['year'] = int(year)
    dfl['level'] = level

    frames.append(dfl)

# Append to main df (DataFrame.append was removed in pandas 2.0, so use concat)
dft = pd.concat([dft, *frames], ignore_index = True)

In [20]:
# def tidy_up_subjects(df):

#     # Update names
#     di = {,
#             'Health and Food Technology': 'Home Economics: Health and Food Technology',
#             'Product Design': 'Design and Manufacture',
#             'Food Production Supervision': 'Hospitality: Food Production Supervision',
#             'Creative Cake Production': 'Hospitality: Practical Cake Craft',
#             'Craft and Design': 'Craft & Design',
#             'Woodworking Skills': 'Practical Woodworking',
#             'Applied Practical Electronics': 'Practical Electronics',
#             'Fabrication and Welding Engineering': 'Fabrication and Welding',
#             'Hairdressing: Composite Skills in Hairdressing': 'Hairdressing',
#              }
    
#     df = df.replace({"subject": di})
    
#     # Remove writing parts of language courses
#     values = ['German (Writing)*', 'French  (Writing)*', 'Spanish (Writing)*']
#     df = df.loc[~df['subject'].isin(values)]
    
#     return df

# # Tidy up subjects
# dft = tidy_up_subjects(dft)

In [21]:
# Define updated df to work on for graphs.
# This binds a second name to the same object as dft; it does not copy the data.

dfu = dft

In [22]:
tag = ' (Revised)'

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing the tag, and the same names with the tag removed
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, '') for name in matches]

# tagged name -> cleaned name
di = dict(zip(matches, matches_clean))

# Fold each tagged subject into its cleaned counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [23]:
tag = ' (Unrevised)'

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing the tag, and the same names with the tag removed
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, '') for name in matches]

# tagged name -> cleaned name
di = dict(zip(matches, matches_clean))

# Fold each tagged subject into its cleaned counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [24]:
tag = ' (New)'

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing the tag, and the same names with the tag removed
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, '') for name in matches]

# tagged name -> cleaned name
di = dict(zip(matches, matches_clean))

# Fold each tagged subject into its cleaned counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [25]:
tag = '*'

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing an asterisk, and the same names with every asterisk removed
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, '') for name in matches]

# marked name -> cleaned name
di = dict(zip(matches, matches_clean))

# Fold each marked subject into its cleaned counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [26]:
tag = '**'

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing a double asterisk, and the same names with it removed
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, '') for name in matches]

# marked name -> cleaned name
di = dict(zip(matches, matches_clean))

# Fold each marked subject into its cleaned counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [27]:
tag = ' - '

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names using ' - ' as a separator, and the same names with ': ' in its place
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, ': ') for name in matches]

# original name -> normalised name
di = dict(zip(matches, matches_clean))

# Fold each original spelling into its normalised counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [28]:
tag = ' : '

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names with a space before the colon, and the same names with ': ' in its place
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, ': ') for name in matches]

# original name -> normalised name
di = dict(zip(matches, matches_clean))

# Fold each original spelling into its normalised counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [29]:
tag = '  '

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names containing a double space, and the same names with a single space in its place
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, ' ') for name in matches]

# original name -> normalised name
di = dict(zip(matches, matches_clean))

# Fold each original spelling into its normalised counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [30]:
tag = ':  '

# Every distinct subject name currently in the frame, alphabetically
subls = sorted(dfu['subject'].unique())

# Names with two spaces after the colon, and the same names with ': ' in its place
matches = [name for name in subls if tag in name]
matches_clean = [name.replace(tag, ': ') for name in matches]

# original name -> normalised name
di = dict(zip(matches, matches_clean))

# Fold each original spelling into its normalised counterpart
for k, clean_name in di.items():
    subs = [k]
    dfu = combine_subs_name(dfu, clean_name, subs)

In [31]:
# Spell out ' & ' as ' and ' in subject names
tag = ' & '
subls = sorted(dfu['subject'].unique())

# Subject names that contain the ampersand
matches = [name for name in subls if tag in name]

# The same names with ' & ' swapped for ' and '
matches_clean = [name.replace(tag, ' and ') for name in matches]

# Old name -> normalised name
di = dict(zip(matches, matches_clean))

# Hand each old name, with its normalised form, to combine_subs_name
# (defined earlier in the notebook), one subject at a time
for old_name, new_name in di.items():
    dfu = combine_subs_name(dfu, new_name, [old_name])

In [48]:
# Combine subject names
#
# Each entry pairs the name passed to combine_subs_name as the combined name
# with the list of existing subject names to be combined under it.
# The entries are applied in the order listed.
name_merges = [
    ('English', ['English and Communication']),
    ('History', ['History (Traditional)', 'History (Alternative)']),
    ('Secretarial Studies', ['Secretarial Studies (Audio-typewriting)',
                             'Secretarial Studies (Word Processing)']),
    ('Computing Science', ['Computing', 'Computing Studies']),
    ('Gaidhlig', ['Gáidhlig', 'Gàidhlig', 'Gaelic (Native Speakers)']),
    ('Hospitality: Practical Cookery', ['Practical Cookery']),
    ('Hospitality: Practical Cake Craft', ['Practical Cake Craft']),
    ('Home Economics: Fashion and Textile Technology', ['Fashion and Textile Technology',
                                                        'Home Economics (Fabrics and Fashion)']),
    ('Art and Design: Design', ['Art and Design (Design)', 'Art and Design Enquiry: Design']),
    ('Art and Design: Expressive', ['Art and Design (Expressive)',
                                    'Art and Design Enquiry: Expressive']),
    ('Music: Inventing (Composition)', ['Music: Inventing (Comp)']),
    ('Accounting and Finance', ['Accounting']),
    ('Religious, Moral and Philosophical Studies', ['Religious Studies']),
    ('Administration and IT', ['Administration', 'Office and Information Studies']),
    ('Childcare and Development', ['Early Years Care and Education',
                                   'Early Education and Childcare']),
    ('Human Biology', ['Anatomy, Physiology and Health']),
    ('Home Economics: Food and Nutrition', ['Home Economics (Food and Nutrition)']),
    ('Chinese Languages', ['Mandarin (Simplified)', 'Mandarin (Traditional)', 'Cantonese']),
    ('Hospitality: Event Supervision', ['Hospitality Event Supervision']),
    ('Photography', ['Photography for the Media']),
    ('Dance', ['Dance Practice']),
    ('Media Studies', ['Media']),
]

for combined_name, subs in name_merges:
    dfu = combine_subs_name(dfu, combined_name, subs)

In [49]:
# Drop rows whose 'subject' is a stray value left over from the reading-in process
values = ['Subject', '(continued)']
is_stray = dfu['subject'].isin(values)
dfu = dfu.loc[~is_stray]

# Drop the writing parts of the language courses
# NOTE(review): 'Spanish (Writing)' is listed twice; harmless for isin(), but
# confirm no other language was meant in its place
values = ['German (Writing)', 'French (Writing)', 'Spanish (Writing)', 'Gaelic (Learners) (Writing)',
          'Italian (Writing)', 'Russian (Writing)', 'Spanish (Writing)', 'Urdu (Writing)']
is_writing = dfu['subject'].isin(values)
dfu = dfu.loc[~is_writing]

In [50]:
# Combine subject levels: pass the 'New_Higher' and 'Old_Higher' levels to
# combine_subs_level (defined earlier in the notebook) under the name 'Higher'
subs = [
    'New_Higher',
    'Old_Higher',
]
dfu = combine_subs_level(dfu, 'Higher', subs)

In [56]:
# Create df for Graph

# Subjects whose summed count (over the grades below) is under this threshold are not plotted
MIN_TOTAL_COUNT = 1000

# Grades kept for the graph, in the order used for the plot's category ordering
# ('na' presumably means "no award", going by the plot title - confirm)
grades = ['entries', 'pass', 'na']

# Sort out row order and renumber the index to match
dfu = dfu.sort_values(['year', 'level', 'subject', 'grade']).reset_index(drop=True)

# Look only at the grades listed above
dfgraph = dfu.loc[dfu['grade'].isin(grades)].copy()

# Look only at subjects with a total count of at least MIN_TOTAL_COUNT
dfg = dfgraph.groupby('subject', as_index = False)['count'].sum()
dfg = dfg.drop(dfg.index[dfg['count'] < MIN_TOTAL_COUNT])
popular_subs = dfg['subject'].values

# Build the mask from dfgraph itself; a mask taken from dfu only works because
# pandas aligns it on the index labels, and breaks if the two indexes ever diverge
dfgraph = dfgraph.loc[dfgraph['subject'].isin(popular_subs)].copy()

# Create a list of subjects and years in correct order
subjects = sorted(dfgraph['subject'].unique())
years = sorted(dfgraph['year'].unique())

In [52]:
# TODO
# Should 'n/a' values be converted to np.nan?

In [53]:
# Plot graph

# Order of the level facet rows
levels = ['Advanced_Higher', 'Higher', 'National_5', 'National_4', 'Standard_Grade',
          'Intermediate2', 'Intermediate1']

# One line per subject, one facet column per grade, one facet row per level
fig = px.line(dfgraph, x = "year", y = ["count"],
                color = "subject",
                facet_col = 'grade',
                facet_row = 'level',
                facet_row_spacing = 0.02,
                facet_col_spacing = 0.01,
                width = 1500,
                height = 1500,
                category_orders = {'subject' : subjects,
                                   'year' : years,
                                   'level' : levels,
                                   'grade': grades},
                labels = {"value": "Count",
                          "subject": "Subject",
                          "level": "Level",
                          "grade": "Grade",
                          "year": "Year"},
                title = 'Total entries, passes, no awards per subject  1986 - 2021'
                )

fig.update_yaxes(rangemode = "tozero")

fig.update_xaxes(tickangle = 60)

# Make the y-axes of each facet row match the first axis of that row, so the
# grade columns of one level share a scale while different levels scale independently.
# The grid size is taken from the data instead of being written out by hand for a
# fixed 7 x 3 grid, so a missing level or grade no longer raises on an absent axis.
# Assumes the axes are numbered consecutively row by row, as the hand-written
# yaxis .. yaxis21 list this replaces did.
n_cols = dfgraph['grade'].nunique()
n_rows = dfgraph['level'].nunique()

for row in range(n_rows):
    first = row * n_cols + 1
    # The first axis carries no number: 'yaxis' / 'y' rather than 'yaxis1' / 'y1'
    anchor = 'y' if first == 1 else 'y' + str(first)
    for idx in range(first, first + n_cols):
        axis_name = 'yaxis' if idx == 1 else 'yaxis' + str(idx)
        getattr(fig.layout, axis_name).matches = anchor

fig.update_traces(mode = "markers+lines")

# Also used as the Chart Studio filename in the upload cell
filename = 'total_students_details'

fig.write_html('./graphs/' + filename + '.html')

fig.show()

In [54]:
# Setup Chart Studio
#
# Credentials are read from the environment so that no API key is stored in the notebook.
# The key that used to be hard-coded here should be treated as exposed and regenerated
# (Chart Studio: profile > settings > regenerate key).
import os  # imported here because only this cell needs it

username = os.environ.get('CHART_STUDIO_USERNAME', 'mrcrookes')
api_key = os.environ.get('CHART_STUDIO_API_KEY')
if not api_key:
    raise RuntimeError('Set the CHART_STUDIO_API_KEY environment variable before uploading to Chart Studio')

chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Upload to Chart Studio (`fig` and `filename` come from the plotting cell)
py.plot(fig, filename = filename, auto_open=True)

'https://plotly.com/~mrcrookes/136/'

In [55]:
# Write the `dft` frame to ./csvs/1986_2021_data.csv
# NOTE(review): the cleaning cells above work on `dfu` / `dfgraph`; `dft` is not
# defined in this part of the notebook - confirm it is the frame meant to be exported
dft.to_csv('./csvs/1986_2021_data.csv')