In [None]:
import pandas as pd
import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import ttest_rel
from scipy.stats import mannwhitneyu

# Read the national GCSE CSV (path relative to the notebook's working
# directory) into a DataFrame.
GCSE_national_data_path = "Data_original/2223_national_data_provisional.csv"
GCSE_national_data = pd.read_csv(filepath_or_buffer=GCSE_national_data_path)

# Preview the first five rows as a quick sanity check of the load.
GCSE_national_data.head(5)

In [None]:
# Load the per-subject table and discard its 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved - TODO confirm).
GCSE_subject_data_path = "Data/subject_data_all_years.csv"
GCSE_subject_data = pd.read_csv(GCSE_subject_data_path).drop(columns='Unnamed: 0')

GCSE_subject_data

In [None]:
# Re-read the subject table from disk; this fresh copy still carries the
# 'Unnamed: 0' column, which a later cell removes from the filtered frame.
GCSE_subject_data = pd.read_csv(GCSE_subject_data_path)

# Keep only the rows whose gender is exactly "Boys" or "Girls".
is_boys_or_girls = GCSE_subject_data["gender"].isin(["Boys", "Girls"])
GCSE_subject_data_df = GCSE_subject_data[is_boys_or_girls]

GCSE_subject_data_df.head()

In [None]:
# Discard rows whose entry count is the literal "c"
# (presumably a suppression marker in the source data - TODO confirm).
entries_not_c = GCSE_subject_data_df["total_exam_entries"].ne("c")
GCSE_subject_data_filtered = GCSE_subject_data_df[entries_not_c]
GCSE_subject_data_filtered.head()

In [None]:
# Remove the 'Unnamed: 0' column from the filtered table.
# drop(..., errors="ignore") replaces `del` so that the cell can be re-run
# safely: `del` raises KeyError the second time because the column is already
# gone. It also rebinds the name to a fresh frame instead of mutating a
# filtered slice of the raw data.
GCSE_subject_data_filtered = GCSE_subject_data_filtered.drop(
    columns=['Unnamed: 0'], errors="ignore"
)

print(GCSE_subject_data_filtered)

In [None]:
# Rows where the subject is the "All Subjects" aggregate.
is_all_subjects = GCSE_subject_data_filtered["subject"].eq("All Subjects")
all_subjects = GCSE_subject_data_filtered.loc[is_all_subjects]

# Shown indexed by time period for display only; the result is not stored,
# so `all_subjects` itself keeps its original index.
all_subjects.set_index(keys='time_period')

In [None]:
# "All Subjects" entry totals per time period for boys and girls, listed
# newest period first.
# NOTE(review): these figures are hard-coded rather than read from
# `all_subjects` - confirm they match the data before re-using them.
time_periods = [
    '202223', '202122', '202021', '201920', '201819', '201718',
    '201617', '201516', '201415', '201314', '201213', '201112',
    '201011', '200910'
]

boys_entries = [
    2381705, 2294489, 2261168, 2220439, 2161431, 2143924,
    2379498, 2264550, 2268834, 2284001, 2238514, 2180535,
    2220810, 2321259
]

girls_entries = [
    2354678, 2289265, 2271365, 2232353, 2171421, 2158193,
    2411165, 2324264, 2343738, 2372286, 2347182, 2247133,
    2290185, 2371428
]

# The lists are stored newest-first. Matplotlib places string categories on
# the x axis in order of first appearance, so plot reversed views to make time
# run left-to-right (oldest -> newest) on an "over time" chart.
chronological_periods = time_periods[::-1]

plt.figure(figsize=(10, 6))

plt.fill_between(chronological_periods, boys_entries[::-1], color='purple', alpha=0.3, label='Boys')
plt.fill_between(chronological_periods, girls_entries[::-1], color='pink', alpha=0.3, label='Girls')

plt.xlabel('Time Period')
plt.ylabel('Total Exam Entries')
plt.title('All Subjects Entries Over Time for Boys and Girls')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and, on
# non-Windows systems, is treated as part of the file name rather than as a
# directory separator.
plt.savefig("Graphs/Q2_All Subjects Entries Over Time for Boys and Girls.png")
plt.show()

In [None]:

# Yearly "All Subjects" entry totals (newest period first), repeated here so
# this cell can run on its own.
time_periods = [
    '202223', '202122', '202021', '201920', '201819', '201718', '201617',
    '201516', '201415', '201314', '201213', '201112', '201011', '200910',
]
boys_entries = [
    2381705, 2294489, 2261168, 2220439, 2161431, 2143924, 2379498,
    2264550, 2268834, 2284001, 2238514, 2180535, 2220810, 2321259,
]
girls_entries = [
    2354678, 2289265, 2271365, 2232353, 2171421, 2158193, 2411165,
    2324264, 2343738, 2372286, 2347182, 2247133, 2290185, 2371428,
]

# One row per time period, one column per series.
data = pd.DataFrame(
    list(zip(time_periods, boys_entries, girls_entries)),
    columns=['Time Period', 'Boys Entries', 'Girls Entries'],
)

# Descriptive statistics of the two numeric columns.
summary_stats = data.describe()

# Correlation (pandas default method) between the boys' and girls' series.
boys_series = data['Boys Entries']
girls_series = data['Girls Entries']
correlation = boys_series.corr(girls_series)

print("Summary Statistics:", summary_stats, sep="\n")
print("\nCorrelation between Boys and Girls Entries:", correlation)


In [None]:

# Subjects treated as "second language" GCSEs in the charts that follow.
second_language_subjects = [
    "Arabic", "Chinese", "Classical Greek", "French", "Italian",
    "Latin", "Polish", "Spanish", "Urdu",
]

# isin() replaces the chain of nine OR-ed equality tests. The explicit copy
# makes this an independent frame, so the column assignment in the next cell
# does not write to a slice of GCSE_subject_data_filtered
# (SettingWithCopyWarning).
filtered_subject_data = GCSE_subject_data_filtered[
    GCSE_subject_data_filtered["subject"].isin(second_language_subjects)
].copy()

filtered_subject_data

In [None]:
# Convert the entry counts to integers. assign() returns a new frame rather
# than writing a column into what may be a filtered slice of another frame,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
filtered_subject_data = filtered_subject_data.assign(
    total_exam_entries=filtered_subject_data['total_exam_entries'].astype(int)
)

In [None]:
# Language-subject rows whose time_period equals 202223.
rows_202223 = filtered_subject_data["time_period"].eq(202223)
year_filtered_data_2023 = filtered_subject_data.loc[rows_202223]

year_filtered_data_2023

In [None]:
# Grouped bar chart: entries per language subject, split by gender, for the
# 202223 rows.
sns.set_style("darkgrid")
sns.set(font_scale=1.25)

plt.figure(figsize=(12,6))
# Columns are referenced by name via `data=`; passing the Series again was
# redundant. `fig1` holds the Axes returned by seaborn.
fig1 = sns.barplot(data=year_filtered_data_2023, x='subject', y='total_exam_entries', hue='gender')
# Put the legend outside the plotting area, to the right of the axes.
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0,fontsize=13)
plt.title("GCSE Second Language 2023 Entries for Boys and Girls",fontsize=18)
plt.xlabel("Second Language Subjects", fontsize=15)
plt.ylabel("Total Entries", fontsize=15)

# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_GCSE Second Language 2023 Entries for Boys and Girls.png")
plt.show()

In [None]:
# Language-subject rows whose time_period equals 200910.
rows_200910 = filtered_subject_data["time_period"].eq(200910)
year_filtered_data_2010 = filtered_subject_data.loc[rows_200910]

year_filtered_data_2010

In [None]:
# Grouped bar chart: entries per language subject, split by gender, for the
# 200910 rows.
sns.set_style("darkgrid")
sns.set(font_scale=1.25)

plt.figure(figsize=(12,6))
# Columns are referenced by name via `data=`; passing the Series again was
# redundant. `fig1` holds the Axes returned by seaborn.
fig1 = sns.barplot(data=year_filtered_data_2010, x='subject', y='total_exam_entries', hue='gender')
# Put the legend outside the plotting area, to the right of the axes.
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0,fontsize=13)
plt.title("GCSE Second Language 2010 Entries for Boys and Girls",fontsize=18)
plt.xlabel("Second Language Subjects", fontsize=15)
plt.ylabel("Total Entries", fontsize=15)
# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_GCSE Second Language 2010 Entries for Boys and Girls.png")
plt.show()

In [None]:
# Prepare the national table for the second analysis (subjects entered at GCSE
# by gender). The national file also carries other characteristics such as
# ethnicity, so it is narrowed down here.

# Raw column -> readable name used by the later cells. The key order also
# determines the column order of the reduced frame.
national_column_labels = {
    "gender": "Gender",
    "t_schools": "total_num_schools",
    "t_pupils": "total_num_pupils",
    "t_entbasics": "tot_num_entering_english_and_maths_GCSEs",
    "pt_entbasics": "perc_pupils_entering_english_and_maths_GCSEs",
    "t_triplesci_e": "tot_num_entering_triple_science",
    "pt_triplesci_e": "perc_pupils_entering_triple_science",
    "t_eballsci_ptq_ee": "tot_num_entering_one_of_phys_chem_bio_compsci",
    "pt_eballsci_ptq_ee": "perc_pupils_entering_one_of_phys_chem_bio_compsci",
    "t_ent_comb_sci": "tot_num_entering_combined_science",
    "pt_ent_comb_sci": "perc_pupils_entering_combined_science",
    "t_ent_hist_geog": "tot_num_entering_hist_and_geog",
    "pt_ent_hist_geog": "perc_pupils_entering_hist_and_geog",
    "t_entered_art": "tot_num_entering_any_arts_subj",
    "pt_entered_art": "perc_pupils_entering_any_arts_subj",
    "t_entered_music_qual": "tot_num_entering_music_qualification",
    "pt_entered_music_qual": "perc_pupils_entering_music_qualification",
    "t_multilan_e": "tot_num_entering_more_than_one_lang",
    "pt_multilan_e": "perc_pupils_entering_more_than_one_lang",
    # The trailing colons are part of the names that later cells look up.
    "avg_att8": "avg_attainment8_score:",
    "avg_p8score": "avg_progress8_score:",
}

# Rows for time_period 202223, all schools, at the "Total" breakdown.
is_202223 = GCSE_national_data["time_period"].eq(202223)
is_all_schools = GCSE_national_data["establishment_type"].eq("All schools")
is_total_breakdown = GCSE_national_data["breakdown"].eq("Total")
subject_data_df = GCSE_national_data[is_202223 & is_all_schools & is_total_breakdown]

# Select the columns of interest, drop any row with a missing value, apply the
# readable names and renumber the rows from zero.
subject_data_reduced_df = (
    subject_data_df[list(national_column_labels)]
    .dropna(how="any")
    .rename(columns=national_column_labels)
    .reset_index(drop=True)
)

subject_data_reduced_df


In [None]:
# Columns carried into the percentage view (after 'Gender'), mapped to their
# display names. The key order fixes the column order of the result.
percentage_labels = {
    'total_num_schools': 'Total Schools',
    'total_num_pupils': 'Total Students',
    'perc_pupils_entering_english_and_maths_GCSEs': 'Percentage English and Math',
    'perc_pupils_entering_one_of_phys_chem_bio_compsci': 'Percentage Chem and Bio',
    'perc_pupils_entering_combined_science': 'Percentage Combined Science',
    'perc_pupils_entering_hist_and_geog': 'Percentage History and Geography',
    'perc_pupils_entering_any_arts_subj': 'Percentage Art',
    'perc_pupils_entering_music_qualification': 'Percentage Music',
    'perc_pupils_entering_more_than_one_lang': 'Percentage Language',
    'perc_pupils_entering_triple_science': 'Percentage Triple Science',
}

percentage_data = (
    subject_data_reduced_df[['Gender', *percentage_labels]]
    .rename(columns=percentage_labels)
)

# Every 'Percentage ...' column becomes float; the school and student totals
# are left as loaded.
percentage_columns = [
    label for label in percentage_labels.values() if label.startswith('Percentage')
]
percentage_data = percentage_data.astype(dict.fromkeys(percentage_columns, float))
percentage_data

In [None]:
# Subject-group labels and their values for the overall pie chart.
# NOTE(review): the values are hard-coded; presumably they were copied from the
# total row of `percentage_data` - confirm against the data.
labels = ['Eng/Math', 'Chem/Bio', 'Combined Sci.', 'Hist/Geo','Art','M.','Language','Triple Sci.']
sizes = [89.8, 24.9, 64.8, 7.9, 38.4, 6.9, 3.6, 24.8]  # Values for the slices
explode = (0.1, 0, 0, 0, 0, 0, 0.1, 0)  # Offset the 1st ('Eng/Math') and 7th ('Language') slices

# Create a pie chart using Matplotlib.
# NOTE(review): autopct prints each slice's share of the sum of `sizes`, not
# the raw value, so the printed percentages differ from the numbers above.
plt.figure(figsize=(6, 6))
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=150)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.legend(labels, loc="center left", bbox_to_anchor=(1.3, 0.5))

plt.title('Total Percentage Of GCSE Subjects')
# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_Total Percentage Of GCSE Subjects.png",bbox_inches="tight")
plt.show()



In [None]:
# Subject-group labels and their (hard-coded) values for the boys' pie chart.
labels = ['Eng/Math', 'Chem/Bio', 'Combined Sci.', 'Hist/Geo','Art','M.','Language','Triple Sci.']
sizes = [88.7, 24.8, 63.6, 9.4, 28.3, 7.0, 3.0, 24.6]
explode = (0.1, 0, 0, 0, 0, 0, 0.2, 0)  # Offset the 1st ('Eng/Math') and 7th ('Language') slices


plt.figure(figsize=(6, 6))
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=150)
plt.axis('equal')  # Keep the pie circular.
plt.legend(labels,loc="center left", bbox_to_anchor=(1.3, 0.5))

plt.title('Percentage Of  GCSE Subjects for Boys')
# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_Percentage Of  GCSE Subjects for Boys.png",bbox_inches="tight")
plt.show()

In [None]:
# Subject-group labels and their (hard-coded) values for the girls' pie chart.
labels = ['Eng/Math', 'Chem/Bio', 'Combined Sci.', 'Hist/Geo','Art','M.','Language','Triple Sci.']
sizes = [91.0, 25.1, 66.0, 6.2, 49.1, 6.7, 4.2, 25.1]
explode = (0.1, 0, 0, 0, 0, 0, 0.2, 0)  # Offset the 1st ('Eng/Math') and 7th ('Language') slices


plt.figure(figsize=(6, 6))
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Keep the pie circular.
plt.legend(labels, loc="center left", bbox_to_anchor=(1.3, 0.5))

plt.title('Percentage Of  GCSE Subjects for Girls')
# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_Percentage Of  GCSE Subjects for Girls.png",bbox_inches="tight")
plt.show()

In [None]:
# Entry counts per subject group, one row per gender, with display names.
subject_data= subject_data_reduced_df[['Gender','total_num_pupils',
                                       'tot_num_entering_english_and_maths_GCSEs',
                                       'tot_num_entering_triple_science',
                                       'tot_num_entering_one_of_phys_chem_bio_compsci',
                                       'tot_num_entering_combined_science',
                                       'tot_num_entering_hist_and_geog', 
                                       'tot_num_entering_any_arts_subj',
                                       'tot_num_entering_music_qualification',
                                       'tot_num_entering_more_than_one_lang']]

subject_data=subject_data.rename(columns={"total_num_pupils":"Total Student",
                             "tot_num_entering_english_and_maths_GCSEs":"Math and English",
                             "tot_num_entering_triple_science":"Triple Science",
                             "tot_num_entering_one_of_phys_chem_bio_compsci":"Chem and Bio",
                             "tot_num_entering_combined_science":"Combined Science",
                             "tot_num_entering_hist_and_geog":"History and Geography",
                             "tot_num_entering_any_arts_subj":"Art",
                             "tot_num_entering_music_qualification":"Music",
                             "tot_num_entering_more_than_one_lang":"Language"})

# Count columns that convert to int directly.
plain_count_columns = ['Math and English', 'Chem and Bio', 'Combined Science',
                       'History and Geography', 'Art', 'Music', 'Language']
subject_data = subject_data.astype(dict.fromkeys(plain_count_columns, int))

# 'Triple Science' has thousands separators stripped before the cast. The
# result is written back to 'Triple Science' itself: the previous target name
# had a trailing space ('Triple Science '), which added a stray extra column
# and left the real one unconverted.
subject_data['Triple Science'] = (
    subject_data['Triple Science'].str.replace(',', '', regex=False).astype(int)
)

subject_data


In [None]:
# Entry counts per subject group for boys, girls and the total.
# NOTE(review): hard-coded; presumably copied from `subject_data` - confirm.
df = pd.DataFrame({
    'Math and English': [306059, 296793,602852],
    'Triple Science': [85025,81828,166853],
    'Chem and Bio':[85452,81943,167395],
    'Combined Science':[219593,215215,434808],
    'History and Geography':[32556,20294,52850],
    'Art':[97520,160144, 257664],
    'Music':[24138, 21866,46004],
    'Language':[10205,13723,23928]},
    index= ['Boys', 'Girls','Total'])

# Create the figure first and hand its axes to pandas. DataFrame.plot opens a
# new figure when no `ax` is given, so the earlier standalone
# plt.figure(figsize=(14, 8)) was left blank and its size never applied.
fig, ax = plt.subplots(figsize=(14, 8))
# Transpose so subjects run along the x axis with one bar per index label.
df.T.plot(kind='bar', title= 'GCSE Subjects by Gender', ax=ax)

plt.xticks(rotation=75)

# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_GCSE Subjects by Gender.png",bbox_inches="tight")

plt.show()

In [None]:
# Gender, pupil count and the two average-score columns from the reduced
# national table (the trailing colons are part of the column names).
attainment_columns = ['Gender', 'total_num_pupils',
                      'avg_attainment8_score:', 'avg_progress8_score:']
avr_attainment_scor = subject_data_reduced_df.loc[:, attainment_columns]

avr_attainment_scor

In [None]:
# Make the attainment score numeric (float); the other columns are untouched.
avr_attainment_scor = avr_attainment_scor.astype({'avg_attainment8_score:': float})

In [None]:
plt.figure(figsize=(6,5))

# Horizontal bar chart with one bar per row of `avr_attainment_scor`
# (attainment score on x, gender on y).
sns.barplot(x= 'avg_attainment8_score:', y='Gender',data=avr_attainment_scor, palette='hot' )
# No plt.legend() call: the bars carry no labelled artists, so the previous
# call only produced a "no artists with labels" warning and an empty legend.
# NOTE(review): the title and file name mention the progress score (and spell
# "Attainment" as "Attaintment"), but only the attainment score is plotted.
# The strings are left as they were so existing references to the saved file
# keep working.
plt.title("Average Attaintment Score and Progress Score by Gender ",fontsize=15)
plt.xlabel("Average Attaintment Score", fontsize=12)
plt.ylabel("Gender", fontsize=12)

# Forward slash: "Graphs\Q2..." contains the invalid escape "\Q" and is not a
# directory separator outside Windows.
plt.savefig("Graphs/Q2_Average Attaintment Score and Progress Score by Gender.png",bbox_inches="tight")
plt.show()