In [None]:
import pandas as pd
import seaborn as sns
import seaborn.objects as so
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import ttest_rel
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Relative path of the national GCSE CSV (file name: 2223_national_data_provisional).
GCSE_national_data_path = "Data_original/2223_national_data_provisional.csv"

# Parse the CSV with pandas' default options.
GCSE_national_data = pd.read_csv(filepath_or_buffer=GCSE_national_data_path)

# Cell output: the first five rows, as a quick look at the available columns.
GCSE_national_data.head(5)

In [None]:
# Relative path of the per-subject entries CSV (file name: subject_data_all_years).
GCSE_subject_data_path = "Data/subject_data_all_years.csv"

# Parse the CSV with pandas' default options.
GCSE_subject_data = pd.read_csv(filepath_or_buffer=GCSE_subject_data_path)

# Cell output: the loaded frame.
GCSE_subject_data

In [None]:
# Remove the 'Unnamed: 0' column. Using drop(..., errors="ignore") instead of
# `del` keeps this cell re-runnable: a second run no longer raises KeyError
# because the column is already gone.
GCSE_subject_data = GCSE_subject_data.drop(columns=["Unnamed: 0"], errors="ignore")

# Keep only the rows whose gender is "Boys" or "Girls".
GCSE_subject_data_df = GCSE_subject_data[GCSE_subject_data["gender"].isin(["Boys", "Girls"])]

# Discard rows whose entry count is the marker "c" (presumably a suppressed
# value -- confirm against the data dictionary), then convert the remaining
# counts to numbers. A column that contains "c" is read as text, and text
# compares lexicographically ("93679" > "152318"), which would make any later
# max()/min() on this column wrong. pd.to_numeric raises on any other
# non-numeric token rather than silently keeping it.
GCSE_subject_data_filtered = (
    GCSE_subject_data_df[GCSE_subject_data_df["total_exam_entries"] != "c"]
    .assign(total_exam_entries=lambda frame: pd.to_numeric(frame["total_exam_entries"]))
)

# Cell output: preview of the filtered frame.
GCSE_subject_data_filtered.head()

In [None]:
# Subjects compared in this analysis.
selected_subjects = [
    "English Literature",
    "History",
    "Biology",
    "Chemistry",
    "Physics",
    "Geography",
    "Mathematics",
]

# Rows of the filtered subject data that belong to one of the selected subjects.
# 'time_period' stays a regular column: DataFrame.set_index returns a new frame,
# so calling it without assigning the result never changed this frame.
A_level_subjects = GCSE_subject_data_filtered[
    GCSE_subject_data_filtered["subject"].isin(selected_subjects)
]

# Largest entry count per (subject, gender) group. The counts are converted with
# pd.to_numeric first so that the maximum is numeric; on a text column max()
# would compare strings character by character (e.g. "93679" > "152318").
max_level = (
    A_level_subjects
    .assign(total_exam_entries=pd.to_numeric(A_level_subjects["total_exam_entries"]))
    .groupby(["subject", "gender"])["total_exam_entries"]
    .max()
    .to_frame()
)

# Cell output: one row per (subject, gender) with its maximum entry count.
max_level

In [None]:
# Smallest entry count per (subject, gender) group. The counts are converted with
# pd.to_numeric first so that the minimum is numeric; on a text column min()
# would compare strings character by character (e.g. "152318" < "93679").
min_level = (
    A_level_subjects
    .assign(total_exam_entries=pd.to_numeric(A_level_subjects["total_exam_entries"]))
    .groupby(["subject", "gender"])["total_exam_entries"]
    .min()
    .to_frame()
)

# Cell output: one row per (subject, gender) with its minimum entry count.
min_level


In [None]:
# Hard-coded maximum entry counts per subject, one list per gender.
max_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys': [89374, 86991, 303807, 152318, 145590, 312709, 87039],
    'Girls': [85978, 83580, 298388, 128951, 149854, 302024, 83190]
}

# Hard-coded minimum entry counts per subject, one list per gender.
min_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys': [81220, 61890, 193518, 89338, 98087, 271567, 61597],
    'Girls': [79611, 51396, 210048, 72208, 97120, 263745, 50675]
}

max_df = pd.DataFrame(max_data)
min_df = pd.DataFrame(min_data)

# Reshape wide (one column per gender) to long (subject, gender, entries) for seaborn.
max_df_melted = pd.melt(max_df, id_vars='subject', var_name='gender', value_name='entries')
min_df_melted = pd.melt(min_df, id_vars='subject', var_name='gender', value_name='entries')

# Stack both tables, tagging each row with the table it came from.
combined_df = pd.concat([max_df_melted.assign(dataset='Maximum'), min_df_melted.assign(dataset='Minimum')])

plt.figure(figsize=(10, 6))

# Each box is drawn from two values (the maximum and the minimum) per subject and gender.
sns.boxplot(x='subject', y='entries', hue='gender', data=combined_df, palette={'Boys': 'skyblue', 'Girls': 'salmon'})
plt.xlabel('Subjects')
plt.ylabel('Entries')
plt.title('Comparison of Maximum and Minimum Level Entries for Boys and Girls')
plt.legend(title='Gender')
plt.xticks(rotation=45)
plt.tight_layout()
# A forward slash is used as the path separator: it works on every platform and
# avoids the invalid "\Q" escape sequence that a backslash produced in this literal.
plt.savefig("Graphs/Q2_Comparison of Maximum and Minimum Level Entries for Boys and Girls.png", bbox_inches="tight")
plt.show()

In [None]:
# Entry counts per subject, one list per gender, used as the contingency table.
# NOTE(review): the Geography and History counts here (93679/98046 and 99769/98579)
# differ from the maxima hard-coded in the plotting cells of this notebook
# (152318/128951 and 145590/149854) -- confirm which set is intended.
data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys': [89374, 86991, 303807, 93679, 99769, 312709, 87039],
    'Girls': [85978, 83580, 298388, 98046, 98579, 302024, 83190],
}

# Contingency table: one row per subject, one column per gender.
df = pd.DataFrame(data).set_index('subject')

# Chi-square test of independence on the subject-by-gender table; only the
# statistic and the p-value are kept.
chi2, p, _, _ = chi2_contingency(df)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")


In [None]:
#A chi-square test was performed to determine whether there is an association between gender and subject choice.

#Chi-Square Statistic: 300.1920674311997
#P-value: 7.440737048316971e-62

#Significant Association: The small p-value (much less than the commonly used significance level of 0.05) strongly suggests a significant association between gender and subject choice for GCSE exams based on the provided data.

#Conclusion: From these results, it appears that there is a statistically significant relationship between the subjects chosen (Biology, Chemistry, English Literature, Geography, History, Mathematics, and Physics) and gender (Boys and Girls) for GCSE exams.



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Data - Maximum Level Entries (hard-coded, one list per gender)
max_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys_Max': [89374, 86991, 303807, 152318, 145590, 312709, 87039],
    'Girls_Max': [85978, 83580, 298388, 128951, 149854, 302024, 83190]
}

# Data - Minimum Level Entries (hard-coded, one list per gender)
min_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys_Min': [81220, 61890, 193518, 89338, 98087, 271567, 61597],
    'Girls_Min': [79611, 51396, 210048, 72208, 97120, 263745, 50675]
}

# Create DataFrames; both list the subjects in the same order, so row i of one
# frame corresponds to row i of the other.
max_df = pd.DataFrame(max_data)
min_df = pd.DataFrame(min_data)

# Percentage by which the maximum exceeds the minimum: (max - min) / min * 100.
max_df['Boys_Percentage'] = (max_df['Boys_Max'] - min_df['Boys_Min']) / min_df['Boys_Min'] * 100
max_df['Girls_Percentage'] = (max_df['Girls_Max'] - min_df['Girls_Min']) / min_df['Girls_Min'] * 100

# Plotting line graph for Boys and Girls percentage change
plt.figure(figsize=(10, 6))

plt.plot(max_df['subject'], max_df['Boys_Percentage'], label='Boys Percentage Change', marker='o', linestyle='-', color='blue')
plt.plot(max_df['subject'], max_df['Girls_Percentage'], label='Girls Percentage Change', marker='o', linestyle='-', color='red')

plt.xlabel('Subject')
plt.ylabel('Percentage Change')
plt.title('Percentage Comparison of Boys and Girls Entries (Maximum to Minimum)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
# A forward slash is used as the path separator: it works on every platform and
# avoids the invalid "\Q" escape sequence that a backslash produced in this literal.
plt.savefig("Graphs/Q2_Percentage Comparison of Boys and Girls Entries (Maximum to Minimum).png", bbox_inches="tight")
plt.show()

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Hard-coded maximum entry counts per subject, one list per gender.
max_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys_Max': [89374, 86991, 303807, 152318, 145590, 312709, 87039],
    'Girls_Max': [85978, 83580, 298388, 128951, 149854, 302024, 83190],
}

# Hard-coded minimum entry counts per subject, one list per gender.
min_data = {
    'subject': ['Biology', 'Chemistry', 'English Literature', 'Geography', 'History', 'Mathematics', 'Physics'],
    'Boys_Min': [81220, 61890, 193518, 89338, 98087, 271567, 61597],
    'Girls_Min': [79611, 51396, 210048, 72208, 97120, 263745, 50675],
}

max_df = pd.DataFrame(max_data)
min_df = pd.DataFrame(min_data)

# One list of counts per group; each list follows the subject order above.
boys_max = max_df['Boys_Max'].tolist()
girls_max = max_df['Girls_Max'].tolist()
boys_min = min_df['Boys_Min'].tolist()
girls_min = min_df['Girls_Min'].tolist()

# Chi-square test on a 4 x 7 table: rows are the four groups, columns the subjects.
# NOTE(review): the rows combine gender with maximum/minimum, so the test compares
# four groups rather than gender alone -- confirm this is the intended design.
contingency_rows = [boys_max, girls_max, boys_min, girls_min]
chi2, p, _, _ = chi2_contingency(contingency_rows)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")


In [None]:
#Chi-Square Statistic: 28902.869144972545 
#P-value: 0.0
#Significant Association: The very small p-value (<0.05, or practically zero) suggests a highly significant association between gender (Boys and Girls) and subject choice for GCSE exams based on the provided data.

#Conclusion: From these results, it's evident that there's a statistically significant relationship between the subjects chosen (Biology, Chemistry, English Literature, Geography, History, Mathematics, and Physics) and gender (Boys and Girls) for GCSE exams.

#The above statement implies that gender might have an influence on the selection of subjects for these exams. The chi-square test indicates a strong association between gender and the choice of subjects, which suggests there is more to explore regarding the preferences and patterns in subject selection between boys and girls during their GCSE exams.


In [None]:
# Row filter: time_period 202223, establishment type "All schools", breakdown "Total".
is_target_period = GCSE_national_data["time_period"] == 202223
is_all_schools = GCSE_national_data["establishment_type"] == "All schools"
is_total_breakdown = GCSE_national_data["breakdown"] == "Total"
subject_data_df = GCSE_national_data[is_target_period & is_all_schools & is_total_breakdown]

# Source column -> descriptive label. The keys double as the list (and order)
# of columns kept from the filtered frame.
column_labels = {
    "gender": "Gender",
    "t_schools": "total_num_schools",
    "t_pupils": "total_num_pupils",
    "t_entbasics": "tot_num_entering_english_and_maths_GCSEs",
    "pt_entbasics": "perc_pupils_entering_english_and_maths_GCSEs",
    "t_triplesci_e": "tot_num_entering_triple_science",
    "pt_triplesci_e": "perc_pupils_entering_triple_science",
    "t_eballsci_ptq_ee": "tot_num_entering_one_of_phys_chem_bio_compsci",
    "pt_eballsci_ptq_ee": "perc_pupils_entering_one_of_phys_chem_bio_compsci",
    "t_ent_comb_sci": "tot_num_entering_combined_science",
    "pt_ent_comb_sci": "perc_pupils_entering_combined_science",
    "t_ent_hist_geog": "tot_num_entering_hist_and_geog",
    "pt_ent_hist_geog": "perc_pupils_entering_hist_and_geog",
    "t_entered_art": "tot_num_entering_any_arts_subj",
    "pt_entered_art": "perc_pupils_entering_any_arts_subj",
    "t_entered_music_qual": "tot_num_entering_music_qualification",
    "pt_entered_music_qual": "perc_pupils_entering_music_qualification",
    "t_multilan_e": "tot_num_entering_more_than_one_lang",
    "pt_multilan_e": "perc_pupils_entering_more_than_one_lang",
    "avg_att8": "avg_attainment8_score:",
    "avg_p8score": "avg_progress8_score:",
}

# Select the columns, drop every row with a missing value, apply the labels and
# renumber the rows from zero.
subject_data_reduced_df = (
    subject_data_df[list(column_labels)]
    .dropna(how="any")
    .rename(columns=column_labels)
    .reset_index(drop=True)
)

# Cell output: the reduced, relabelled table.
subject_data_reduced_df


In [None]:
# Cast both score columns to float, one column at a time.
for score_column in ('avg_attainment8_score:', 'avg_progress8_score:'):
    subject_data_reduced_df[score_column] = subject_data_reduced_df[score_column].astype(float)


In [None]:
# Pull the two score columns out of the reduced table.
attainment8_scores = subject_data_reduced_df.loc[:, 'avg_attainment8_score:']
progress8_scores = subject_data_reduced_df.loc[:, 'avg_progress8_score:']

# Pearson correlation (the pandas default, spelled out here) between the two columns.
correlation = attainment8_scores.corr(progress8_scores, method="pearson")
print(f"Correlation between Attainment 8 and Progress 8 scores: {correlation}")

In [None]:
# NOTE(review): this rebinds the path to "Data/..." while the national data was
# loaded from "Data_original/..." earlier in the notebook, and the value is not
# used in this cell -- confirm which location is correct.
GCSE_national_data_path = "Data/2223_national_data_provisional.csv"

# Scatter plot of the two score columns with a fitted regression line (drawn in
# red) and a 90% confidence band; the trailing ';' suppresses the Axes repr.
sns.regplot(data=subject_data_reduced_df, x="avg_attainment8_score:", y="avg_progress8_score:",
            line_kws={"color": "red"}, ci=90);
plt.xlabel('Average Attainment Score')
plt.ylabel('Average Progress Score')
plt.title('Correlation between Attainment and Progress Scores')
plt.tight_layout()
# A forward slash is used as the path separator: it works on every platform and
# avoids the invalid "\Q" escape sequence that a backslash produced in this literal.
plt.savefig("Graphs/Q2_Correlation between Attainment and Progress Scores.png", bbox_inches="tight")
plt.show()