In [1]:
import pandas as pd
import plotly.express as px

# Read the SUSS course catalogue from disk and preview the first rows
CSV_PATH = 'suss_courses.csv'
df = pd.read_csv(CSV_PATH)
df.head()


Unnamed: 0,title,code,url,types,featured,applications_open,applications_close,intake,duration,fees,area_of_interest
0,Graduate Diploma in Curriculum and Teaching,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
1,Graduate Diploma in Education,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
2,Master of Education,,https://www.suss.edu.sg/programmes/detail/mast...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
3,Graduate Diploma in Learning Design and Techno...,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-26,15-Oct-26,Jan-27,,,Adult Learning
4,Doctor of Philosophy (Gerontology),,https://www.suss.edu.sg/programmes/detail/phd-...,Graduate Studies,False,16-Apr-24,15-Apr-25,Jul-25,,,Gerontology


In [28]:
# Number of rows in the loaded DataFrame
len(df)

1423

# Course Type

Most courses are categorized under IHL Micro-Credentials, Modular Undergraduate Courses (610), followed by Modular Undergraduate Courses (209) and IHL Micro-Credentials, Modular Graduate Courses (190). Other common types include Modular Graduate Courses (97), Graduate Studies (66), and Short Courses with SkillsFuture Series (54). Fewer courses fall under types like Law Programmes and Certificate Courses.

In [2]:
# Tally courses per type and give the two resulting columns readable names
type_counts = df['types'].value_counts().reset_index()
type_counts.columns = ['Course Type', 'Number of Courses']
# Ascending order: Plotly draws the first row at the bottom of a horizontal
# bar chart, so the most common type ends up at the top
type_counts = type_counts.sort_values('Number of Courses', ascending=True)

# Horizontal bars, annotated with and shaded by the course count
fig = px.bar(
    type_counts,
    y='Course Type',
    x='Number of Courses',
    orientation='h',
    color='Number of Courses',
    color_continuous_scale='Blues',
    text='Number of Courses',
)

# Centered title, transparent backgrounds, no caption on the category axis
fig.update_layout(
    title='Number of Courses by Type',
    title_x=0.5,
    xaxis_title='Number of Courses',
    yaxis_title='',
    font=dict(family='Arial', size=14),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
)

# Put the count labels just beyond the end of each bar
fig.update_traces(textposition='outside')

fig.show()

“IHL Micro-Credentials, Modular Undergraduate Course” accounts for the largest share at 42.9% (610 courses), followed by “Modular Undergraduate Course” (209, 14.7%) and “IHL Micro-Credentials, Modular Graduate Course” (190, 13.4%). The remaining course types each contribute less than 10% individually.

In [3]:
# Same counts as the bar chart above, shown as shares of the whole
fig = px.pie(
    type_counts,
    names='Course Type',
    values='Number of Courses',
    title='Course Type Distribution',
)
fig.show()


# Number of Courses by Area of Interest

Business Administration has the highest number of courses (265), followed by Linguistics and Languages (201), Management (184), and Finance (126). Other notable areas include Social Services, Digital Media, and Science and Technology, while fields like Legal and International Trade have relatively few courses.

In [4]:
# Tally courses per area of interest
area_counts = df['area_of_interest'].value_counts().reset_index()
area_counts.columns = ['Area of Interest', 'Number of Courses']

# Ascending order so the largest area is drawn as the top bar
area_counts = area_counts.sort_values(by='Number of Courses', ascending=True)

# Horizontal bars, annotated with and shaded by the course count
fig = px.bar(
    area_counts,
    y='Area of Interest',
    x='Number of Courses',
    orientation='h',
    title='Number of Courses by Area of Interest',
    text='Number of Courses',
    color='Number of Courses',
    color_continuous_scale='Blues',
)

# Centered title, transparent backgrounds, no caption on the category axis
fig.update_layout(
    title_x=0.5,
    xaxis_title='Number of Courses',
    yaxis_title='',
    font=dict(family="Arial", size=14),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)
# Count labels go just beyond the end of each bar
fig.update_traces(textposition='outside')

fig.show()

# Featured Courses

The featured courses primarily fall under the Adult Learning domain, including programmes such as the Graduate Diploma in Curriculum and Teaching, Graduate Diploma in Education, and Master of Education. In addition, short-term, high-value offerings in Information Technology (e.g., Introduction to Generative Artificial Intelligence) and sustainability (e.g., Carbon Management – A Systematic Approach to Manage Greenhouse Gas Emissions, which the dataset files under "Others") are priced at SGD 700 per day (SGD 700 for the one-day course and SGD 1,400 for the two-day course), underscoring their strategic importance and relevance to emerging national and global priorities.



In [5]:
# Keep rows flagged as featured. The flag is compared as upper-cased text so
# that boolean True and strings such as "true"/"TRUE" are all accepted.
featured_flag = df['featured'].astype(str).str.upper()
featured_courses = df[featured_flag == 'TRUE']
featured_courses

Unnamed: 0,title,code,url,types,featured,applications_open,applications_close,intake,duration,fees,area_of_interest
0,Graduate Diploma in Curriculum and Teaching,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
1,Graduate Diploma in Education,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
2,Master of Education,,https://www.suss.edu.sg/programmes/detail/mast...,Graduate Studies,True,16-Apr-25,15-Oct-25,Jan-26,,,Adult Learning
3,Graduate Diploma in Learning Design and Techno...,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-26,15-Oct-26,Jan-27,,,Adult Learning
12,Graduate Diploma in Learning Design and Techno...,,https://www.suss.edu.sg/programmes/detail/grad...,Graduate Studies,True,16-Apr-26,15-Oct-26,Jan-27,,,Adult Learning
151,Carbon Management – A Systematic Approach to M...,(CET258),https://www.suss.edu.sg/courses/short-course/d...,"Short Course, SkillsFuture Series",True,14-Mar-25,19-Jun-25,10-Jul-25,2 days,$1400,Others
152,Introduction to Generative Artificial Intellig...,(CET175),https://www.suss.edu.sg/courses/short-course/d...,"Short Course, SkillsFuture Series",True,12-Sep-24,13-Nov-24,04-Dec-24,1 day,$700,Information Technology


In [6]:
# Lift the column-width limit so long cell values (e.g. titles) are displayed in full
pd.set_option('display.max_colwidth', None)

In [7]:
# Compact view of the featured courses: title, area of interest and fee only
featured_courses[['title', 'area_of_interest','fees']]


Unnamed: 0,title,area_of_interest,fees
0,Graduate Diploma in Curriculum and Teaching,Adult Learning,
1,Graduate Diploma in Education,Adult Learning,
2,Master of Education,Adult Learning,
3,Graduate Diploma in Learning Design and Technology,Adult Learning,
12,Graduate Diploma in Learning Design and Technology,Adult Learning,
151,Carbon Management – A Systematic Approach to Manage Greenhouse Gas Emissions,Others,$1400
152,Introduction to Generative Artificial Intelligence (AI),Information Technology,$700


# Duration
As programme listings do not indicate course durations, the following analysis pertains solely to courses under the 'Courses' category. Notably, over 93% of these courses are structured with a standard duration of six months. Only a small proportion deviate from this norm, with shorter or longer durations being relatively rare.

In [8]:
# Tally courses per duration label (rows without a duration are not counted)
duration_counts = df['duration'].value_counts().reset_index()
duration_counts.columns = ['Duration', 'Number of Courses']
# Ascending order so the most common duration is drawn as the top bar
duration_counts = duration_counts.sort_values('Number of Courses', ascending=True)

# Horizontal bars, annotated with and shaded by the course count
fig = px.bar(
    duration_counts,
    y='Duration',
    x='Number of Courses',
    orientation='h',
    color='Number of Courses',
    color_continuous_scale='PuBu',
    text='Number of Courses',
)

# Centered title, transparent backgrounds, no caption on the category axis
fig.update_layout(
    title='Number of Courses by Duration',
    title_x=0.5,
    xaxis_title='Number of Courses',
    yaxis_title='',
    font=dict(family='Arial', size=14),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
)
# Count labels go just beyond the end of each bar
fig.update_traces(textposition='outside')

fig.show()

In [9]:
# Share of each duration among all counted courses, in percent (2 decimals)
total_with_duration = duration_counts['Number of Courses'].sum()
duration_counts['Percent'] = (duration_counts['Number of Courses'] / total_with_duration * 100).round(2)


def _slice_label(row):
    """Return the on-slice text: duration and percent for slices above 3%, otherwise nothing."""
    if row['Percent'] > 3:
        return f"{row['Duration']}<br>{row['Percent']}%"
    return ''


# Only label slices that are large enough to hold readable text
duration_counts['Label'] = duration_counts.apply(_slice_label, axis=1)

# Smallest slice first, paired with the matching number of shades taken from
# the dark end of the Blues scale
duration_counts = duration_counts.sort_values(by='Number of Courses', ascending=True)
colors = px.colors.sequential.Blues[-len(duration_counts):]

# Pie of course counts per duration
fig = px.pie(
    duration_counts,
    names='Duration',
    values='Number of Courses',
    color=duration_counts['Duration'],
    color_discrete_sequence=colors,
    title='Distribution of Courses by Duration',
)

# Show the conditional labels on the slices and expose the percent in the tooltip
fig.update_traces(
    textinfo='text',
    text=duration_counts['Label'],
    customdata=duration_counts[['Percent']],
    hovertemplate="<b>%{label}</b><br>Number of Courses: %{value}<br>Percent: %{customdata[0]}%"
)

fig.update_layout(title_x=0.5)
fig.show()

# Fees
As programme listings do not provide fee information, the following data on course fees is based solely on courses listed under the 'Courses' category.

## Course Fee Distribution Summary
As programme listings do not indicate fees, the following analysis pertains solely to courses under the 'Courses' category. The distribution of course fees shows that 25% of courses are priced below 1392 SGD (Q1), while the median fee is 1461 SGD—indicating that half of all courses cost less than this amount and half cost more. Additionally, 75% of courses are priced below 1936 SGD (Q3). Any course with a fee above 2676 SGD is considered an outlier, with the highest recorded fee reaching 7875 SGD.

| Name             | Value (SGD) | Explanation                                                                 |
|------------------|-------------|------------------------------------------------------------------------------|
| **Min**          | 0           | Lowest fee among courses that list one; programme listings without fee information are excluded from these statistics |
| **Q1 (25%)**     | 1392        | 25% of courses have fees lower than 1392 SGD                                |
| **Median (50%)** | 1461        | The middle value — half the courses are cheaper, half are more expensive    |
| **Q3 (75%)**     | 1936        | 75% of courses have fees lower than 1936 SGD                                |
| **Upper Fence**  | 2676        | Values above this are considered outliers                                   |
| **Max**          | 7875        | Extremely high-priced course(s)                                             |


In [10]:
# Strip "$" and "," from the fee text, then parse it as a number.
# Anything that is still not numeric (including missing fees, which become
# the text "nan") is coerced to NaN.
fees_as_text = df['fees'].astype(str)
fees_stripped = fees_as_text.str.replace(r'[$,]', '', regex=True)
df['fees'] = pd.to_numeric(fees_stripped, errors='coerce')

In [11]:
# Fees of the courses that actually list one
fees_clean = df.loc[df['fees'].notna(), 'fees']

# Count, mean, spread and quartiles, rounded to cents
fees_summary = fees_clean.describe().round(2)

print(fees_summary)

count    1189.00
mean     1739.81
std       758.26
min         0.00
25%      1392.00
50%      1461.00
75%      1936.00
max      7875.00
Name: fees, dtype: float64


## Top expensive courses

When ranked by course fees, the top 15 most expensive courses are primarily concentrated in the field of Business Administration, which accounts for 7 out of the 15 courses (46.7%). Notable examples include Business Economics and Public Policy, Marketing for the Future Economy, and Managing Technology and Innovation, each priced at SGD 7,875 with a duration of 6 months.

The second most represented area is Science and Technology with 5 courses (33.3%), such as SAP Procurement, SAP Financials – Management Accounting, and SAP Financials – Financial Accounting, each priced at SGD 7,246.

Social Services, Design, and Digital Media each appear once (6.67%). For instance, the Family Business: Navigating Success and Legacy course under Social Services is a short course costing SGD 3,900. Similarly, Collaborative Applied Project under Design and Audio and Video Production Techniques under Digital Media are included among the highest-priced offerings.

In [12]:
# Keep only the rows that have a numeric fee
df_clean = df.dropna(subset=['fees'])

# The highest-fee courses, most expensive first
TOP_N_EXPENSIVE = 15
top_15_expensive = df_clean.sort_values(by='fees', ascending=False).head(TOP_N_EXPENSIVE)

# Backward-compatible alias: the original name says "5" although the frame has
# always held 15 rows, and later cells still refer to it by this name
top_5_expensive = top_15_expensive

In [13]:
# Display the highest-fee courses (despite its name, the frame holds the top 15)
top_5_expensive

Unnamed: 0,title,code,url,types,featured,applications_open,applications_close,intake,duration,fees,area_of_interest
140,Business Economics and Public Policy,(ECO631),https://www.suss.edu.sg/courses/detail/eco631,Modular Graduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7875.0,Business Administration
528,Marketing for the Future Economy,(MKT631),https://www.suss.edu.sg/courses/detail/mkt631,Modular Graduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7875.0,Business Administration
524,Managing Technology and Innovation,(BUS651),https://www.suss.edu.sg/courses/detail/bus651,Modular Graduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7875.0,Business Administration
661,Strategic Leadership & Management for Excellence,(BUS613),https://www.suss.edu.sg/courses/detail/bus613,Modular Graduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7875.0,Business Administration
617,SAP Procurement,(ICT374),https://www.suss.edu.sg/courses/detail/ict374,Modular Undergraduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7246.0,Science and Technology
614,SAP Financials-Management Accounting,(ICT372),https://www.suss.edu.sg/courses/detail/ict372,Modular Undergraduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7246.0,Science and Technology
613,SAP Financials-Financial Accounting,(ICT370),https://www.suss.edu.sg/courses/detail/ict370,Modular Undergraduate Course,False,01-May-25,15-Jun-25,Jul-25,6 months,7246.0,Science and Technology
276,Collaborative Applied Project,(DES591),https://www.suss.edu.sg/courses/detail/des591,"IHL Micro-Credentials, Modular Graduate Course",False,01-May-25,15-Jun-25,Jul-25,6 months,5280.0,Design
235,Applied Project,(ANL588),https://www.suss.edu.sg/courses/detail/anl588,"IHL Micro-Credentials, Modular Graduate Course",False,01-May-25,15-Jun-25,Jul-25,12 months,5280.0,Business Administration
1172,Fundamentals of Contemporary Business Management,(CET257),https://www.suss.edu.sg/courses/short-course/detail/fundamentals-of-contemporary-business-management,"Short Course, SkillsFuture Series",False,14-Sep-23,20-Feb-24,05-Mar-24,4 days,4400.0,Business Administration


In [14]:
# Compact view of the highest-fee courses: only the columns discussed in the text
top_5_expensive[['title','types','intake','duration','fees','area_of_interest']]

Unnamed: 0,title,types,intake,duration,fees,area_of_interest
140,Business Economics and Public Policy,Modular Graduate Course,Jul-25,6 months,7875.0,Business Administration
528,Marketing for the Future Economy,Modular Graduate Course,Jul-25,6 months,7875.0,Business Administration
524,Managing Technology and Innovation,Modular Graduate Course,Jul-25,6 months,7875.0,Business Administration
661,Strategic Leadership & Management for Excellence,Modular Graduate Course,Jul-25,6 months,7875.0,Business Administration
617,SAP Procurement,Modular Undergraduate Course,Jul-25,6 months,7246.0,Science and Technology
614,SAP Financials-Management Accounting,Modular Undergraduate Course,Jul-25,6 months,7246.0,Science and Technology
613,SAP Financials-Financial Accounting,Modular Undergraduate Course,Jul-25,6 months,7246.0,Science and Technology
276,Collaborative Applied Project,"IHL Micro-Credentials, Modular Graduate Course",Jul-25,6 months,5280.0,Design
235,Applied Project,"IHL Micro-Credentials, Modular Graduate Course",Jul-25,12 months,5280.0,Business Administration
1172,Fundamentals of Contemporary Business Management,"Short Course, SkillsFuture Series",05-Mar-24,4 days,4400.0,Business Administration


In [16]:
# Tally the areas of interest represented among the most expensive courses
area_counts = top_5_expensive['area_of_interest'].value_counts().reset_index()
area_counts.columns = ['Area of Interest', 'Number of Courses']

# Smallest group first, paired with as many shades as there are groups taken
# from the dark end of the Blues scale (small -> light, large -> dark)
area_counts = area_counts.sort_values(by='Number of Courses', ascending=True)
colors = px.colors.sequential.Blues[-len(area_counts):]

# Pie of course counts per area
fig = px.pie(
    area_counts,
    names='Area of Interest',
    values='Number of Courses',
    color=area_counts['Area of Interest'],
    color_discrete_sequence=colors,
    title='Area of Interest Distribution in Top 15 Most Expensive Courses',
)

# Label each slice with its share and name; center the title
fig.update_traces(textfont_size=14, textinfo='percent+label')
fig.update_layout(title_x=0.5)

fig.show()

## Short Courses Fees Analysis

In the category of short courses, the top 15 most expensive offerings are predominantly focused on the area of Social Services, which accounts for 73.3% (11 out of 15 courses). This includes a range of counselling-related programmes such as Couple Counselling, Narrative Therapy in Family Counselling, and Counselling Children and Adolescents, each priced at approximately SGD 2400 for a duration of four days. Business Administration makes up 13.3% (2 courses), represented by courses like Executive Management Masterclass Programme. The remaining two areas—"Others" and the combined category "Business Administration, International Trade, Science and Technology"—each contribute one course (6.7%), including programmes like Executive Leadership Programme for Learning and Development Professionals and Counselling on Ageing Family Issues.

In [17]:
# All courses that have a fee, ordered from most to least expensive
df_fees_des=df_clean.sort_values(by='fees', ascending=False)

In [18]:
# List the distinct duration labels among courses that have a fee
df_fees_des['duration'].unique()

array(['6 months', '12 months', '4 days', '3 full days', '2 days',
       '3 days', '1 day', '4 x half-day', '30 months'], dtype=object)

In [29]:
# Short courses are everything except the month-long durations listed here
# (the labels come from the unique duration values inspected above)
is_month_long = df_fees_des['duration'].isin(['6 months', '12 months', '30 months'])
Short_Course_Fees = df_fees_des[~is_month_long]

# df_fees_des is already sorted by fee (descending), so the head is the top 15
Top_15_Short_Course = Short_Course_Fees.head(15)
Top_15_Short_Course[['title','types','intake','duration','fees','area_of_interest']]

Unnamed: 0,title,types,intake,duration,fees,area_of_interest
1172,Fundamentals of Contemporary Business Management,"Short Course, SkillsFuture Series",05-Mar-24,4 days,4400.0,Business Administration
742,Executive Management Masterclass Programme,Short Course,28-Mar-25,3 full days,3960.0,Business Administration
1037,Family Business: Navigating Success and Legacy,Short Course,29-Oct-24,2 days,3900.0,Social Services
1051,Executive Leadership Programme for Learning and Development Professionals,Short Course,01-Oct-24,3 days,2800.0,Others
155,Human Development: A Psychosocial Approach (Classroom & Synchronous),Short Course,05-Aug-25,4 days,2400.0,Social Services
183,Couple Counselling: Emotionally - Focused Couple Therapy (Classroom & Synchronous),Short Course,01-Sep-25,4 days,2400.0,Social Services
1036,Micro and Advanced Skills in Counselling,Short Course,12-Feb-25,4 days,2400.0,Social Services
1035,Theories and Techniques in Counselling,Short Course,25-Feb-25,4 days,2400.0,Social Services
1136,Practicum: Specialised Counselling Skills,"Short Course, SkillsFuture Series",TBC,1 day,2400.0,Social Services
741,Narrative Therapy in Family Counselling (Classroom & Synchronous),Short Course,22-Apr-25,4 days,2400.0,Social Services


In [25]:
# Tally the areas of interest represented among the most expensive short courses
area_counts = Top_15_Short_Course['area_of_interest'].value_counts().reset_index()
area_counts.columns = ['Area of Interest', 'Number of Courses']

# Smallest group first, paired with as many shades as there are groups taken
# from the dark end of the Blues scale (small -> light, large -> dark)
area_counts = area_counts.sort_values(by='Number of Courses', ascending=True)
colors = px.colors.sequential.Blues[-len(area_counts):]

# Pie of course counts per area
fig = px.pie(
    area_counts,
    names='Area of Interest',
    values='Number of Courses',
    color=area_counts['Area of Interest'],
    color_discrete_sequence=colors,
    title='Area of Interest Distribution in Top 15 Most Expensive Short Courses',
)

# Label each slice with its share and name; center the title
fig.update_traces(textfont_size=14, textinfo='percent+label')
fig.update_layout(title_x=0.5)

fig.show()

## Distribution of Course Fees

Half of all courses are priced between SGD 1392 and SGD 1936 (the interquartile range), with a median fee of SGD 1461. While most course fees cluster in and around this range, a number of outliers exist, including courses priced as high as SGD 7875.

In [30]:
import plotly.express as px

# Single box plot of all course fees; only outlier points are drawn individually
fig = px.box(
    df,
    y='fees',
    points='outliers',
    title='Distribution of Course Fees',
    labels={'fees': 'Course Fees (SGD)'},
    color_discrete_sequence=['#1f77b4'],
)

# Centered title and transparent backgrounds
fig.update_layout(
    title_x=0.5,
    yaxis_title='Course Fees (SGD)',
    font=dict(size=14),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.show()

In [31]:
import pandas as pd
import plotly.express as px

# Bucket edges in SGD and the display label for each bucket.
# Buckets are right-closed; include_lowest also admits a fee of exactly 0.
bins = [0, 1000, 1500, 2000, 2500, 3000, 4000, 8000]
labels = ['0–1K', '1K–1.5K', '1.5K–2K', '2K–2.5K', '2.5K–3K', '3K–4K', '4K–8K']
df['fee_range'] = pd.cut(df['fees'], bins=bins, labels=labels, include_lowest=True)

# Courses per bucket, first in bucket order ...
fee_counts = df['fee_range'].value_counts().sort_index().reset_index()
fee_counts.columns = ['Fee Range', 'Number of Courses']

# ... then re-ordered smallest first so that colour depth follows slice size
fee_counts = fee_counts.sort_values(by='Number of Courses', ascending=True)

# As many shades as there are buckets, taken from the dark end of the Blues scale
colors = px.colors.sequential.Blues[-len(fee_counts):]

# Pie of course counts per fee bucket
fig = px.pie(
    fee_counts,
    names='Fee Range',
    values='Number of Courses',
    color=fee_counts['Fee Range'],  # colour is keyed on the bucket label
    color_discrete_sequence=colors,
    title='Course Fee Distribution by Range',
)

# Label each slice with its share and name; center the title
fig.update_traces(textfont_size=14, textinfo='percent+label')
fig.update_layout(title_x=0.5)

fig.show()



## Course Fee Distribution by Course Type

Each horizontal box represents the distribution of course fees for a specific course type. The key components are:

* Median:
The line inside the box indicates the median (50th percentile). It divides the data into two equal halves — 50% of course fees are lower, and 50% are higher.

* Box Edges (Quartiles):

    - Left edge (Q1): 25th percentile — 25% of data points are below this value.

    - Right edge (Q3): 75th percentile — 75% of data points are below this value.
    - The box itself shows the interquartile range (IQR), where the middle 50% of the data is concentrated.
*  "Whiskers":
The lines extending from the box reach the smallest and largest values that lie within 1.5 times the IQR of the box edges (i.e., no lower than Q1 − 1.5 × IQR and no higher than Q3 + 1.5 × IQR). They do not include outliers.

* Outliers:

  Small dots beyond the whiskers indicate extreme values, representing unusually high or low course fees — for example, exceptionally expensive or very cheap courses.



In [33]:
# Keep rows that have both a fee and a course type.
# NOTE: this rebinds df_clean, which an earlier cell defined with a narrower
# filter (fees only); re-running earlier cells afterwards will see this version.
df_clean = df.dropna(subset=['fees', 'types'])

# One horizontal box per course type: fees run along x, course types are
# listed on y. The title used to say "(Vertical)", which contradicted the
# actual orientation.
fig = px.box(
    df_clean,
    x='fees',
    y='types',
    title='Course Fee Distribution by Course Type (Horizontal)',
    labels={'types': 'Course Type', 'fees': 'Course Fee (SGD)'},
    points='outliers',
    color_discrete_sequence=['#1f77b4']  # single colour for every box
)

# Layout adjustments for clarity
fig.update_layout(
    xaxis_title='Course Fee (SGD)',
    yaxis_title='Course Type',
    title_x=0.5,
    font=dict(size=14),
    height=600
)

fig.show()

# Intake

Starting in 2025, there is a significant surge in course offerings, marking a major expansion in programme availability. As shown in the intake timeline, the number of courses jumps sharply from under 100 in previous years to over 1,000 in 2025, with the highest intake concentrated in July 2025 (721 courses) and January 2025 (319 courses). This dramatic increase reflects the institution's shift towards a more extensive and diversified educational strategy from that year onward.

Looking at the areas of interest, Business Administration (227 courses), Science and Technology (178), Linguistics and Languages (117), and Humanities and Social Sciences (112) are the most prominent in 2025. These four domains alone account for a large share of the new offerings, signaling strategic emphasis on both industry-relevant skills and foundational academic disciplines.

By contrast, while course availability continues into 2026 and beyond, the numbers are substantially lower. In 2026, the top areas remain consistent but at smaller scales—for example, Linguistics and Languages drops to 10 courses, and Adult Learning to 8 courses. A considerable portion of 2026–2027 offerings also fall under a “To be confirmed” category, with the largest share again in Linguistics and Languages and Adult Learning, suggesting tentative future planning or programmes in development.



In [34]:
# Normalise intake labels to "Mon-YY" by dropping a leading day of month
# (e.g. "05-Mar-24" -> "Mar-24"); missing values become the text "nan"
df['intake'] = df['intake'].astype(str).str.replace(r'^\d{1,2}-', '', regex=True)

# Parse to a datetime anchored on the first of the month; labels that do not
# match the "Mon-YY" pattern are coerced to NaT
df['intake_parsed'] = pd.to_datetime('01-' + df['intake'], format='%d-%b-%y', errors='coerce')

# Count courses per intake and order them chronologically.
# groupby drops NaT keys by default, so intakes without a parseable date are
# left out of this chart.
# (An earlier value_counts-based intake_counts was computed here and then
# immediately overwritten; that dead code has been removed.)
intake_counts = (
    df.groupby(['intake', 'intake_parsed'])
    .size()
    .reset_index(name='Number of Courses')
    .sort_values(by='intake_parsed')
)

# Plot bar chart; the x axis keeps the chronological row order
fig = px.bar(
    intake_counts,
    x='intake',
    y='Number of Courses',
    title='Number of Courses by Intake Period (Chronological)',
    text='Number of Courses',
    color='Number of Courses',
    color_continuous_scale='Blues'
)

fig.update_layout(
    xaxis_title='Intake (Chronological)',
    yaxis_title='Number of Courses',
    title_x=0.5,
    font=dict(size=14),
    xaxis_tickangle=-45
)
fig.update_traces(textposition='outside')

fig.show()

## Future Courses by Area of Interest (2025 and Beyond)





In [35]:
# Year of each parsed intake (NaN where the intake could not be parsed)
df['intake_year'] = df['intake_parsed'].dt.year
# Courses whose intake falls in 2025 or later; rows with no parsed year fail
# the comparison and are excluded
future_courses = df[df['intake_year'] >= 2025]

# Drop missing area_of_interest
future_courses = future_courses.dropna(subset=['area_of_interest'])

# Group by year and area
grouped = future_courses.groupby(['intake_year', 'area_of_interest']).size().reset_index(name='Number of Courses')

# Stacked bar chart: year on x, course count on y, one colour per area
fig = px.bar(
    grouped,
    x='intake_year',
    y='Number of Courses',
    color='area_of_interest',
    title='Future Courses by Area of Interest (2025 and Beyond)',
    labels={'intake_year': 'Intake Year', 'area_of_interest': 'Area of Interest'},
    text='Number of Courses'
)

# Layout beautification
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Courses',
    title_x=0.5,
    font=dict(size=14),
    barmode='stack'
)
# intake_year is numeric, so force one tick per whole year; otherwise the
# axis may show fractional ticks such as 2025.5
fig.update_xaxes(dtick=1)
fig.update_traces(textposition='inside')

fig.show()

In [36]:
# Courses whose intake has not been announced. The data spells this both as
# "To be confirmed" and as the abbreviation "TBC", so match either form
# (case- and whitespace-insensitive).
TBC_LABELS = ['to be confirmed', 'tbc']
intake_normalised = df['intake'].astype(str).str.strip().str.lower()
tbc_courses = df[intake_normalised.isin(TBC_LABELS)]

# Count the unconfirmed courses per area of interest; ascending order puts
# the largest bar at the top of the horizontal chart
tbc_area_counts = tbc_courses['area_of_interest'].value_counts().reset_index()
tbc_area_counts.columns = ['Area of Interest', 'Number of TBC Courses']
sorted_tbc_area_counts = tbc_area_counts.sort_values(by='Number of TBC Courses', ascending=True)

fig = px.bar(
    sorted_tbc_area_counts,
    x='Number of TBC Courses',
    y='Area of Interest',
    orientation='h',
    title='Top Areas with "To be Confirmed" Intakes',
    text='Number of TBC Courses',
    color='Number of TBC Courses',
    color_continuous_scale='Blues'
)

fig.update_layout(
    xaxis_title='Number of Courses',
    yaxis_title='Area of Interest',
    title_x=0.5
)
fig.update_traces(textposition='outside')
fig.show()

In [37]:
# Tally course types among the future-intake courses.
# NOTE: this rebinds type_counts, which an earlier cell filled with the
# counts for the whole catalogue.
type_counts = future_courses['types'].value_counts().reset_index()
type_counts.columns = ['Course Type', 'Number of Courses']

# Ascending order: Plotly draws the first row at the bottom, so the most
# common type ends up as the top bar
type_counts_ascending = type_counts.sort_values(by='Number of Courses', ascending=True)

# Horizontal bars, annotated with and shaded by the course count
fig = px.bar(
    type_counts_ascending,
    y='Course Type',
    x='Number of Courses',
    orientation='h',
    text='Number of Courses',
    color='Number of Courses',
    color_continuous_scale='Blues',
    title='Distribution of Course Types in Future Intakes (2025 and Beyond)',
)

# Axis captions and a centered title
fig.update_layout(
    title_x=0.5,
    xaxis_title='Number of Courses',
    yaxis_title='Course Type',
    font=dict(size=13),
)

# Count labels go just beyond the end of each bar
fig.update_traces(textposition='outside')
fig.show()

In [38]:
# Future-intake courses that have both a fee and an area of interest.
# NOTE: this rebinds fees_clean, which an earlier cell used for a Series of fees.
fees_clean = future_courses.dropna(subset=['fees', 'area_of_interest']).copy()

# Areas ordered by course count, largest first (value_counts sorts descending)
area_order = fees_clean['area_of_interest'].value_counts().index.tolist()

# Stacked histogram of fees, one colour per area.
# The stacking and legend order is passed to Plotly explicitly through
# category_orders. The previous approach built an ordered Categorical and
# then cast it back to str, which discarded the order before Plotly saw it.
fig = px.histogram(
    fees_clean,
    x='fees',
    nbins=40,
    title='Distribution of Course Fees by Area of Interest',
    labels={'fees': 'Course Fee (SGD)', 'area_of_interest': 'Area of Interest'},
    color='area_of_interest',
    category_orders={'area_of_interest': area_order},
    barmode='stack'
)

fig.update_layout(
    xaxis_title='Course Fee (SGD)',
    yaxis_title='Number of Courses',
    title_x=0.5,
    font=dict(size=14),
    legend_title='Area of Interest'
)

fig.show()


