In [24]:
import requests 
from bs4 import BeautifulSoup
url = 'https://www.ntu.edu.sg/scse/about-us/our-people'

# Fetch the page. requests does not apply a timeout unless one is passed, so
# set one explicitly, and fail on HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup_source = response.text
# NOTE(review): 'html' lets BeautifulSoup choose among the installed HTML
# parsers, so the parse tree may differ between environments — consider
# naming a specific parser once the expected output is confirmed.
soup = BeautifulSoup(soup_source,'html')

# Only the first <ol> on the page is read; each <li> in it is one broad area.
# Assumes each <li> has the form "<strong>Broad area</strong>: a, b (c), ... ."
# — TODO confirm against the live page.
area_list = soup.find("ol")
if area_list is None:
    raise RuntimeError(f"No <ol> element found at {url}; the page layout may have changed")

# Maps broad area name -> list of specific area strings parsed from its <li>.
areas = {}
for row in area_list.find_all('li'):
    heading = row.find('strong')
    if heading is None:
        # No <strong> heading means there is no broad-area name to key on.
        continue
    broad_area = heading.text
    # Keep the text after the last ': ', drop newlines, and turn parentheses
    # into ', ' so parenthesised items split like any other list entry.
    specific_areas = row.text.split(': ')[-1].replace('\n', '').replace('(', ', ').replace(')', ', ')
    specific_areas = specific_areas.split(', ')
    specific_areas = [a for a in specific_areas if a != '' and a != '.']
    if specific_areas:
        # Cut the last entry at its first '.', removing the trailing full stop.
        specific_areas[-1] = specific_areas[-1].split('.')[0]
    print(f'{broad_area}\n{specific_areas}\n')
    areas[broad_area] = specific_areas

Hardware & Embedded Systems
['low-energy hardware', 'hardware acceleration', 'architecture-aware algorithms', 'reconfigurable computing', 'embedded sensing', 'multi-functional sensors', 'IP-enabled devices']

Cyber Security and Forensics
['data privacy', 'privacy-preserving analytics', 'malware analysis', 'formal systems verification', 'mobile device security', 'cyber attack attribution', 'dark web monitoring', 'botnet detection', 'encrypted data processing', 'biometrics', 'cyber forensics']

Data Management & Analytics
['spatio-temporal databases', 'graph-structured databases', 'big data analytics', 'predictive analytics', 'social media analytics', 'question-answer systems', 'sentiment analysis']

Computational Intelligence
['cognitive and neuro systems', 'decision support systems', 'evolutionary', 'memetic and statistical learning', 'fuzzy systems', 'intelligent multi-agents', 'intelligent system and devices', 'machine learning and nature-inspired systems']

Computer Vision & Languag

In [38]:
import pandas as pd
# Read the keyword table from disk and collect its 'Keywords' column into a
# plain Python list; the bare name on the last line displays that list.
keywords_df = pd.read_csv("./data/keywords_count.csv")
keywords = [keyword for keyword in keywords_df['Keywords']]
keywords

['Computer Science and Engineering',
 'Smart Nation',
 'Disruptive Technology and Materials',
 'Electrical and Electronic Engineering',
 'Info-Communication Technology',
 'Interactive Digital Media',
 'Disrupting the Future of Industry and Manufacturing',
 'Internet & Communications',
 'Artificial and Augmented Intelligence',
 'Computational Intelligence',
 'Digital Economy',
 'Computer Networks and Communications',
 'Data Mining',
 'Smart Cities',
 'Data Management',
 'Neurotechnology',
 'Computing Hardware and Architecture',
 'Image and Video Analytics',
 'Data Science',
 'Social Physics',
 'Medical Technology',
 'Parallel and Distributed Computing',
 'Computational and Systems Biology',
 'Brain Image Analysis',
 'Mathematics',
 'EdTech & Science Communication',
 'Quantum Technology',
 'Hardware and Embedded Systems',
 'Machine Learning',
 'Urban Planning and Human Centered Design',
 'Information Sciences and Systems',
 'Cyber Security and Forensics',
 'Graphics and Interactive Compu

In [29]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Sample data
specific_areas = keywords # List of specific areas

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features
X_tfidf = tfidf_vectorizer.fit_transform(specific_areas)

# Name given to each cluster index.
# NOTE(review): K-Means cluster indices are arbitrary, so these names are
# attached by index only and say nothing about what a cluster contains.
# Inspect each cluster's members before trusting the labels.
cluster_to_broader_area = {
    0: 'Hardware & Embedded Systems',
    1: 'Cyber Security and Forensics',
    2: 'Data Management & Analytics',
    3: 'Computational Intelligence',
    4: 'Computer Vision & Language',
    5: 'Graphics & Interactive Computing',
    6: 'Computer Networks & Communications',
    7: 'Parallel & Distributed Computing',
    8: 'Biomedical Informatics'
}

# K-Means Clustering
# Derive the cluster count from the label mapping so every label K-Means can
# emit has a name; a separately hard-coded count could drift and cause a
# KeyError in the lookup below.
num_clusters = len(cluster_to_broader_area)
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it for reproducible clusters.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_tfidf)

# Get cluster assignments (one label per entry of specific_areas)
cluster_labels = kmeans.labels_

# Pair each specific area with the broader-area name of its cluster
specific_areas_with_broader_areas = [
    (specific_area, cluster_to_broader_area[cluster_label])
    for specific_area, cluster_label in zip(specific_areas, cluster_labels)
]

# Seed one empty list per broader area, in cluster-index order. Iterating the
# mapping's values directly (rather than a set of them) keeps the key order
# the same on every run; set order for strings varies with hash randomisation.
broader_to_specific_areas = {b_area: [] for b_area in cluster_to_broader_area.values()}

# Populate the dictionary with specific areas
for specific_area, broader_area in specific_areas_with_broader_areas:
    broader_to_specific_areas[broader_area].append(specific_area)

# Print the results. The loop variable has its own name so it does not
# overwrite the `specific_areas` input list defined at the top of the cell.
for broader_area, member_areas in broader_to_specific_areas.items():
    print(f"Broader Area: {broader_area}")
    for specific_area in member_areas:
        print(f"  - Specific Area: {specific_area}")

Broader Area: Hardware & Embedded Systems
  - Specific Area: Computer Science and Engineering
  - Specific Area: Electrical and Electronic Engineering
  - Specific Area: Disrupting the Future of Industry and Manufacturing
  - Specific Area: Computational and Systems Biology
  - Specific Area: Semantics of Computation
  - Specific Area: Internet of Things
  - Specific Area: Frailty: Use of AI-R Tool for Rehabilitation at Home Reducing the Need for Hospital Physiotherapy
  - Specific Area: Mechanical & Manufacturing Engineering
  - Specific Area: Interface of Biology & Engineering (Engineering in Biology)
  - Specific Area: Biomedical Engineering
Broader Area: Computer Networks & Communications
  - Specific Area: Info-Communication Technology
  - Specific Area: Data Science
  - Specific Area: EdTech & Science Communication
  - Specific Area: Biomedical Informatics and Data Science
  - Specific Area: Nanotechnology & Nano-Science
  - Specific Area: Science, Technology and Society
  - Spec

In [31]:
# Show the broader-area -> specific-areas dictionary as the cell's output.
broader_to_specific_areas

{'Hardware & Embedded Systems': ['Computer Science and Engineering',
  'Electrical and Electronic Engineering',
  'Disrupting the Future of Industry and Manufacturing',
  'Computational and Systems Biology',
  'Semantics of Computation',
  'Internet of Things',
  'Frailty: Use of AI-R Tool for Rehabilitation at Home Reducing the Need for Hospital Physiotherapy',
  'Mechanical & Manufacturing Engineering',
  'Interface of Biology & Engineering (Engineering in Biology)',
  'Biomedical Engineering'],
 'Computer Networks & Communications': ['Info-Communication Technology',
  'Data Science',
  'EdTech & Science Communication',
  'Biomedical Informatics and Data Science',
  'Nanotechnology & Nano-Science',
  'Science, Technology and Society',
  'Language & Communication'],
 'Computer Vision & Language': ['Smart Nation',
  'Internet & Communications',
  'Digital Economy',
  'Data Mining',
  'Smart Cities',
  'Neurotechnology',
  'Social Physics',
  'Medical Technology',
  'Mathematics',
  'Qu

In [32]:
import json

# Render the broader-area -> specific-areas dictionary as JSON text indented
# by four spaces, then write that text to the research-areas file.
json_object = json.dumps(broader_to_specific_areas, indent=4)
with open("./data/research_areas.json", mode="w") as json_file:
    json_file.write(json_object)

In [45]:
# Reload the broader-area -> specific-areas dictionary from the JSON file,
# replacing the in-memory value. The context manager closes the file even if
# reading or decoding fails.
# NOTE(review): whatever the file holds at this point wins, including edits
# made outside the notebook — confirm that is intended.
with open('./data/research_areas.json', "r") as f:
    broader_to_specific_areas = json.load(f)
broader_to_specific_areas

{'Hardware & Embedded Systems': ['Electrical and Electronic Engineering',
  'Disrupting the Future of Industry and Manufacturing',
  'Computational and Systems Biology',
  'Semantics of Computation',
  'Internet of Things',
  'Frailty: Use of AI-R Tool for Rehabilitation at Home Reducing the Need for Hospital Physiotherapy',
  'Mechanical & Manufacturing Engineering',
  'Interface of Biology & Engineering (Engineering in Biology)',
  'Biomedical Engineering'],
 'Computer Networks & Communications': ['Info-Communication Technology',
  'Data Science',
  'EdTech & Science Communication',
  'Biomedical Informatics and Data Science',
  'Nanotechnology & Nano-Science',
  'Science, Technology and Society',
  'Language & Communication'],
 'Computer Vision & Language': ['Smart Nation',
  'Internet & Communications',
  'Digital Economy',
  'Data Mining',
  'Smart Cities',
  'Neurotechnology',
  'Social Physics',
  'Medical Technology',
  'Mathematics',
  'Quantum Technology',
  'Information Scie

In [46]:
import pandas as pd

df = pd.read_csv('./data/keywords_count.csv')

# Lookup from keyword to the 'Number of Professors' value on the same row.
specific_area_to_count = dict(zip(df['Keywords'], df['Number of Professors']))

cluster_to_broader_area = {
    0: 'Hardware & Embedded Systems',
    1: 'Cyber Security and Forensics',
    2: 'Data Management & Analytics',
    3: 'Computational Intelligence',
    4: 'Computer Vision & Language',
    5: 'Graphics & Interactive Computing',
    6: 'Computer Networks & Communications',
    7: 'Parallel & Distributed Computing',
    8: 'Biomedical Informatics'
}

# Create a dictionary to store the sum of professors for each broader area.
# Iterating the mapping's values directly (rather than a set of them) keeps
# the key order, and so the exported row order, the same on every run.
professors_by_broader_area = {b_area: 0 for b_area in cluster_to_broader_area.values()}

# NOTE(review): these totals add up per-keyword counts, so a professor listed
# under several keywords in one group would be counted more than once —
# confirm that is the intended meaning of "Sum of Professors".
for broader_area, member_areas in broader_to_specific_areas.items():
    # Keywords missing from the CSV contribute 0, as before.
    group_total = sum(specific_area_to_count.get(specific_area, 0) for specific_area in member_areas)
    # .get(..., 0) tolerates a broader-area name that is present in the loaded
    # grouping but absent from cluster_to_broader_area, instead of a KeyError.
    professors_by_broader_area[broader_area] = professors_by_broader_area.get(broader_area, 0) + group_total

for broader_area, professors_count in professors_by_broader_area.items():
    print(f"Broader Area: {broader_area} | Sum of Professors: {professors_count}")

Broader Area: Hardware & Embedded Systems | Sum of Professors: 58
Broader Area: Computer Networks & Communications | Sum of Professors: 41
Broader Area: Computer Vision & Language | Sum of Professors: 188
Broader Area: Computational Intelligence | Sum of Professors: 12
Broader Area: Biomedical Informatics | Sum of Professors: 51
Broader Area: Graphics & Interactive Computing | Sum of Professors: 32
Broader Area: Cyber Security and Forensics | Sum of Professors: 16
Broader Area: Parallel & Distributed Computing | Sum of Professors: 12
Broader Area: Data Management & Analytics | Sum of Professors: 101


In [47]:
# Show the per-broader-area professor totals as the cell's output.
professors_by_broader_area

{'Hardware & Embedded Systems': 58,
 'Computer Networks & Communications': 41,
 'Computer Vision & Language': 188,
 'Computational Intelligence': 12,
 'Biomedical Informatics': 51,
 'Graphics & Interactive Computing': 32,
 'Cyber Security and Forensics': 16,
 'Parallel & Distributed Computing': 12,
 'Data Management & Analytics': 101}

In [48]:
# Show just the keys (the broader-area names) of the totals dictionary.
professors_by_broader_area.keys()

dict_keys(['Hardware & Embedded Systems', 'Computer Networks & Communications', 'Computer Vision & Language', 'Computational Intelligence', 'Biomedical Informatics', 'Graphics & Interactive Computing', 'Cyber Security and Forensics', 'Parallel & Distributed Computing', 'Data Management & Analytics'])

In [52]:
# Build a two-column table (area name, professor total) straight from the
# totals dictionary and export it as CSV without the index column.
new_df = pd.DataFrame({
    'Broader Area': list(professors_by_broader_area.keys()),
    'Number of Professors': list(professors_by_broader_area.values()),
})
new_df.to_csv('./data/research_area_with_counts.csv', index=False)