## **Libraries needed to run the notebook**

In [1]:
# All the libraries needed to run the Notebook

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import time
import csv
import pandas as pd

import regex as re
import nltk
from forex_python.converter import CurrencyRates
from decimal import Decimal
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
lst_stopwords = stopwords.words('english')
import string
import pickle
import heapq
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

import plotly.graph_objects as go
import geoplot as gplt
import geopandas as gpd
import geoplot.crs as gcrs
import imageio
import pathlib
import matplotlib.pyplot as plt
import mapclassify as mc

from googlemaps import Client as GoogleMaps
import googlemaps
import gmaps


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nephr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **1. Data collection** 

## **1.1 Get the list of master's degree courses**

Firstly, we wrote a Python script that collects all the URLs of the master courses for the first 15 pages in the Findamasters site, as requested, and puts it in a text file 'url_masters.txt':

In [None]:

link = "https://www.findamasters.com"
ext = '/masters-degrees/msc-degrees/'
pre_link = "/masters-degrees/course/"

# List to store the URLs of all the msc pages
url_masters = []

# Iterates through pages 1 to 400 as requested
for page in tqdm(range(1, 401)):

    # Builds URL for the current page
    url = link + ext + '?PG=' + str(page)
    
    # Sends a GET request and extracts the URL from the HTML received as response
    response = requests.get(url)
    pg = BeautifulSoup(response.text, 'html.parser')
    masters = pg.find('div', class_='courses col-24 px-0')
    a_masters = masters.find_all('a')
    
    # Iterates through each tag <a> in the HTML page and appends it to the list only if it's unique and if the url contains the specified prefix
    for i in a_masters:
        url_master = str(i.get('href'))
        
        if url_master in url_masters or '#' in url_master:
            continue
        
        if pre_link in url_master:
            url_masters.append(url_master)

# Prints the total number of unique master's degree
print(len(url_masters))

# Writes all the URLs found to a text file, "url_masters.txt"
with open('url_masters.txt', 'w') as file:
    for url in url_masters:
        file.write('https://www.findamasters.com' + url + '\n')

## **1.2 Crawl master's degree pages**

To solve this task we created another Python script to get the HTML of the master courses from their corresponding URLs, organizing them by folders, as requested:

In [None]:

def download_html(url, folder_path, page_number, file_counter):

    # Creating folder "page_number" if not exists
    page_folder = os.path.join(folder_path, f'page_{page_number}')
    os.makedirs(page_folder, exist_ok=True)

    # Putting an interval between the requests to not get error 429: Too Many Requests 
    time.sleep(2)
    
    # Sending a get request to the url to get the html of the msc page
    response = requests.get(url)
    if response.status_code == 200:
        html_file_path = os.path.join(page_folder, f'msc_{file_counter}.html')
        # Saving the html file in the "page_number" folder
        with open(html_file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)
        
        print(f"File {file_counter} HTML downloaded and saved.")
    else:
        print(f"Failed to download file {file_counter} HTML. Status code: {response.status_code}")
    

if __name__ == "__main__":
    urls_file_path = 'url_masters.txt'
    folder_path = "downloaded_html"
    page_counter = 1

    with open(urls_file_path, 'r') as file:
        urls = file.read().splitlines()

    # Loop for downloading html
    for i, url in enumerate(urls, start=1):
        download_html(url, folder_path, page_counter, i)
        # Every 15 html, the page number increases and a new folder will be created
        if i % 15 == 0 :
            page_counter += 1


## **1.3 Parse downloaded pages**

For this task, we used HTML selectors to extract the requested information from each HTML page downloaded and we saved each of them in a .tsv file:

In [None]:

# Function that extracts the requested information from an MSC course HTML file and outputs it as a dictionary
def extract_msc_page_from_html(html_content):
    contents = {}

    # Creating a BeautifulSoup object to parse the HTML page
    page_soup = BeautifulSoup(html_content, 'html.parser')

    # Extracting all the requested information using HTML selectors
    course_name_tag = page_soup.find('h1', {'class': 'course-header__course-title'})
    contents['courseName'] = course_name_tag.text.strip() if course_name_tag else None

    university_name_tag = page_soup.find('a', {'class': 'course-header__institution'})
    contents['universityName'] = university_name_tag.contents[0] if university_name_tag else None

    faculty_name_tag = page_soup.find('a', {'class': 'course-header__department'})
    contents['facultyName'] = faculty_name_tag.contents[0] if faculty_name_tag else None

    full_time_tag = page_soup.find('a', {'class': 'inheritFont concealLink text-decoration-none text-gray-600'})
    contents['isItFullTime'] = full_time_tag.contents[0] if full_time_tag else None

    paragraphs = page_soup.select('#Snippet p')
    combined_text = ' '.join([p.get_text(strip=True) for p in paragraphs])
    contents['courseDescription'] = combined_text if combined_text else None

    start_date_span = page_soup.find('span', {'class': 'key-info__content', 'title': 'Start dates'})
    start_date_text = start_date_span.get_text(strip=True) if start_date_span else None
    contents['startDate'] = start_date_text if start_date_text else None

    fees_content = page_soup.select('.course-sections.course-sections__fees.tight.col-xs-24 > .course-sections__content')
    fees_content_texts = [element.get_text(strip=True) for element in fees_content]
    fees_combined_text = ' '.join(fees_content_texts)
    contents['fees'] = fees_combined_text if fees_combined_text else None

    modality_tag = page_soup.find('a', {'class': 'inheritFont concealLink text-gray-600 text-decoration-none'})
    contents['modality'] = modality_tag.contents[0] if modality_tag else None

    duration_span = page_soup.find('span', {'class': 'key-info__content key-info__duration py-2 pr-md-3 d-block d-md-inline-block'})
    duration_text = duration_span.get_text(strip=True) if duration_span else None
    contents['duration'] = duration_text if duration_text else None

    city_tag = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__city'})
    contents['city'] = city_tag.contents[0] if city_tag else None
    
    country_tag = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__country'})
    contents['country'] = country_tag.contents[0] if country_tag else None

    administration_tag = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__on-campus'})
    if administration_tag is not None:
        contents['administration'] = administration_tag.contents[0]
    administration_tag2 = page_soup.find('a', {'class': 'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__online'}) 
    if administration_tag2 is not None:
        contents['administration'] = administration_tag2.contents[0]
    if administration_tag2 is not None and administration_tag is not None:
        contents['administration'] = administration_tag.contents[0] + ', ' + administration_tag2.contents[0]
    if administration_tag2 is None and administration_tag is None:
        contents['administration'] = None

    url_tag = page_soup.find('link', {'rel': 'canonical', 'href': True})
    canonical_href = url_tag.get('href') if url_tag else None
    contents['url'] = canonical_href

    # Returning the whole dictionary
    return contents

# Function to write the extracted data to a tsv file
def write_tsv_file(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(data.keys())
        writer.writerow(data.values())

if __name__ == "__main__":

    folder_path = 'C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\downloaded_html'
    output_folder_path = 'C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\downloaded_tsv_DEFINITIVO'
    os.makedirs(output_folder_path, exist_ok=True)

    file_count = 0

    # Iterates through HTML files in the input folder
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.html'):
                file_path = os.path.join(root, filename)

                # Reads HTML content from file
                with open(file_path, 'r', encoding='utf-8') as html_file:
                    html_content = html_file.read()

                # Extracts the requested data
                extracted_contents = extract_msc_page_from_html(html_content)

                # Define output file path and name
                output_filename = os.path.splitext(filename)[0] + '.tsv'
                output_path = os.path.join(output_folder_path, output_filename)

                file_count += 1

                # Writes the extracted data to a tsv file using the defined function
                write_tsv_file(extracted_contents, output_path)

                if file_count % 100 == 0:
                    print(file_count)


Afterwards, we used the following script to merge all the .tsv files, building the Dataset:

In [None]:

path = 'C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\downloaded_tsv_DEFINITIVO'

file_list = [f for f in os.listdir(path)]
merged = pd.DataFrame()

# This loop concatenates all the tsv files in the specified folder into a Dataframe
for file_name in tqdm(file_list):
    file_path = os.path.join(path, file_name)
    
    df = pd.read_csv(file_path, sep='\t')
 
    merged = pd.concat([merged, df], ignore_index=True)

merged.to_csv('C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\df_DEFINITIVO.tsv', sep='\t', index=False)

Now let's print the first rows of the merged dataset:

In [7]:
# Loading the Dataframe
df = pd.read_csv("C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\df_DEFINITIVO.tsv", sep='\t')

# And printing its first 10 rows
df.head(10)

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,courseDescription,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Full time,Up to 7 million people are estimated to die ev...,September,"UK: £12,500 (Total)International: £28,750 (Total)",MSc,"1 year full time, 2 or 3 years part-time",Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,Engineering Management - MSc/PGCert,University of Leeds,Leeds Online,Part time,"Designed by engineers, for engineers, the onli...","September, November",Please see the university website for further ...,PGCert,8-24 months online,Leeds,United Kingdom,Online,https://www.findamasters.com/masters-degrees/c...
3,American History (MSc),University of Edinburgh,"School of History, Classics & Archaeology",Full time,This programme allows you to explore American ...,September,Tuition fees vary between degree programmes. F...,MSc,1 year full-time or 2 years part-time,Edinburgh,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Amputation and Prosthetic Rehabilitation MSc,University of Southampton,Faculty of Environmental and Life Sciences,Full time,Enhance your practice at a global top 100* uni...,September,Please see the university website for further ...,MSc,1 Year Full Time / 2-4 Years Part Time,Southampton,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5,Anaesthesia and Perioperative Science MSc,University College London,Division of Surgery and Interventional Science,Part time,The MSc Anaesthesia and Perioperative Science ...,"March, September","Part time - £7,050",MSc,2 years part time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
6,Analogue and Digital Integrated Circuit Design...,Imperial College London,Electrical and Electronic Engineering,Full time,Our MSc in Analogue and Digital Integrated Cir...,October,Please see the university website for further ...,MSc,1 year full-time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
7,Analytical Bioscience (MSc/PGDip),"Birkbeck, University of London",School of Natural Sciences,Full time,Our analytical bioscience postgraduate course ...,"October, January",Please see the university website for further ...,MSc,1 year full-time or 2 years part-time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
8,Analytical Chemistry (MSc),Kingston University,"Faculty of Health, Science, Social Care and Ed...",Full time,You will study good measurement and scientific...,"September, January",Please see the university website for further ...,MSc,"1 year full time, 2 years full time including ...",London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
9,Analytical Chemistry (MSc),Sheffield Hallam University,Postgraduate Courses,Full time,Apply for the part-time course here. Work in c...,September,Our tuition fee for UK students starting full-...,MSc,"1 year full-time, 2 years part-time",Sheffield,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...


# **2. Search Engine**

## **2.0 - Preprocessing**

### **2.0.0  Preprocessing the text**

In this point we are going to the column 'courseDescription' of our dataframe in order to get it ready for our search engine. 

For this purpose we used RegexpTokenizer to tokenize the text.

In [None]:
# Load the dataframe
df = pd.read_csv('df.tsv', sep = '\t')

In [None]:
# Initialize the tokenizer, stop words, and stemmer
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

Now we are going to implement a function in order to stem a text. Then we apply the function to the colomn 'courseDescription' of our dataframe. 

In [None]:
# Function to process and stem a text
def process_text(text):
    
    # Check for NaN values, replace this with blank
    if pd.isna(text):
        return ""

    # Tokenize the text using RegexpTokenizer
    tokens = tokenizer.tokenize(text.lower())

    # Remove stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuation
    filtered_tokens = [token for token in filtered_tokens if token not in string.punctuation]

    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(stemmed_tokens)

    return processed_text

The following lines are important in order to have 2 dataframes, the **original dataset** and the **stemmed dataset**.

In [None]:
# We copy the dataset and process the 'courseDescription' of this dataset.
df_stemmed = df.copy()
df_stemmed['courseDescription'] = df_stemmed['courseDescription'].apply(process_text)

Now, 'courseDescription' column is tokenized, stopwords are removed, punctuation is removed, and words are stemmed.

This is an example:
- '3d visualis anim play role mani area popular media keep grow digit anim provid eye catch special effect 21st centuri favourit film televis show 3d design also essenti everyday work everyth comput game develop onlin virtual world develop industri design market product design architectur gcu programm 3d design virtual environ help develop skill thrive success career visual design programm practic career focus orient toward current industri need technolog practic prior knowledg 3d design requir' 

This was the first courseDescription stemmed.

### **2.0.1 Preprocessing the fees column**

Our goal is to process the column 'fees' in order to extract, from a text, a single numeric value (the max between all the numeric values) with his relative currency.

**General Idea:**
- We reduced a text composed by numeric values, words, irregular expressions, etc. to a simple text containing only numeric values and currencies. Then we recognized those values that are 'true' fees (we found numeric values not related to the fee or expressed in any currency). During this process, for each row, we take into account the currency adding a column 'currency' to the dataframe. Now, for each row, once we have only numeric values that are 'true' fees, we convert them to float, extract the maximum value, and convert it to USD currency. 

In [None]:
# Copy the dataframe into df_fees_processed in order to process the 'fees' column. df_fees_processed will be our processed_fees dataset
df_fees_processed = df.copy() 
df_fees_processed['fees'].fillna('', inplace=True)

In [None]:
# We explore the most common currencies
top_currencies = ['usd', 'euro', 'yen', 'gbp', 'aud', 'cad', 'chf', 'cny', 'sek', 'nzd']
currency_symbols = ['$', '€', '£', '¥', '₹', '₽', '₪', '₨', '฿', '₩', '₱', '₣', '₫', '₴', '₭', '₮', '₦', '₥', '₢', '₤']
currency_pattern = '|'.join(map(re.escape, top_currencies + currency_symbols))

The 'fees' column in the dataset underwent several preprocessing steps to ensure data quality and consistency.

**Preprocessing steps and implemented functions**

#### 1. Removal of Commas from Numbers - (remove_commas_from_numbers)

   - Commas between numbers were removed using RegEx to ensure numerical consistency.
   
#### 2. Currency Identification - (find_currencies)

   - Currencies and their symbols were identified in the 'fees' column using RegEx. This allow us to recognize those rows that are not related to a currency.
   
#### 3. Currency Extraction - (extract_currency)

   - Formatted currency values were extracted from the 'fees' column, ensuring proper alignment. Actually this was done to obtain a new text containing only numeric values and currencies.
   
#### 4. Unwanted Numerical Values Removal - (remove_unwanted_values)

   - Unwanted numerical values such as years were removed from the 'fees' column to focus only on relevant numeric values.
   
#### 5. Punctuation Removal - (remove_punctuation)

   - Punctuation was removed from the 'fees' column to enhance text readability.
   
#### 6. Non-Currency Strings Removal - (remove_non_currency_strings)

   - Non-currency strings were filtered out from the 'fees' column, retaining only relevant information. We created this function to extract only numeric values that start or finish with a currency. 
   
#### 7. Numeric Conversion - (convert_to_float)

   - The values in the 'fees' column were converted to floating-point numbers. We did this also because some observations had ambiguous values (800.00, 12.500, etc...) where the dot was not used consistently.

#### 8. Maximum Value Identification - (find_max)

   - The maximum value in the 'fees' column was identified, providing insights into the highest fee.
   
   
Here is the implementation of the preprocessing steps:

In [None]:
def remove_commas_from_numbers(text):
    
    #Using regex to recognize commas between numbers
    regex_pattern = r'(\d),(\d)'
    
    # Remove commas
    updated_text = re.sub(regex_pattern, r'\1\2', text)
    
    return updated_text

def find_currencies(text):
    
    # Define regular expressions for currency symbols and codes
    currency_patterns = [
        r'£|British\s?Pound|pound|GBP',
        r'€|Euro|euro|EUR',
        r'NZD|NZ\$|New\s?Zealand\s?Dollar',
        r'\$|US\s?Dollar|USD',
        r'CA\$|C\$|Canadian\s?Dollar|CAD',
        r'SEK|kr|Swedish\s?Krona',
        r'CNY|¥|元|Chinese\s?Yuan',
        r'SAR|ر\.س|SR|Saudi\s?Riyal',
        r'HUF|Ft|Hungarian\s?Forint',
        r'DKK|kr|Danish\s?Krone',
        r'HK\$|HKD|Hong\s?Kong\s?Dollar',
        r'HRK|kn|Croatian\s?Kuna',
        r'TRY|₺|TL|Turkish\s?Lira',
        r'RON|lei|Romanian\s?Leu',
        r'JPY|¥|Japanese\s?Yen',
        r'CZK|Kč|Czech\s?Koruna',
        r'CHF|Fr\.|Swiss\s?Franc',
        r'S\$|SGD|Singapore\s?Dollar',
        r'A\$|AU\$|AUD|Australian\s?Dollar',
        r'QAR|ر\.ق|QR|Qatari\s?Riyal',
        r'ILS|₪|NIS|Israeli\s?New\s?Shekel',
        r'ISK|kr|Icelandic\s?Króna',
        r'Gibraltar\s?Pound|GIP',
        r'INR|₹|Indian\s?Rupee',
        r'RM|MYR|Malaysian\s?Ringgit',
        r'CLP|Chilean\s?Peso',
        r'₸|KZT|Kazakhstani\s?Tenge',
        r'د\.إ|AED|UAE\s?Dirham',
        r'J\$|JMD|Jamaican\s?Dollar',
        r'kr|NOK|Norwegian\s?Krone',
        
    ]

    # Iterate through each currency pattern and find matches in the text
    for pattern in currency_patterns:
        matches = re.findall(pattern, text)
        if matches:
            return matches

def extract_currency(text):
    
    #Tokenize the text
    tokens = word_tokenize(text)
    
    # To find all that number related to a currency, where the currency could be before or after the numeric value, space-separated or not
    regex_pattern = fr'(?i)(?:\s*({currency_pattern})\s*)?([\d.,]+(?:-\d+)?)\s*(?:({currency_pattern})\s*)?'

    matches = re.findall(regex_pattern, ' '.join(tokens))
    
    # Format the identified values with their respective currency symbols
    formatted_values = [f"{before_currency}{value}{after_currency}" for before_currency, value, after_currency in matches]
    
    # Join the formatted values into a string and remove leading/trailing whitespaces
    return ' '.join(formatted_values).strip()     

def remove_unwanted_values(text):
    tokens = word_tokenize(text)
    
    # to recognize years
    year_pattern = r'\d{4}[-/]\d{4}'

    # to recognize numeric values that are without a currency
    numeric_pattern = r'(?<!\S)(\d+(?:,\d{3})*(?:\.\d+)?)(?!\S)'
    
    # cleaned_tokens is a list of tokens except numeric values that are not related to a currency
    cleaned_tokens = [token if token in currency_pattern or not re.match(numeric_pattern, token) or re.match(year_pattern, token) else '' 
                      for token in tokens]
    
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text

def remove_punctuation(text):
    words = word_tokenize(text)
    
    # words is a list of all the words except punctuation
    words = [word for word in words if word not in string.punctuation]
    
    return ' '.join(words)


def remove_non_currency_strings(text):
    tokens = word_tokenize(text)
    
    #Filter the tokens that start or finish with a currency symbol
    filtered_tokens = [token for token in tokens if any(token.startswith(symbol) or token.endswith(symbol) for symbol in currency_pattern)]
    
    cleaned_text = ' '.join(filtered_tokens)
    
    return cleaned_text

def convert_to_float(text):
    # Extract numeric values using regular expression
    numeric_values = re.findall(r'\d+\.*\d*', text)
    
    # Convert each numeric value to float and return as a list
    return [float(value) for value in numeric_values]

def find_max(x):
    return max(x) if len(x) > 0 else np.nan

In [None]:
df_fees_processed['fees'] = df_fees_processed['fees'].apply(remove_commas_from_numbers)   

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: '' if not find_currencies(x) else x)

df_fees_processed['currency'] = df_fees_processed['fees'].apply(lambda x: find_currencies(x)[0] if find_currencies(x) else '')

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: extract_currency(x) if x!= '' else x)

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: remove_unwanted_values(x) if x!= '' else x)

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: remove_punctuation(x) if x!= '' else x)

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: remove_non_currency_strings(x) if x!= '' else x)

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: convert_to_float(x) if x!= '' else x)

df_fees_processed['fees'] = df_fees_processed['fees'].apply(lambda x: find_max(x) if x!='' else x )

In [None]:
len(df_fees_processed[df_fees_processed['fees']==''])

4741

During this process we lost 4741 rows, as they did not contain any relevant information related to the actual fee.

In [None]:
df_fees_processed['currency'].unique()

array(['', '£', '€', '$', 'EUR', 'Euro', 'SEK', 'euro', 'GBP', 'RM', 'QR',
       'USD', 'CHF', 'SGD', 'JPY'], dtype=object)

After having isolated the unique values of the column 'currency', we had to replace some of them to uniform the way currencies are expressed. We used then an API to retrieve the conversion rates between currencies. We chose to convert every currency to USD dollars, because, since it's among the most common currencies in the world.

In [None]:
#Tthe 'get_rate' function won't work if the currency is not consistent
df_fees_processed['currency']=df_fees_processed['currency'].apply(lambda x: 'EUR' if x=='Euro' or x=='euro' or x=='€' else x)
df_fees_processed['currency']=df_fees_processed['currency'].apply(lambda x: 'GBP' if x=='£' else x)
df_fees_processed['currency']=df_fees_processed['currency'].apply(lambda x: 'USD' if x=='$' else x)

In [None]:
# Our API
api_key = 'd95ee0703ae84c9ea87eaf77f1ee78ff'
c = CurrencyRates(api_key)

In [None]:
# Here we convert to a common currency, we choose USD
def convert_to_common_currency(row):
    if row['fees']!= '':
        try:
            amount = row['fees']
            currency = row['currency']
            rate = float(c.get_rate(currency, 'USD'))
            converted_amount = round(amount * rate, 2)
            return pd.Series({'fees': converted_amount})
        except Exception as e:
            print(f"Error converting {amount} {currency} to common currency. Error: {e}")
            return pd.Series({'fees': None})
    else:
        return pd.Series({'fees': row['fees'] })

df_fees_processed[['fees']] = df_fees_processed.apply(convert_to_common_currency, axis=1)

In [None]:
len(df_fees_processed[df_fees_processed['fees'] != ''])

1259

In [None]:
# A better view of our processed dataset focusing on non-empty rows. 
df_well_processed_fees = df_fees_processed[df_fees_processed['fees'] != '']
df_well_processed_fees.head(10)

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,courseDescription,startDate,fees,modality,duration,city,country,administration,url,currency
1,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Full time,Up to 7 million people are estimated to die ev...,September,35197.35,MSc,"1 year full time, 2 or 3 years part-time",Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
5,Anaesthesia and Perioperative Science MSc,University College London,Division of Surgery and Interventional Science,Part time,The MSc Anaesthesia and Perioperative Science ...,"March, September",8631.0,MSc,2 years part time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
9,Analytical Chemistry (MSc),Sheffield Hallam University,Postgraduate Courses,Full time,Apply for the part-time course here. Work in c...,September,12622.08,MSc,"1 year full-time, 2 years part-time",Sheffield,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
19,Analytics and Operations Research in Logistics...,Erasmus School of Economics,Masters Programmes,Full time,Are you curious how logistics companies solve ...,September,17.18,MSc,1 year,Rotterdam,Netherlands,On Campus,https://www.findamasters.com/masters-degrees/c...,EUR
24,"Entrepreneurship Management and Innovation, on...",University of Bath,University of Bath Online,Part time,Join us for Entrepreneurship Management and In...,"September, January",1019.8,MSc,2 years and 3 months full time,Bath,United Kingdom,,https://www.findamasters.com/masters-degrees/c...,GBP
35,Environmental Data Science and Analytics - MSc,University of Leeds,School of Geography,Full time,As global discussions are increasingly focused...,September,37645.86,MSc,1 year full time,Leeds,United Kingdom,,https://www.findamasters.com/masters-degrees/c...,GBP
44,"Anthropology, Environment and Development MSc",University College London,Department of Anthropology,Full time,"The MSc in Anthropology, Environment and Devel...",September,17262.0,MSc,"1 year full time, 2 years part time",London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
46,Environmental Engineering and Project Manageme...,University of Leeds,School of Civil Engineering,Full time,There are many challenges we face globally. Bu...,September,37951.92,MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
47,Applications of Psychology - MSc,University of Suffolk,School of Social Sciences and Humanities,Full time,For those who are eager to expand their knowle...,September,17133.46,MSc,"1 year full-time, 2-3 years part-time",Ipswich,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP
49,Applied Analytical Chemistry MSc,University College London,Department of Chemistry,Full time,Analytical chemistry underpins many important ...,September,17262.0,MSc,1 year full time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...,GBP


We successfully processed 1259 rows. As we can see, the column 'fees' was converted to float type and expressed in USD dollars. 

## **2.1. Conjunctive query**

### **2.1.1 Create your index!**

#### This code will perform the following operations:

**1. Tokenization:** It separates the text of each course description into words. The resulting words are stored in a list of lists called tokenized_descriptions.

**2. Vocabulary Creation:** It creates a vocabulary that maps each unique word in the corpus of course descriptions to a unique integer (term_id). The vocabulary is a dictionary, where that maps each word to a term_id. This dictionary was saved to a file named 'vocabulary_.pkl' using the pickle module in order to keep it ready for our purposes.

**3. Inverted Index Creation:** For each course description, it creates an inverted index that maps each term_id to the document (course description) in which that word appears. The inverted index is a dictionary that maps each term_id to a list of index in which word is included. This dictionary is saved to a file named 'inverted__index.pkl' using the pickle module.

In [None]:
# Our documents of interest
course_descriptions = df_stemmed['courseDescription']

# A list of lists containing all the descriptions tokenized and stemmed
tokenized_descriptions = [text.split() if isinstance(text, str) else [] for text in course_descriptions]

In [None]:
# Creating a vocabulary in which each word is related to a unique term_id
vocabulary = {word: i for i, word in enumerate(set(word for text in tokenized_descriptions for word in text))}

# Saving the vocabulary as vocabulary_.pkl (!!RUN THE TWO LINES BELOW TO SAVE THE FILE!!)
#with open('vocabulary_.pkl', 'wb') as vocab_file:
    #pickle.dump(vocabulary, vocab_file)

# Initialize a dict
inverted_index = defaultdict(list)

# Nested loop to create the inverted_index: doc_id is the index of the row, text is a tokenized description.
# For each term_id, it contains a list of docs_id containing all those rows that, in their descriptions, contain the term_id. 
for doc_id, text in enumerate(tokenized_descriptions):
    for word in set(text):  
        term_id = vocabulary[word]
        inverted_index[term_id].append(doc_id)

# Saving the inverted_index as inverted__index.pkl (!!RUN THE TWO LINES BELOW TO SAVE THE FILE!!)        
#with open('inverted__index.pkl', 'wb') as index_file:
    #pickle.dump(dict(inverted_index), index_file)

In summary, this code is the first part of a search engine, where a unique vocabulary is built for the corpus of course descriptions, and an inverted index is created to speed up keyword searches.

### **2.1.2 Execute the query**

### General Idea:

The provided code implements a conjunctive **Search Engine** for our dataset. The objective is to allow users to search for courses that contain all the specified terms in their descriptions. The search engine uses a vocabulary and an inverted index to efficiently identify relevant documents.

#### Key Components:
1. **Vocabulary (`vocabulary`):**
   - A dictionary mapping words to unique term IDs.
   - Each term ID is associated with a set of document IDs where the corresponding word appears.

2. **Inverted Index (`inverted_index`):**
   - A dictionary where each term ID is linked to a list of document IDs.
   - Enables quick retrieval of documents containing a specific term.

3. **Query (`query`):**
   - A list of terms provided by the user to search for in course descriptions.

4. **DataFrame (`df`):**
   - The dataset containing information about courses, including columns like `courseName`, `universityName`, `courseDescription`, and `url`.

5. **Conjunctive List (`conjunctive_list`):**
   - A set that is iteratively updated to store document IDs that satisfy all terms in the search query.

In [None]:
def search_engine(query):
    
    """
    Conducts a conjunctive search based on the given query.

    Parameters:
    - query (list): A list of terms representing the search query.

    Returns:
    - DataFrame: A DataFrame containing relevant course information based on the conjunctive search.
    """
       
    # Preprocess the query
    tokenizer = RegexpTokenizer(r'\w+')
    query = tokenizer.tokenize(query)
    stemmer = PorterStemmer()
    query = [stemmer.stem(word) for word in query if not word in lst_stopwords]
    
     # Initialize the conjunctive query list
    conjunctive_list = inverted_index[vocabulary[query[0]]] 
    
    # Iterate through each term in the query
    for term in query:
        
        # Check if the term is present in the vocabulary
        if term in vocabulary:
            
            # Get the term ID from the vocabulary
            term_id = vocabulary[term]
            
            # Get the list of documents containing the current term
            term_list = inverted_index[term_id]
            
            # Update the conjunctive list by taking the intersection with the current term's document list
            conjunctive_list = set(conjunctive_list).intersection(set(term_list))
            
        else:
            print("Not all terms are in the course's descriptions")
            return False
        
    relevant_columns = ['courseName', 'universityName', 'courseDescription', 'url']
    
    # Extract the relevant rows from the DataFrame based on the conjunctive list
    doc_found = df.loc[list(conjunctive_list), relevant_columns].copy()
    
    return doc_found

#### Results:

The search engine successfully performs a conjunctive search by iterating through each term in the query, retrieving the corresponding term ID from the vocabulary, and updating the conjunctive list based on the intersection with the document list for each term. The final result is a DataFrame (`doc_found`) that includes relevant course information based on the conjunctive search criteria.

#### Output:
- The output DataFrame (`doc_found`) contains selected columns from the original dataset, such as `courseName`, `universityName`, `courseDescription`, and `url`.
- If not all terms in the query are found in the vocabulary, the function prints an error message and returns False.

In [None]:
# Ask for a query from the user
query = input()
your_results = search_engine(query)

 advanced knowledge


**Run this line below to view your results.**

In [None]:
your_results

Unnamed: 0,courseName,universityName,courseDescription,url
2053,Digital Design and Branding MSc,Brunel University London,Our Digital Design and Branding MSc degree is ...,https://www.findamasters.com/masters-degrees/c...
2055,Digital Design and Manufacture MSc,University of Edinburgh,"With the transition to industry 4.0, digital d...",https://www.findamasters.com/masters-degrees/c...
4111,Civil Engineering MSc,University of Greenwich,"Looking to develop your professional, analytic...",https://www.findamasters.com/masters-degrees/c...
18,Analytical Sciences MSc,University of Bradford,Our MSc in Analytical Sciences MSc is a resear...,https://www.findamasters.com/masters-degrees/c...
4121,International Business MSc,University of Leicester,This is for you if you want to enhance your ex...,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...
4032,International Business - MSc,University of Glasgow,International Business will provide you with a...,https://www.findamasters.com/masters-degrees/c...
2002,Diabetes Practice - MSc,Cardiff University,The MSc Diabetes Practice aims to equip health...,https://www.findamasters.com/masters-degrees/c...
2003,Diabetes Practice MSc/PGDip/PGCert,Swansea University,If you are a health professional with a specia...,https://www.findamasters.com/masters-degrees/c...
2005,Diagnostic Imaging - MSc,Glasgow Caledonian University,Designed to train a new generation of leaders ...,https://www.findamasters.com/masters-degrees/c...


This was the output for query = 'advanced knowledge'

## **2.2 Conjunctive query & Ranking score**

### **2.2.1 Inverted index**

### General Idea:

This code provides a new inverted_index in this format:

    - {term_id_1:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2}), (document4, tfIdf_{term,document4}), ...], 
    term_id_2:[(document1, tfIdf_{term,document1}), (document3, tfIdf_{term,document3}), (document5, tfIdf_{term,document5}), (document6, tfIdf_{term,document6}), ...],
    ...}

For this purpose, we are going to define tfidf(document,word) = tf(document,word) * idf(word), where:

    1. TERM FREQUENCY : tf(document,word) = Number of times word appears in text / Total number of words in text
    
    2. INVERSE DOCUMENT FREQUENCY: idf(word) = log (1 + Total number of documents / 1 + Number of documents including word)
    
Implementing a function **tdidf(document,word)** allows us to define the new inverted index.

In [None]:
def tf(document, word):
    
    words = document.split() if isinstance(document, str) else []
    
    # Count the specific word in document
    term_count = words.count(word)
    
    # Total number of words in document
    total_words = len(words)

    if total_words > 0:
        
        # Compute the tf_score for the specific word in document
        return float(term_count / total_words)
    
    else:
        return 0.0 

In [None]:
def idf(word):
    
    # The total number of documents is the len of our dataframe 
    total_documents = len(df_stemmed)
    
    # Count how many documents include the word
    cont_word = len(inverted_index[vocabulary[word]])
    
    if cont_word > 0:
        # Compute idf_score for that word 
        return np.log(float((1 + total_documents)/(1 + cont_word)))
    
    else:
        return 0.0

In [None]:
def tfidf(document,word):
    return tf(document,word) * idf(word)

In [None]:
# Initialize an inverted_index_top_k for the purpose
inverted_index_top_k = {}

# Remember that course_descriptions is the column of our stemmed DataFrame 
tokenized_descriptions = [text.split() if isinstance(text, str) else [] for text in course_descriptions]

# Iterate on each word in the vocabulary
for word in vocabulary:
    
    # Initialize a list that will contain all the tuples (doc_id_i , tfidf(word,doc_id_i)) 
    mylist=[]
    
    # Iterate on each document 
    for k in range(len(tokenized_descriptions)):
        
        # If word is contained in that specific document then we append the tuple to the list
        if word in tokenized_descriptions[k]:
            
            term_id = vocabulary[word]
            
            mylist.append((k,tfidf(course_descriptions[k],word)))
            
    # This inverted_index_top_k maps each word to a list containing all the tuples
    inverted_index_top_k[term_id] = mylist

In [None]:
# To save inverted_index_top_k as inverted_index_top_k.pkl (!!RUN THE TWO LINES BELOW TO SAVE THE FILE!!) 
#with open('inverted_index_top_k.pkl', 'wb') as index_file:
    #pickle.dump(dict(inverted_index_top_k), index_file)

### **2.2.2 Execute the query**

### General Idea

The provided code implements a search engine,based on the top k results (where k can be chosen by the user), using cosine similarity based on TF-IDF (Term Frequency-Inverse Document Frequency) scores. The scikit-learn library was utilized for TF-IDF vectorization and cosine similarity calculation. The search function takes a query and a parameter 'k' (number of top results to retrieve) and returns the top-k documents that are most similar to the input query.

### Steps in the Code

**1. Document Retrieval:**
- Utilizes a search engine function (`search_engine`) to obtain a DataFrame (`documents`) containing all rows where all the words in the query are present.
- Extracts the 'courseDescription' column from the obtained DataFrame.

**2. TF-IDF Vectorization:**
- Initializes a `TfidfVectorizer` to convert the document corpus into TF-IDF vectors.
- Fits and transforms the TF-IDF vectorizer on the extracted 'courseDescription' column.

**3. Query Preprocessing:**
- Tokenizes the query using a regular expression tokenizer.
- Stems each token using the Porter Stemmer.
- Joins the stemmed tokens to generate the preprocessed query.

**4. Query TF-IDF Transformation:**
- Transforms the preprocessed query using the fitted TF-IDF vectorizer.

**5. Cosine Similarity Calculation:**
- Calculates cosine similarity scores between the query TF-IDF vector and the TF-IDF vectors of the documents in the corpus.

**6. Top-k Document Retrieval:**
- Utilizes a heap data structure to maintain the top-k documents based on similarity scores.
- Iterates through the documents and maintains the heap.
- Retrieves the top-k documents from the heap.

**7. Result DataFrame Creation:**
- Extracts details of the top-k documents from the original DataFrame (`df`) based on the document IDs obtained from the heap.
- Adds a 'similarity' column to the result DataFrame.

**8. Handling Non-Zero Similarity Documents:**
- Checks if the number of non-zero similarity documents is less than k.
- If true, includes all non-zero similarity documents; otherwise, returns the top-k documents.


In [None]:
def search_engine_top_k(query, k):
    
    # Documents is the dataframe that contain all those rows in which all the words in the query are contained
    documents = search_engine(query)
    our_documents = documents['courseDescription']
    
    # To stem our documents
    our_documents = our_documents.apply(process_text)
    
    # To extract tfidf scores of our documents of interest
    tfidf = TfidfVectorizer()
    corpus_tfidf = tfidf.fit_transform(our_documents)
    
    # Preprocess the query
    tokenizer = RegexpTokenizer(r'\w+')
    query_tokens = tokenizer.tokenize(query)
    stemmer = PorterStemmer()
    query = ' '.join([stemmer.stem(word) for word in query_tokens])
    
    # Transform the query using the fitted TfidfVectorizer
    query_tfidf = tfidf.transform([query])

    # Calculate the Cosine Similarity scores for relevant documents
    cosine_similarities = cosine_similarity(query_tfidf, corpus_tfidf).flatten()

    # Use a heap to maintain the top-k documents
    top_k_heap = []

     # Iterate through the documents and maintain the heap
    for doc_id, score in enumerate(cosine_similarities):
        heapq.heappush(top_k_heap, (score, doc_id))

    # Retrieve the top-k documents from the heap
    result_documents_top_k = sorted(top_k_heap, key=lambda x: x[0], reverse=True)[:k]

    # Get the details of the top-k documents from the DataFrame
    result_df_top_k = documents.iloc[[doc_id for _, doc_id in result_documents_top_k]]
    
    # Add the 'similarity' column to the DataFrame
    result_df_top_k['similarity'] = [score for score, _ in result_documents_top_k]

    # If the number of non-zero similarity documents is less than k, include all of them
    non_zero_similarity_documents = result_df_top_k[result_df_top_k['similarity'] > 0]
    if len(non_zero_similarity_documents) < k:
        return non_zero_similarity_documents[['courseName', 'universityName', 'courseDescription', 'url', 'similarity']]
    
    return result_df_top_k[['courseName', 'universityName', 'courseDescription', 'url', 'similarity']]

In [None]:
# Take input for query_to_ask
query_to_ask = input("Enter your query: ")

# Take input for k
k = int(input("Enter the value of k: "))

# Showing your result
your_result = search_engine_top_k(query_to_ask,k)
your_result

Enter your query:  advanced knowledge
Enter the value of k:  10


Unnamed: 0,courseName,universityName,courseDescription,url,similarity
5600,Advanced Clinical Practice - MSc,Canterbury Christ Church University,Gain the knowledge and skills needed to become...,https://www.findamasters.com/masters-degrees/c...,0.192974
5747,Advanced Healthcare Practice - MSc,Cardiff University,Our MSc Advanced Healthcare Practice programme...,https://www.findamasters.com/masters-degrees/c...,0.172914
5714,Advanced Computing MSc,King’s College London,Our Advanced Computing MSc provides knowledge ...,https://www.findamasters.com/masters-degrees/c...,0.170557
5909,Advancing Practice - MSc,University of Northampton,Our MSc Advancing Practice awards support the ...,https://www.findamasters.com/masters-degrees/c...,0.17051
5797,Advanced Mechanical Engineering - MSc (Eng),University of Leeds,This course offers a broad range of advanced s...,https://www.findamasters.com/masters-degrees/c...,0.163365
5652,Advanced Clinical Practice MSc,University of Greenwich,Develop your skills and deepen your knowledge ...,https://www.findamasters.com/masters-degrees/c...,0.154056
5618,Advanced Clinical Practice (AHP) - MSc/PGDip/P...,Bangor University,The programme has been developed to enhance pr...,https://www.findamasters.com/masters-degrees/c...,0.151249
5565,Advanced Biomedical Engineering - MSc,University of Bradford,Biomedical engineering is a fast evolving inte...,https://www.findamasters.com/masters-degrees/c...,0.13868
5599,Advanced Clinical Practice - MSc,University of Northampton,Our MSc Advanced Clinical Practice course aims...,https://www.findamasters.com/masters-degrees/c...,0.134166
5664,Advanced Clinical Practitioner - MSc,University of Sunderland,The MSc Advanced Clinical Practitioner is a hi...,https://www.findamasters.com/masters-degrees/c...,0.133655


This last was the output for query_input = 'advanced knowledge' and k = 10.

#### Results:

- The code enables users to input a query and retrieve the top-k documents most similar to the query based on TF-IDF cosine similarity.
- The results include relevant details such as course name, university name, course description, URL, and the calculated similarity score.
- The cosine similarity score indicates how closely the documents match the input query, with higher scores indicating greater similarity.

Overall, the code provides a functional search engine based on TF-IDF cosine similarity for document retrieval.

## **3. Define a new score!** 

In this exercise we used the search engine created in exercise 2.1 as a starting point to get the relevant courses for the query in input and we defined another scoring function that is not based anymore only on the course description but also on the course name, university name and faculty name. We decided to consider the course name as the data with the most relevance for the score and the university and faculty name as the data with least relevance.

The scoring function takes in input a query and the indexes of the relevant master courses returned by the search engine and outputs the requested data (course name, course description, university name, url) and the course's score.

Lastly, the function top_k_courses takes in input the query and an int k and maintains an heap for the k courses with the highest score (returned by the scoring function) and returns the heap in output:

In [10]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

# Using the code from exercise 2.1 as a starting point, as requested
def search_engine(query):
      
    tokenizer = RegexpTokenizer(r'\w+')
    query = tokenizer.tokenize(query)
    stemmer = PorterStemmer()
    query = [stemmer.stem(word) for word in query if not word in lst_stopwords]

    conjunctive_list = inverted_index[vocabulary[query[0]]] 
    for term in query:
        if term in vocabulary:
            term_id = vocabulary[term]
            term_list = inverted_index[term_id]
            conjunctive_list = set(conjunctive_list).intersection(set(term_list))
        else:
            print("Not all terms are in the course's descriptions")
            return False
    
    # Returning directly the list of the locations of the documents found
    return conjunctive_list

# Defining the scoring function, takes in input the query and the course number in the dataframe and outputs the relevant data for the course and its score for the query
def scoring_function(query, course_loc): 
    # Getting from the course only the relevant data
    course_data = df.loc[course_loc, ['courseName', 'courseDescription', 'universityName', 'facultyName', 'url']]

    # Weights for the kinds of data
    weight_name = 0.5
    weight_description = 0.3
    weight_uni_faculty = 0.4

    course_name = str(course_data['courseName']).lower()
    course_description = str(course_data['courseDescription']).lower()
    course_uni = str(course_data['universityName']).lower()
    course_faculty = str(course_data['facultyName']).lower()
    course_url = str(course_data['url'])

    # Calculating the score for each attribute
    score_name = sum(word in course_name for word in query) * weight_name
    score_description = sum(word in course_description for word in query) * weight_description
    score_uni = sum(word in course_uni for word in query) * weight_uni_faculty
    score_faculty = sum(word in course_faculty for word in query) * weight_uni_faculty
    
    # Calculating the total score
    total_score = score_name + score_description + score_faculty + score_uni

    # Returning relevant information for the heap
    return course_name, course_description, course_uni, course_url, total_score

# Given a query and a number k, calculates the top k courses
def top_k_courses(query, k):
    # Initializing the heap
    top_courses = []

    # Using the search engine to get the list of courses with a correspondence in the description
    filtered_msc = search_engine(query)

    for msc_loc in filtered_msc:
        # Getting the score and useful data from the scoring function
        course_name, description, universityName, url, msc_score_tmp = scoring_function(query, msc_loc)
        
        # Condition to keep the size of the heap up to k
        if len(top_courses) < k:
            heapq.heappush(top_courses, (course_name, description, universityName, url, msc_score_tmp))
        else:
            # Replacing the lowest score if a new score is higher than the lowest score
            _, _, _, _, lowest_score = top_courses[0]
            if msc_score_tmp > lowest_score:
                heapq.heappop(top_courses)
                heapq.heappush(top_courses, (course_name, description, universityName, url, msc_score_tmp))

    # Sorting the courses by descending score
    top_courses = sorted(top_courses, key=lambda x: x[4], reverse=True)

    columns = ['course name', 'course description', 'university name', 'url', 'score']
    top_courses_df = pd.DataFrame(top_courses, columns=columns)

    
    return top_courses_df



In [9]:

path = "C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3"

# Opening dataframe
df = pd.read_csv(path + '\df_DEFINITIVO.tsv', sep='\t')

# Opening Pickle files for the search engine
with open(path + '\\vocabulary_.pkl', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

with open(path + '\inverted__index.pkl', 'rb') as index_file:
    inverted_index = pickle.load(index_file)

# User query and top k result
query = "management"
k = 10

top_k_results = top_k_courses(query, k)
top_k_results


Unnamed: 0,course name,course description,university name,url,score
0,architecture-engineering construction manageme...,themaster of science in architecture–engineeri...,carnegie mellon university,https://www.findamasters.com/masters-degrees/c...,16.0
1,banking and finance msc,this course will help to enhance your lucrativ...,university of nottingham,https://www.findamasters.com/masters-degrees/c...,15.5
2,"environmental management (waste, energy, water...",environmental challenges require complex solut...,glasgow caledonian university,https://www.findamasters.com/masters-degrees/c...,15.2
3,banking and international finance (msc),you are ambitious to succeed in global banking...,bayes business school,https://www.findamasters.com/masters-degrees/c...,14.4
4,finance - msc,ranked #11 in the world in the financial times...,imperial college london,https://www.findamasters.com/masters-degrees/c...,14.2
5,"entrepreneurship management and innovation, on...",join us for entrepreneurship management and in...,university of bath,https://www.findamasters.com/masters-degrees/c...,13.6
6,fashion business & management - ma/msc,if you are seeking a high-level career in mana...,university for the creative arts,https://www.findamasters.com/masters-degrees/c...,13.6
7,"environmental, social, governance (esg) manage...",more than ever esg is essential for businesses...,king’s college london,https://www.findamasters.com/masters-degrees/c...,12.0
8,environmental engineering and project manageme...,there are many challenges we face globally. bu...,university of leeds,https://www.findamasters.com/masters-degrees/c...,12.0
9,engineering management - msc/pgcert,"designed by engineers, for engineers, the onli...",university of leeds,https://www.findamasters.com/masters-degrees/c...,11.6


We can see that using this new search engine, which incorporates additional data such as course name, faculty name, and university name, the results are more relevant than the previous one.

## **4. Visualizing the most relevant MSc degrees** 

As requested I used the search engine with the score system created in question 3, I added the columns I needed to represent the location fo the masters. The code is the same except those new columns.

In [44]:
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

# The search engine used was the one defined in question 3 
# These other two functions were slightly modified from question 3 to accommodate the request from this question
def scoring_function(query, course_loc):
    course_data = df.loc[course_loc, ['courseName', 'courseDescription', 'universityName', 'facultyName', 'url', 'city','country','fees']]

    weight_name = 0.5
    weight_description = 0.3
    weight_uni_faculty = 0.4

    course_name = str(course_data['courseName']).lower()
    course_description = str(course_data['courseDescription']).lower()
    course_uni = str(course_data['universityName']).lower()
    course_faculty = str(course_data['facultyName']).lower()
    course_url = str(course_data['url'])
    
    course_city = str(course_data['city'])
    course_country = str(course_data['country'])
    course_fees = course_data['fees']

    score_name = sum(word in course_name for word in query) * weight_name
    score_description = sum(word in course_description for word in query) * weight_description
    score_uni = sum(word in course_uni for word in query) * weight_uni_faculty
    score_faculty = sum(word in course_faculty for word in query) * weight_uni_faculty
    
    total_score = score_name + score_description + score_faculty + score_uni

    return course_name, course_description, course_uni, course_url, course_city, course_country, course_fees , total_score


def top_k_courses(query, k):
    top_courses = []

    filtered_msc = search_engine(query)

    for msc_loc in filtered_msc:
        course_name, description, universityName, url, city, country, fees, msc_score_tmp = scoring_function(query, msc_loc)
        
        if len(top_courses) < k:
            heapq.heappush(top_courses, (course_name, description, universityName, url, city, country, fees, msc_score_tmp))
        else:
            _, _, _, _, _, _, _, lowest_score = top_courses[0]
            if msc_score_tmp > lowest_score:
                heapq.heappop(top_courses)
                heapq.heappush(top_courses, (course_name, description, universityName, url, city, country, fees, msc_score_tmp))

    top_courses = sorted(top_courses, key=lambda x: x[7], reverse=True)

    return top_courses


if __name__ == '__main__':

    path = "C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3"
    df = pd.read_csv( "C:\\Users\\nephr\\Desktop\\Uni Nuova\\ADM_HW3\\df_DEFINITIVO.tsv",sep='\t')
    

    with open(path + '/vocabulary_.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    with open(path + '/inverted__index.pkl', 'rb') as index_file:
        inverted_index = pickle.load(index_file)

    query = "American"
    k = 10

    top_k_results = top_k_courses(query, k)

The choice for the query and k was to showcase the map created in this part. if k was too large the map would be too dense over the United kingdom, since most masters are located there. The query "American" was used because half of the top k results had a numeric fee (most of the masters in the whole dataset didn't have a fee available)

------------------------------------

We start working now on the results to show them on a map


In [45]:
# Define column names
columns = ['courseName', 'description', 'universityName', 'url', 'city','country','fees','score']

# Convert the heap data to a DataFrame
df = pd.DataFrame(top_k_results, columns=columns)

In [46]:
# Create a new column called Full Address
df['Full_Address'] = df['city'] + ',' + df['country']

#Using API key to get informations on latitude and longitude
gmaps = googlemaps.Client(key='AIzaSyA9KbsHqjADCpofhMaUndyhBP9QXZt7Ff8')

#create new dataframe to work on
addresses1= df.iloc[:,-1:].copy()
addresses1['long'] = ""
addresses1['lat'] = ""
for x in range(len(addresses1)):
    geocode_result = gmaps.geocode(addresses1['Full_Address'][x])
    addresses1['lat'][x] = geocode_result[0]['geometry']['location'] ['lat']
    addresses1['long'][x] = geocode_result[0]['geometry']['location']['lng']
    
    #to handle data on the same location
    addresses1['lat'][x] += np.random.uniform(-0.3, 0.3)
    addresses1['long'][x] += np.random.uniform(-0.3, 0.3)

# Lets join the results with original file
df['latitude']=addresses1['lat']
df['longitude']= addresses1['long']
df['fees'] = df['fees'].apply(lambda x: 'not available' if pd.isna(x) else x)

Now the data is ready to be plotted. We created two new columns for each location with latitude and longitude. These values were obtained using an API key but were modified slightly to accommodate the problem of overlapping locations. 

In [47]:
%matplotlib inline

In [None]:
df['text'] = df['courseName'] + ' fee (USD): ' + df['fees'].astype(str) 

# Define a color for points with empty 'fees' values
empty_fee_color = 'red'

# Create a color array for all points based on 'fees'
colors = [empty_fee_color if fee=='not available' else fee for fee in df['fees']]
fig = go.Figure(data=go.Scattergeo(
        locationmode = 'ISO-3',
        lon = df['longitude'],
        lat = df['latitude'],
        text = df['text'],
        mode = 'markers',
        marker = dict(
            size = 5,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            line = dict(
                width=1,
                color='rgba(102, 102, 102)'
            ),
            colorscale = 'Blues',
            cmin = 0,
            color = colors,
            cmax = df[df['fees']!='not available']['fees'].max(),
            colorbar_title="fee (USD) per year"
        )))

fig.update_layout(
        title = 'WHERE ARE THESE MASTERS',
        geo = dict(
            scope='world',
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )
fig.show()

The map shows the location of the results of the query. The color is based on the annual fee (in US Dollars). The red dots represent the masters with unavailable fee. Passing with the cursor on the circles gives you the location, the name of the course and the annual fee. 