import libraries

In [2]:
import pandas as pd
import json
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
import os
import re
from collections import Counter
from functools import reduce
from tqdm.notebook import tqdm
from functools import reduce

import custom functions

In [None]:
# Specify the full path to the module (including the filename)
module_path = '/content/drive/MyDrive/HM3-ADM/HW3_ADM/parser.py'

# Import the module using importlib (a workaround if the standard import doesn't work)
import importlib.util
spec = importlib.util.spec_from_file_location("parser", module_path)
parser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(parser)


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1 Data collection

## 1.1. Get the list of master's degree courses

In [None]:
def extract_masters(this_url):
    result_url = requests.get(this_url)
    result_soup = BeautifulSoup(result_url.text, 'html.parser')
    result_links = result_soup.find_all('a', {'class': 'courseLink'})
    result_list = []
    for item in result_links:
        result_list.append(item['href'])
    return result_list

In [None]:
# Output file path
output_file_path = 'msc_urls.txt'
# Loop through the first 400 pages and write results to the output file
with open(output_file_path, 'a') as output_file:
    for page_number in range(1, 401):
        page_url = f'https://www.findamasters.com/masters-degrees/msc-degrees/?PG={page_number}'
        page_results = extract_masters(page_url)
        for url in page_results:
            output_file.write(f'{url}\n')
        time.sleep(1)

In [None]:
with open(output_file_path, 'r') as file:
    lines = file.readlines()
    number_of_lines = len(lines)

print(f'The file {output_file_path} contains {number_of_lines} rows.')

The file msc_urls.txt contains 6000 rows.


## 1.2. Crawl master's degree pages

In [None]:
# Function to download HTML from a URL with prefix and save it to a file
def download_and_save_html(url, output_path):
    full_url = 'https://www.findamasters.com/' + url
    try:
        response = requests.get(full_url)
        if response.status_code == 200:
            with open(output_path, 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)
        else:
            print(f"Failed to download {full_url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading {full_url}: {e}")

# Function to create a directory if it doesn't exist
def create_directory(directory_path):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

# Main directory for downloaded HTML files
main_directory = 'downloaded_pages'
create_directory(main_directory)

# Iterate through the URLs and download the HTML
with open('msc_urls.txt', 'r') as file:
    for index, url in enumerate(file, start=1):
        # Remove whitespaces and newline characters from the URL
        url = url.strip()

        # Generate the directory path for the current page
        page_directory = os.path.join(main_directory, f'page_{(index - 1) // 15 + 1}')
        create_directory(page_directory)

        # Generate the output HTML file path
        output_path = os.path.join(page_directory, f'course_{index}.html')

        # Download the HTML and save it to the file
        download_and_save_html(url, output_path)
        time.sleep(3)

print("Download complete for all pages.")

Download complete for all pages.


#[1.3] Parse downloaded pages



At this point, you should have all the HTML documents about the master's degree of interest, and you can start to extract specific information. The list of the information we desire for each course and their format as desiried.

#Access the stored HTML pages

In [None]:
%pip install google-colab-shell
# import the module once
from google_colab_shell import getshell

Collecting google-colab-shell
  Downloading google-colab-shell-0.2.tar.gz (4.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: google-colab-shell
  Building wheel for google-colab-shell (setup.py) ... [?25l[?25hdone
  Created wheel for google-colab-shell: filename=google_colab_shell-0.2-py3-none-any.whl size=4107 sha256=80e086c86d11d45d2f29c9a491589816fae1262d0347b13dc5ea33ecab08cf12
  Stored in directory: /root/.cache/pip/wheels/f9/36/65/95dd4599be065418a9fe1f482674c8e716ce540f3f484681d2
Successfully built google-colab-shell
Installing collected packages: google-colab-shell
Successfully installed google-colab-shell-0.2


used command

---mettere modalità python--
git clone https://github.com/marinazanoni/nome_provvisorio.git

after accesing the repository

In [None]:
def parser(html_page):

    # Define your default values here
    default_values = {
    'courseName': None,
    'universityName': None,
    'facultyName': None,
    'isItFullTime': None,
    'description': None,
    'startDate': None,
    'fees': None,
    'modality': None,
    'duration': None,
    'city': None,
    'administration': None,
    'country': None,
    'url': None
}

    # Create a defaultdict with default values
    info = defaultdict(lambda: default_values)

   # Extract the text (HTML)
    with open(html_page, 'r', encoding='utf-8') as file:
        html_content = file.read()
    page_soup = BeautifulSoup(html_content, 'html.parser')

    # COURSE NAME
    page_links = page_soup.find_all('h1', {'class':'text-white course-header__course-title'})
    if page_links:
        first_h1_text = page_links[0].text
        info['courseName'] = str(first_h1_text)
    #else:
        # Handle the case when no 'h1' with 'data-permutive-title' is found
    #    print("No course name found.")
    #courseName = str(first_h1_text)

    # UNIVERSITY NAME
    # Extracting our tag of interest for the Name
    page_links = page_soup.find_all('a', {'class': 'course-header__institution'})
    # Extracting the name of the course as string and print it
    info['universityName'] = str(page_links[0].contents[0])

    # FACULTY NAME
    page_links = page_soup.find_all('a', {'class': 'course-header__department'})
    info['facultyName'] = str(page_links[0].contents[0])

    # FULL TIME
    page_links = page_soup.find_all('a', {'class': 'inheritFont'})
    info['isItFullTime'] = str(page_links[0].contents[0])

    # SHORT DESCRIPTION
    page_links = page_soup.find('div', {'id': 'Snippet'})
    info['description'] = str(page_links.get_text(separator='\n').strip())

    # STARTING DATE
    page_links = page_soup.find('span', {'class': 'key-info__start-date'})
    info['startDate'] = str(page_links.get_text())

    # FEES
    page_links = page_soup.find('a', {'class': 'noWrap inheritFont'})
    page_links = page_soup.find('div', {'class': 'course-sections__fees'})
    if page_links:
        fees_text = page_links.get_text(separator='\n').strip()
        # Remove "Fees" from the text content
        info['fees'] = fees_text.replace('Fees', '').strip()


    # MODALITY
    page_links = page_soup.find('span', {'class': 'key-info__content key-info__qualification py-2 pr-md-3 text-nowrap d-block d-md-inline-block'})
    # Get all elements within the span using find_all
    elementsWithinSpan = page_links.find_all('a')
    info['modality'] = ' '.join([element.text.strip() for element in elementsWithinSpan])

    # DURATION
    page_links = page_soup.find('span', {'class':'key-info__content key-info__duration py-2 pr-md-3 d-block d-md-inline-block'})
    info['duration']=str(page_links.text)

    # CITY
    page_links = page_soup.find('a', {'class':'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__city'})
    info['city']=str(page_links.text)

    # ADMINISTRATION
    page_links = page_soup.find('span', {'class':'course-header__online-flag badge bg-white p-2 h6 shadow-sm mr-1'})
    if page_links == None:
        info['administration'] = "On Campus"
    else:
        info['administration']=str(page_links.text)

    # COUNTRY
    page_links = page_soup.find('a', {'class':'card-badge text-wrap text-left badge badge-gray-200 p-2 m-1 font-weight-light course-data course-data__country'})
    info['country']=page_links.text

    #URL
    page_links = page_soup.find('link')
    info['url'] = page_links.get('href')


    return(pd.DataFrame([info]))


*Searching for a tipical hidden file and delete it in order not to have it when exctracting the information from the folder*

In [None]:
parser('/content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_1/course_1.html')

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...


In [None]:
file_path_to_remove = '/content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/.DS_Store'
# Delete the hidden file
os.remove(file_path_to_remove)
print(f"The file {file_path_to_remove} has been deleted.")

In [None]:
len(parsed_dfs)

5979

*Applying the function to our html pages*

In [None]:
# Specifing the path of the folder
folder_path = '/content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages'

# List all files in the folder
files = os.listdir(folder_path)

parsed_dfs = []

# Iterate through all folders and subfolders using os.walk
for folder_path, _, file_names in os.walk(folder_path):
    # Check if there are files in the current folder
    if file_names:
        # Iterate through each file in the current folder
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)

            # Store the information only is the dictionary is not empty (has at list a name course)
            try:
                # Parse the file and append the result to the list
                parsed_df = parser(file_path)
                parsed_dfs.append(parsed_df)
            except Exception as e:
                # Print the file path when an exception occurs
                print(f"Error parsing file: {file_path}")
                # print(f"Error details: {e}")

# Concatenate all DataFrames in the list
concatenated_df = pd.concat(parsed_dfs, ignore_index=True)


Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_118/course_1765.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_119/course_1772.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_128/course_1910.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_140/course_2086.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_196/course_2929.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_196/course_2931.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_215/course_3213.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_291/course_4357.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_293/course_4395.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_2

In [None]:
concatenated_df['url'][3]

'https://www.findamasters.com/masters-degrees/course/applied-economics-banking-and-financial-markets-online-msc/?i280d8352c56675'

In [None]:
# Display the concatenated DataFrame
concatenated_df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
1,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Full time,Up to 7 million people are estimated to die ev...,September,"UK: £12,500 (Total) \nInternational: £28,750 (...",MSc,"1 year full time, 2 or 3 years part-time",Leeds,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2,Analytical Toxicology MSc,King’s College London,Faculty of Life Sciences & Medicine,Full time,The Analytical Toxicology MSc is a unique stud...,See Course,Please see the university website for further ...,MSc,Full-time: One year,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
3,Applied Computer Science and Artificial Inte...,University of Bradford,Faculty of Engineering & Digital Technologies,Full time,Computer science is the foundation of many exc...,"September, January",Please see the university website for further ...,MSc,1 Year Full Time / 2 Years Part Time,Bradford,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
4,Applied Economics (Banking and Financial Mar...,University of Bath,University of Bath Online,Part time,From political uncertainty to finance and recr...,"September, January",Cost per 10 credits £722* (10% alumni discount...,MSc,2 years and 6 months full time,Bath,Online,United Kingdom,https://www.findamasters.com/masters-degrees/c...


In [None]:
concatenated_df.shape

(5979, 13)

In [None]:
print(6000-5979, 'were not valid pages')

21 were not valid pages


In [None]:
parsed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   courseName      1 non-null      object
 1   universityName  1 non-null      object
 2   facultyName     1 non-null      object
 3   isItFullTime    1 non-null      object
 4   description     1 non-null      object
 5   startDate       1 non-null      object
 6   fees            1 non-null      object
 7   modality        1 non-null      object
 8   duration        1 non-null      object
 9   city            1 non-null      object
 10  administration  1 non-null      object
 11  country         1 non-null      object
 12  url             1 non-null      object
dtypes: object(13)
memory usage: 232.0+ bytes


In [None]:
# Specify the path where you want to save the .tsv file
tsv_file_path = '/content/drive/MyDrive/HM3-ADM/HW3_ADM/MasterDegrees.tsv'

# Save the DataFrame to a .tsv file
concatenated_df.to_csv(tsv_file_path, sep='\t', index=False)

print(f".tsv file saved at: {tsv_file_path}")

.tsv file saved at: /content/drive/MyDrive/HM3-ADM/HW3_ADM/MasterDegrees.tsv


### *saving singularly the information about each master (html page)*

In [None]:

# Iterate through each DataFrame in the list
for index, parsed_df in enumerate(parsed_dfs):
    # Iterate through each row in the DataFrame
    for row_index, row in parsed_df.iterrows():
        # Replace NaN values with empty strings
        single_row = row.fillna(' ')
        # Specify the path where you want to save the .tsv file for the current row
        tsv_file_path = f'/content/drive/MyDrive/HM3-ADM/HW3_ADM/Courses/course_{index}.tsv'
        # Save the single row DataFrame to a .tsv file
        with open(tsv_file_path, 'w') as file:
            file.write('\t'.join(single_row))
        # Uncomment to see all the tsv files
        #print(f".tsv file for row {index}, row {row_index} saved at: {tsv_file_path}")

#Preproccessing

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import *

# Download the stopwords dataset if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#[2.0.0]

Uploading the file created after parsing the information

In [177]:
df = pd.read_csv('/content/drive/MyDrive/HM3-ADM/HW3_ADM/MasterDegrees.tsv',sep='\t')
df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Full time,Up to 7 million people are estimated to die ev...,September,"UK: £12,500 (Total) \nInternational: £28,750 (...",MSc,"1 year full time, 2 or 3 years part-time",Leeds,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
1,Analytical Toxicology MSc,King’s College London,Faculty of Life Sciences & Medicine,Full time,The Analytical Toxicology MSc is a unique stud...,See Course,Please see the university website for further ...,MSc,Full-time: One year,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2,Applied Computer Science and Artificial Inte...,University of Bradford,Faculty of Engineering & Digital Technologies,Full time,Computer science is the foundation of many exc...,"September, January",Please see the university website for further ...,MSc,1 Year Full Time / 2 Years Part Time,Bradford,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
3,Applied Economics (Banking and Financial Mar...,University of Bath,University of Bath Online,Part time,From political uncertainty to finance and recr...,"September, January",Cost per 10 credits £722* (10% alumni discount...,MSc,2 years and 6 months full time,Bath,Online,United Kingdom,https://www.findamasters.com/masters-degrees/c...
4,Applied Linguistics - MSc,University of Glasgow,College of Arts & Humanities,Full time,This Masters focuses on how linguistic researc...,September,Please see the university website for further ...,MSc,1 year full-time; 2 years part-time,Glasgow,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...


## Stemming

In [80]:
# Make a copy of the original DataFrame
processed_df = df.copy()

# stemmer
stemmer = PorterStemmer()

processed_df['description'] = processed_df.description.apply(lambda row: ' '.join([stemmer.stem(word) for word in row.split(' ')]))

##Lower, removing punctuation and stepwords

In [81]:
stop_words = set(stopwords.words('english'))

# Function to remove stopwords and punctuation from a text
def clean(text):
    """
    The following function returns the filtered element for each column of a dataframe.
    Filtering operation consists in removing punctuation and removing stopwords given text with lower case
    """
    words = word_tokenize(str(text))
    # Remove punctuation using NLTK and string.punctuation
    filtered_words = [word for word in words if word not in string.punctuation + "’"]
    # Remove stopwords
    filtered_words = [word for word in filtered_words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Selecting columns
text_columns = processed_df.select_dtypes(include='object').columns
# Lower
processed_df[text_columns] = processed_df[text_columns].apply(lambda x: x.str.lower() if x.dtype == 'O' else x)
# Applying clean function to all the valid columns
processed_df[text_columns] = processed_df[text_columns].applymap(clean)


In [97]:
processed_df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,air quality solutions msc,university leeds,institute transport studies,full time,"[7, million, peopl, estim, die, everi, year, d...",september,"uk £12,500 total international £28,750 total",msc,1 year full time 2 3 years part-time,leeds,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
1,analytical toxicology msc,king college london,faculty life sciences medicine,full time,"[analyt, toxicolog, msc, uniqu, studi, cours, ...",see course,please see university website information fees...,msc,full-time one year,london,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
2,applied computer science artificial intelligen...,university bradford,faculty engineering digital technologies,full time,"[comput, scienc, foundat, mani, excit, cutting...",september january,please see university website information fees...,msc,1 year full time 2 years part time,bradford,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
3,applied economics banking financial markets on...,university bath,university bath online,part time,"[polit, uncertainti, financ, recruit, demands,...",september january,cost per 10 credits £722 10 alumni discount pa...,msc,2 years 6 months full time,bath,online,united kingdom,https //www.findamasters.com/masters-degrees/c...
4,applied linguistics msc,university glasgow,college arts humanities,full time,"[thi, master, focus, linguist, research, help,...",september,please see university website information fees...,msc,1 year full-time 2 years part-time,glasgow,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...


#[2.0.1]

 ## we want the field fees to collect numeric information


We consider only *fees* column and deepen what we find

In [None]:
raw_fees= pd.DataFrame(processed_df['fees'])

In [None]:
raw_fees.head()

Unnamed: 0,fees
0,please see university website information fees...
1,"uk £12,500 total international £28,750 total"
2,please see university website information fees...
3,please see university website information fees...
4,cost per 10 credits £722 10 alumni discount pa...


there are lot of missings data or more complex information we need to filter. To filter we follow the given guidline:


> in case of multiple information, retrieve only the highest fees


In [None]:
pattern = r' (?P<symbol>[^\d\s])(?P<value>\d{1,3}(?:,\d{3})*(?:\.\d{2})?)'

def return_cost(stri):
    """
    this function returns a string built up by the currency and
    the maximum number it appears in the string given in input
    """
    # Search for the patterns in the string (currency, values)
    match = re.findall(pattern, stri)

    if not match:
        return None

    # Store all values
    v = [value for currency, value in match]
    # Extract the maximum
    max_v = max(v)
    # Merge the currency with the value
    merged_list = f'{match[0][0]}{max_v}'
    return merged_list


text_columns = processed_df.select_dtypes(include='object').columns
# Return the list of currency symbols and costs encountered in 'fees' field
raw_fees['fees'] = raw_fees['fees'].apply(lambda x: return_cost(x) if x is not None else None)




###here we have filter fees column with only one cost each row, where available

In [None]:
raw_fees.head()

Unnamed: 0,fees
0,
1,"£28,750"
2,
3,
4,£722


In [None]:
# Your ExchangeRate-API key
api_key = '40f223580924eaf7a1eb4ee0'

# Fetch exchange rates from the API for all currencies against USD
api_url = f'https://open.er-api.com/v6/latest/USD?apikey={api_key}'
response = requests.get(api_url)
data = response.json()
exchange_rates = data['rates']

# Define a mapping between currency symbols in your data and API symbols
currency_symbol_mapping = {
    '£': 'GBP',
    '€': 'EUR',
    '$': 'USD',
    '¥': 'JPY'
    # Add more currency symbols as needed
}

# Function to convert any currency to the common currency (USD in this case)
def convert_to_common_currency(row, target_currency='USD'):
    if pd.isna(row['fees']):
        return None
    try:
        # Extract the currency symbol from the string
        currency_symbol = row['fees'][0]

        # Map the currency symbol to the API symbol
        api_currency_symbol = currency_symbol_mapping.get(currency_symbol)

        if not api_currency_symbol:
            return None

        # Extract the exchange rate from the pre-fetched rates
        exchange_rate = exchange_rates[api_currency_symbol]

        # Remove the currency symbol and commas, then convert to float
        amount = float(row['fees'][1:].replace(',', ''))

        # Convert to USD using the obtained exchange rate
        amount_target_currency = amount/(exchange_rate)
        return round(amount_target_currency,2)

    except Exception as e:
       return None


In [None]:
# Apply the conversion function to the 'fees' column
raw_fees['fees'] = raw_fees.apply(convert_to_common_currency, axis=1)

# Rename the column to indicate the common currency
common_currency_code = 'USD'
raw_fees.rename(columns={'fees': f'fees ({common_currency_code})'}, inplace=True)

In [None]:
raw_fees.head()

Unnamed: 0,fees (USD)
0,35778.51
1,
2,
3,898.51
4,


sustituing and renaming the new column with the appropriate currance values

In [None]:
processed_df['fees']= raw_fees['fees (USD)']
processed_df.rename(columns={'fees': 'fees (USD)'}, inplace=True)


In [None]:
processed_df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees (USD),modality,duration,city,administration,country,url
0,air quality solutions msc,university leeds,institute transport studies,full time,7 million peopl estim die everi year due harm ...,september,35778.51,msc,1 year full time 2 3 years part-time,leeds,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
1,analytical toxicology msc,king college london,faculty life sciences medicine,full time,analyt toxicolog msc uniqu studi cours integr ...,see course,,msc,full-time one year,london,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
2,applied computer science artificial intelligen...,university bradford,faculty engineering digital technologies,full time,comput scienc foundat mani excit cutting-edg t...,september january,,msc,1 year full time 2 years part time,bradford,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...
3,applied economics banking financial markets on...,university bath,university bath online,part time,polit uncertainti financ recruit demands econo...,september january,898.51,msc,2 years 6 months full time,bath,online,united kingdom,https //www.findamasters.com/masters-degrees/c...
4,applied linguistics msc,university glasgow,college arts humanities,full time,thi master focus linguist research help solv l...,september,,msc,1 year full-time 2 years part-time,glasgow,campus,united kingdom,https //www.findamasters.com/masters-degrees/c...


#[2.1.1]

In [91]:
vocabulary = set()

In [93]:
processed_df['description']= processed_df.description.apply(lambda row: word_tokenize(row))

0    [7, million, peopl, estim, die, everi, year, d...
1    [analyt, toxicolog, msc, uniqu, studi, cours, ...
2    [comput, scienc, foundat, mani, excit, cutting...
3    [polit, uncertainti, financ, recruit, demands,...
4    [thi, master, focus, linguist, research, help,...
Name: description, dtype: object

In [102]:
processed_df.description.head()

0    [7, million, peopl, estim, die, everi, year, d...
1    [analyt, toxicolog, msc, uniqu, studi, cours, ...
2    [comput, scienc, foundat, mani, excit, cutting...
3    [polit, uncertainti, financ, recruit, demands,...
4    [thi, master, focus, linguist, research, help,...
Name: description, dtype: object

In [105]:
processed_df.description.apply(lambda row: [vocabulary.add(word) for word in row])


0       [None, None, None, None, None, None, None, Non...
1       [None, None, None, None, None, None, None, Non...
2       [None, None, None, None, None, None, None, Non...
3       [None, None, None, None, None, None, None, Non...
4       [None, None, None, None, None, None, None, Non...
                              ...                        
5974    [None, None, None, None, None, None, None, Non...
5975    [None, None, None, None, None, None, None, Non...
5976    [None, None, None, None, None, None, None, Non...
5977    [None, None, None, None, None, None, None, Non...
5978    [None, None, None, None, None, None, None, Non...
Name: description, Length: 5979, dtype: object

In [109]:
print('peopl' in vocabulary)

True


In [107]:
vocabulary

{'status',
 'strategical',
 'multi-million',
 'mdp',
 'registration',
 'graduates/gradu',
 'oral',
 'supramolecular',
 'embedded',
 'transnational',
 'université',
 'root',
 'jump-start',
 'inact',
 'football',
 'launching',
 'earth',
 'constitution',
 'online-preparatori',
 'governmental',
 'people-orient',
 'defense',
 'medieval',
 'must',
 'linguistics',
 'realiz',
 'urbanist',
 '1-year',
 'especially',
 'instructions',
 'phonology',
 'compel',
 'view',
 'microm',
 'eat',
 'digital-bas',
 'diseas',
 'dangers',
 'lak',
 'engagingly',
 'tomorrow',
 'ngo/charit',
 'adversity',
 'strive',
 'weight',
 'extra-curricular',
 'rates',
 'demography',
 'assist',
 'roy',
 'geriatr',
 'biomechanical',
 'rpl',
 'qualify',
 'arithmet',
 'effici',
 'realis',
 'not-for-profit',
 'manhattan',
 'accru',
 'distinct',
 'startup',
 'norms',
 'watt',
 'well-suit',
 'wertschöpfung',
 'webchats',
 'pocket',
 'domin',
 'clinical/psychotherapi',
 'intak',
 'vulner',
 'guarante',
 'goddard',
 'offer',
 'matter

 Assign unique ID to each term in the vocabulary

In [46]:
vocabulary_list = list(vocabulary)
vocabulary_dict = {word: index for index, word in enumerate(vocabulary_list)}
vocabulary_df=pd.DataFrame(list(vocabulary_dict.items()), columns=['Word', 'Id'])
print(vocabulary_df.head())

            Word  Id
0         status   0
1    strategical   1
2  multi-million   2
3            mdp   3
4   registration   4


save it

In [53]:
vocabulary_df.to_csv('/content/drive/MyDrive/HM3-ADM/HW3_ADM/vocabulary.csv', index=False, header=False)
#remove header to keep the name of the columns

In [51]:
# Store in a json llike a pandas dataframe
vocabulary_df.to_json('/content/drive/MyDrive/HM3-ADM/HW3_ADM/vocabulary.json', orient='records')

In [52]:
# Write the dictionary to the JSON file directly
with open('/content/drive/MyDrive/HM3-ADM/HW3_ADM/vocabulary.json', 'w') as jsonfile:
    json.dump(vocabulary_dict, jsonfile)

In [163]:
vocabulary_reverse = vocabulary_df.copy()
print(vocabulary_reverse.head())

            Word  Id
0         status   0
1    strategical   1
2  multi-million   2
3            mdp   3
4   registration   4


In [164]:
from tqdm import tqdm
tqdm.pandas()
vocabulary_reverse['reverse'] = vocabulary_reverse.Word.progress_apply(lambda item: list(df.loc[df.description.apply(lambda row: item in row)].index))

100%|██████████| 12557/12557 [01:39<00:00, 125.67it/s]


In [119]:
print('million' in vocabulary_reverse['Word'].values)

True


In [165]:
vocabulary_reverse.head()

Unnamed: 0,Word,Id,reverse
0,status,0,"[383, 487, 488, 492, 576, 613, 629, 700, 1008,..."
1,strategical,1,"[1575, 1804, 2009, 2279, 2886, 2985, 3236, 377..."
2,multi-million,2,"[503, 2023, 3877]"
3,mdp,3,[1439]
4,registration,4,"[56, 380, 407, 439, 601, 620, 626, 748, 857, 1..."


#### creating a dictionary

In [38]:
inverted_index = vocabulary_reverse.set_index('Id')['reverse'].to_dict()
count = 0
for key, value in inverted_index.items():
    if count < 5:
        print(f"Key: {key}, Value: {value}")
        count += 1
    else:
        break

Key: 0, Value: [383, 487, 488, 492, 576, 613, 629, 700, 1008, 1505, 1693, 1813, 2142, 2157, 2193, 2322, 2376, 2430, 2443, 2465, 2635, 2648, 3060, 3061, 3077, 3238, 3315, 3946, 4604, 4978, 5042, 5074, 5115, 5206, 5248, 5258, 5266, 5267, 5271, 5272, 5274, 5317]
Key: 1, Value: [1575, 1804, 2009, 2279, 2886, 2985, 3236, 3770, 4066, 4294, 4330, 4773, 4844, 5922]
Key: 2, Value: [503, 2023, 3877]
Key: 3, Value: [1439]
Key: 4, Value: [56, 380, 407, 439, 601, 620, 626, 748, 857, 1113, 1338, 1413, 1466, 1468, 1471, 1475, 1479, 1827, 1985, 1992, 1993, 2113, 2597, 2656, 2723, 2774, 2993, 3003, 3060, 3070, 3077, 3195, 3445, 4205, 4829, 4999, 5144, 5293, 5316, 5358, 5359, 5360, 5363, 5386, 5582, 5590, 5647, 5968]


In [39]:
with open('/content/drive/MyDrive/HM3-ADM/HW3_ADM/inverted_index.json', 'w') as file:
    json.dump(inverted_index, file)

##[2.1.2]

selecting only the rows we need to return as output

In [179]:
df_query = df[['courseName','universityName','description','url']].copy()
df_query.head()

Unnamed: 0,courseName,universityName,description,url
0,Air Quality Solutions - MSc,University of Leeds,Up to 7 million people are estimated to die ev...,https://www.findamasters.com/masters-degrees/c...
1,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique stud...,https://www.findamasters.com/masters-degrees/c...
2,Applied Computer Science and Artificial Inte...,University of Bradford,Computer science is the foundation of many exc...,https://www.findamasters.com/masters-degrees/c...
3,Applied Economics (Banking and Financial Mar...,University of Bath,From political uncertainty to finance and recr...,https://www.findamasters.com/masters-degrees/c...
4,Applied Linguistics - MSc,University of Glasgow,This Masters focuses on how linguistic researc...,https://www.findamasters.com/masters-degrees/c...


In [181]:
df_query.shape

(5979, 4)

writing a function which extract all documents were all words are met

In [185]:
def engine(query):
    """
    Given a query made up by multiple word it returns the documents were ALL the word are found
    """
    doc_set_indexes = []
    words_in_query = query.split()

    for word in words_in_query:
        # Stemming the word
        stemmed_word = stemmer.stem(word)

        # Check if the stemmed word exists in the 'Word' column after applying stemming
        if stemmed_word in vocabulary_reverse['Word'].apply(stemmer.stem).values:
            # Get the document set indexes for the stemmed word
            indexes_for_word = vocabulary_reverse[vocabulary_reverse['Word'].apply(lambda x: x == stemmed_word)]['reverse'].values

            # Flatten the lists in 'reverse' column
            flattened_indexes = [item for sublist in indexes_for_word for item in sublist]

            # Append the flattened document set indexes to the list
            doc_set_indexes.append(flattened_indexes)
           # print(doc_set_indexes)

        else:
            print(f"Stemmed word '{stemmed_word}' not found in vocabulary_reverse")

    # Find the intersection of all document sets
    selected_doc = list(set.intersection(*map(set, doc_set_indexes)))

    # Select rows using iloc
    selected_rows = df.iloc[selected_doc]

    return selected_rows


In [186]:
engine('advance knowledge')

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
1,Analytical Toxicology MSc,King’s College London,Faculty of Life Sciences & Medicine,Full time,The Analytical Toxicology MSc is a unique stud...,See Course,Please see the university website for further ...,MSc,Full-time: One year,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
5,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"UK: £18,000 (Total) \nInternational: £34,750 (...",MSc,1 year full time,Leeds,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
8,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,Join us for an online session for prospective ...,September,Please see the university website for further ...,MSc,One year FT,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2056,Engineering with Management - MSc,University of Bristol,Faculty of Engineering,Full time,The MSc in Engineering with Management is desi...,September,Please see the university website for further ...,MSc,1 year full-time,Bristol,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2069,Enhanced Radiotherapy and Oncology Practice ...,Sheffield Hallam University,Postgraduate Courses,Part time,Enhance your knowledge of specific radiotherap...,"September, January",For part-time study the fee will be calculated...,MSc,3 years part time distance learning,Sheffield,Online,United Kingdom,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,Engineering Management MSc,Middlesex University,Faculty of Science and Technology,Full time,The career of an engineering project manager i...,October,"UK students \nFull-time students: £10,500 \nPa...",MSc,1 year full-time; 2 years part-time,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
4083,M2 Research in Management and Innovation,Université Côte d’Azur,UCA International MSc Program,Full time,"Our modern industry structures, value chains a...",September,The tuition currently stands at €243 as well a...,MSc,2 years,Nice,On Campus,France,https://www.findamasters.com/masters-degrees/c...
2038,Engineering Management MSc,University of Greenwich,School of Engineering,Full time,Extend and develop your skills and build a car...,"September, January",Please see the university website for further ...,MSc,"1 years full-time, 2 years part-time",London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2042,Engineering Management via Study Centres - MSc,University of Sunderland,Faculty of Technology,Full time,This course will broaden the knowledge and exp...,See Course,Please see the university website for further ...,MSc,See course dates on website,Sunderland,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
