# Import libraries

In [None]:
# Standard library
import heapq
import json
import os
import re
import string
import time
from collections import Counter
from collections import defaultdict
from functools import reduce
from itertools import product

# Third-party
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
from tqdm import tqdm  # kept after tqdm.notebook: this later binding of `tqdm` is the one in effect
from nltk.stem import *  # star import kept last, as in the original ordering

# Import custom functions

In [None]:
import custom_functions

# Import functions

#### In functions.py there's a series of functions that we have used; this choice was made to enhance readability

In [None]:
import functions

# 1 Data collection

## [1.1] Get the list of master's degree courses

#### We create a .txt file (msc_urls.txt) in which each line holds the URL of one master's degree course.

In [None]:
# Output file path
output_file_path = 'msc_urls.txt'
# Loop through the first 400 listing pages and write the course URLs found on
# each of them to the output file, one URL per line.
# The file is opened in write mode ('w') rather than append mode: the loop
# always restarts from page 1, so appending would store a second copy of every
# URL each time the cell is re-run.
with open(output_file_path, 'w') as output_file:
    for page_number in range(1, 401):
        page_url = f'https://www.findamasters.com/masters-degrees/msc-degrees/?PG={page_number}'
        page_results = functions.extract_masters(page_url)
        output_file.writelines(f'{url}\n' for url in page_results)
        # Pause between two consecutive page requests
        time.sleep(1)

#### Each page has 15 courses, so you will end up with 6000 unique master's degree URLs. Let's check it!

In [None]:
# Load every line of the URL file and count them
with open(output_file_path) as url_file:
    lines = list(url_file)
number_of_lines = len(lines)

print(f'The file {output_file_path} contains {number_of_lines} rows.')

The file msc_urls.txt contains 6000 rows.


## [1.2] Crawl master's degree pages

In [None]:
# Function to create a directory if it doesn't exist
def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) if it is absent.

    Calling it on a directory that already exists is a no-op.

    Args:
        directory_path: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between the check and the creation in which the call could fail.
    os.makedirs(directory_path, exist_ok=True)

# Main directory for downloaded HTML files
main_directory = 'downloaded_pages'
create_directory(main_directory)

# Number of consecutive URLs grouped into the same page_<n> sub-folder
courses_per_page = 15

# Iterate through the URLs and download the HTML
with open('msc_urls.txt', 'r') as file:
    for index, url in enumerate(file, start=1):
        # Remove whitespaces and newline characters from the URL
        url = url.strip()

        # A blank line holds no URL: skip it instead of requesting ''.
        # The line still counts for `index`, so numbering of the other
        # courses is unaffected.
        if not url:
            continue

        # Generate the directory path for the current page
        page_number = (index - 1) // courses_per_page + 1
        page_directory = os.path.join(main_directory, f'page_{page_number}')
        create_directory(page_directory)

        # Generate the output HTML file path
        output_path = os.path.join(page_directory, f'course_{index}.html')

        # Download the HTML and save it to the file
        custom_functions.crawler(url, output_path)
        # Pause between two consecutive downloads
        time.sleep(3)

print("Download complete for all pages.")

Download complete for all pages.


#### Now we have a folder named 'downloaded_pages' divided into 400 folders (one for each page), and each folder contains the HTML of the 15 courses present on that page.

## [1.3] Parse downloaded pages


In [None]:
custom_functions.parser('downloaded_pages/page_1/course_1.html')

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...


## Applying the function to our html pages

We test our parsing function on a single document and then collect all the information by running the parser through all the HTML pages.

In [None]:
concatenated_df=functions.parse_html('downloaded_pages')

Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_118/course_1765.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_119/course_1772.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_128/course_1910.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_140/course_2086.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_196/course_2929.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_196/course_2931.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_215/course_3213.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_291/course_4357.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_293/course_4395.html
Error parsing file: /content/drive/MyDrive/HM3-ADM/HW3_ADM/downloaded_pages/page_2

#### At this point, you should have all the HTML documents about the master's degrees of interest, and you can start to extract specific information. Below is the information we want for each course, in the desired format.

In [None]:
# Display the concatenated DataFrame
concatenated_df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,administration,country,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
1,Air Quality Solutions - MSc,University of Leeds,Institute for Transport Studies,Full time,Up to 7 million people are estimated to die ev...,September,"UK: £12,500 (Total) \nInternational: £28,750 (...",MSc,"1 year full time, 2 or 3 years part-time",Leeds,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
2,Analytical Toxicology MSc,King’s College London,Faculty of Life Sciences & Medicine,Full time,The Analytical Toxicology MSc is a unique stud...,See Course,Please see the university website for further ...,MSc,Full-time: One year,London,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
3,Applied Computer Science and Artificial Inte...,University of Bradford,Faculty of Engineering & Digital Technologies,Full time,Computer science is the foundation of many exc...,"September, January",Please see the university website for further ...,MSc,1 Year Full Time / 2 Years Part Time,Bradford,On Campus,United Kingdom,https://www.findamasters.com/masters-degrees/c...
4,Applied Economics (Banking and Financial Mar...,University of Bath,University of Bath Online,Part time,From political uncertainty to finance and recr...,"September, January",Cost per 10 credits £722* (10% alumni discount...,MSc,2 years and 6 months full time,Bath,Online,United Kingdom,https://www.findamasters.com/masters-degrees/c...


In [None]:
concatenated_df.shape

(5979, 13)

In [None]:
print(6000-5979, 'were not valid pages')

21 were not valid pages


All the columns are categorical (object-typed) variables.

In [None]:
# Print the column names, non-null counts and dtypes of `parsed_df`.
# NOTE(review): `parsed_df` is not assigned in the cells visible above this
# one; if the summary is meant for the full table, `concatenated_df` may have
# been intended - confirm.
parsed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   courseName      1 non-null      object
 1   universityName  1 non-null      object
 2   facultyName     1 non-null      object
 3   isItFullTime    1 non-null      object
 4   description     1 non-null      object
 5   startDate       1 non-null      object
 6   fees            1 non-null      object
 7   modality        1 non-null      object
 8   duration        1 non-null      object
 9   city            1 non-null      object
 10  administration  1 non-null      object
 11  country         1 non-null      object
 12  url             1 non-null      object
dtypes: object(13)
memory usage: 232.0+ bytes


In [None]:
# Destination of the tab-separated export
tsv_file_path = 'MasterDegrees.tsv'

# Write the whole table, tab-delimited and without the index column
concatenated_df.to_csv(path_or_buf=tsv_file_path, sep='\t', index=False)

print(f".tsv file saved at: {tsv_file_path}")

.tsv file saved at: /content/drive/MyDrive/HM3-ADM/HW3_ADM/MasterDegrees.tsv


### *Saving singularly the information about each master (html page)*

In [None]:
# Make sure the destination folder exists before any file is written into it
os.makedirs('Courses', exist_ok=True)

# Iterate through each DataFrame in the list
for index, parsed_df in enumerate(parsed_dfs):
    # Iterate through each row in the DataFrame
    # NOTE(review): the file name depends only on `index`, so a DataFrame with
    # several rows would keep just its last row - presumably each DataFrame
    # holds a single course; confirm.
    for row_index, row in parsed_df.iterrows():
        # Replace NaN values with a single space, then cast every field to str
        # so that join() also works for non-text values
        single_row = row.fillna(' ').astype(str)
        # Specify the path where you want to save the .tsv file for the current row
        tsv_file_path = f'Courses/course_{index}.tsv'
        # Save the single row to a .tsv file; the explicit encoding keeps
        # characters such as '£' writable whatever the platform default is
        with open(tsv_file_path, 'w', encoding='utf-8') as file:
            file.write('\t'.join(single_row))

# [2.0]

## Preprocessing

Uploading the file created after parsing the information

In [None]:
df = pd.read_csv('MasterDegrees.tsv',sep='\t')
df.head()

NameError: ignored

## [2.0.0]

In [None]:
# Download the stopwords dataset if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

## Stemming

In [None]:
# Make a copy of the original DataFrame
processed_df = df.copy()

# stemmer
stemmer = PorterStemmer()

# Stem every space-separated word of each description.
# Missing descriptions are NaN (a float) and have no .split(); they are
# replaced by an empty string first so the whole column can be processed.
processed_df['description'] = processed_df['description'].fillna('').apply(
    lambda row: ' '.join(stemmer.stem(word) for word in row.split(' '))
)

## Lowercasing, removing punctuation and stopwords

In [None]:
# English stop-word list from NLTK, held as a set
stop_words = set(stopwords.words('english'))

# Names of the object-typed (text) columns
text_columns = processed_df.select_dtypes(include='object').columns

# Lower-case every text column
processed_df[text_columns] = processed_df[text_columns].apply(
    lambda column: column.str.lower() if column.dtype == 'O' else column
)

# 'fees' is left out of the cleaning step so that its punctuation is kept
exclude_column = 'fees'
columns_to_clean = text_columns.difference([exclude_column])

# Run functions.clean on each cell of the remaining text columns
processed_df[columns_to_clean] = processed_df[columns_to_clean].applymap(functions.clean)

We verify which elements we are filtering, to check whether something else needs to be considered. Looking at some output of the stemmed words, we noticed some additional punctuation that we do not want to keep; it was added to the characters handled by the clean function.

In [None]:
string.punctuation

We can now appreciate the differences between the two datasets (before and after preprocessing). The columns of interest to us are **fees** and **description**.

In [None]:
processed_df.head()

## [2.0.1]

 ## We want the field fees to collect numeric information


We consider only *fees* column and deepen what we find

In [None]:
raw_fees= pd.DataFrame(processed_df['fees'])

In [None]:
raw_fees.head()

There are a lot of missing values, as well as more complex entries that we need to filter. To filter them, we follow the given guideline:

>Really missing data (empty strings - 122 of them), and entries like this:
>>please see the university website for further information on fees for this course.

We decide to set these values to `None`.


In [None]:
# Number of rows whose fee is missing
print(int(raw_fees['fees'].isna().sum()))


> In case of multiple values, retrieve only the highest fee. The following example fits this rule perfectly:
>>*'uk £13,000 total international £29,000 total'*

### Processing fees

We store the rates in a file so that an updated version of the rates is available when needed.

In [None]:
# Your ExchangeRate-API key.
# NOTE(review): a credential committed in the notebook is readable by anyone
# with access to it. The key is taken from the EXCHANGE_RATE_API_KEY
# environment variable when set; the literal is only kept as a fallback so
# the cell still runs unchanged.
api_key = os.environ.get('EXCHANGE_RATE_API_KEY', '40f223580924eaf7a1eb4ee0')

# Fetch exchange rates from the API for all currencies against USD
api_url = f'https://open.er-api.com/v6/latest/USD?apikey={api_key}'
# timeout: do not hang forever on an unresponsive server;
# raise_for_status: fail here on an HTTP error instead of on data['rates']
response = requests.get(api_url, timeout=30)
response.raise_for_status()
data = response.json()
exchange_rates = data['rates']

# Define a mapping between currency symbols in your data and API symbols
# (keys are lower-case because the text is lower-cased during preprocessing)
currency_symbol_mapping = {
    '£': 'GBP',
    '€': 'EUR',
    '$': 'USD',
    '¥': 'JPY',
    'sek': 'SEK',  # Swedish krona
    'eur': 'EUR',
    'euro': 'EUR',
    'euros': 'EUR',
    # 'HNK' is not a currency code; presumably 'hkn' is a misspelling of
    # 'hkd' found in the data - TODO confirm
    'hkn': 'HKD',
    'jpy': 'JPY',
    'hkd': 'HKD',  # was 'HDK', which is not a currency code
    'isk': 'ISK',
    'hkd$': 'HKD',
    'gbp£': 'GBP',
}

# Alternation of every known currency token:
# - longest first, so 'hkd$' / 'euros' are tried before 'hkd' / 'eur';
# - escaped, so '$' is a literal dollar sign and not the end-of-string anchor.
_currency_tokens = sorted(currency_symbol_mapping, key=len, reverse=True)
_currency_alternation = '|'.join(re.escape(token) for token in _currency_tokens)

# An amount with an optional currency token before and/or after it.
# The value is either comma-grouped thousands ('13,000') or a plain run of
# digits ('13000'); the plain form used to be cut to its first three digits.
pattern = (
    r'(?P<symbol_before>' + _currency_alternation + r')?\s*'
    r'(?P<value>\d{1,3}(?:,\d{3})+|\d+)\s*'
    r'(?P<symbol_after>' + _currency_alternation + r')?'
)


Apply the function that converts every entry of the fees column into the target currency.

In [None]:
text_columns = processed_df.select_dtypes(include='object').columns
# Return the list of currency symbols and costs encountered in 'fees' field through CONVERT TO COMMON CURRENCY function.
# Missing fees are stored by pandas as NaN, which is never `None`, so the
# guard uses pd.notna to leave those rows as None instead of converting them.
raw_fees['fees (USD)'] = raw_fees['fees'].apply(lambda x: functions.return_cost(x) if pd.notna(x) else None)


`original` information and `filtered fees`

In [None]:
raw_fees.head()

Replacing the fees column with the converted values and renaming it to show the appropriate currency.

In [None]:
processed_df['fees']= raw_fees['fees (USD)']
processed_df.rename(columns={'fees': 'fees (USD)'}, inplace=True)


In [None]:
processed_df.head()

## [2.1] Conjunctive query

## [2.1.1] Create your index!

### Create a file named vocabulary, in the format you prefer, that maps each word to an integer (id).

#### First of all , let's tokenize the description to have all the words. The 'description' column in the DataFrame 'processed_df' will contain tokenized versions of the text data from the original 'description' column.

In [None]:
# Replace each description string by the list of tokens word_tokenize yields
processed_df['description'] = processed_df['description'].apply(word_tokenize)
processed_df.description.head()

In [None]:
# Collect every distinct token that appears in any description.
# A plain loop with set.update replaces the list comprehension that was run
# only for its side effect (and produced a Series of lists of None).
vocabulary = set()
for tokens in processed_df.description:
    vocabulary.update(tokens)

#### Now the set vocabulary contains all the words. In fact:

In [None]:
vocabulary

 #### Now we can assign unique ID to each term in the vocabulary. We create a df called vocabulary_df with 2 columns: Word and Id

In [None]:
# Sort the words before numbering them: the iteration order of a set of
# strings can change from one interpreter run to the next, which would give
# the same word a different id each time the notebook is re-executed and make
# previously saved vocabulary / index files inconsistent.
vocabulary_list = sorted(vocabulary)
# word -> integer id (position in the sorted list)
vocabulary_dict = {word: index for index, word in enumerate(vocabulary_list)}
# Same mapping as a two-column table
vocabulary_df = pd.DataFrame(list(vocabulary_dict.items()), columns=['Word', 'Id'])
print(vocabulary_df.head())

#### Now we can save it; we chose to save it both as JSON and as CSV.

In [None]:
# Save the vocabulary table to 'vocabulary.csv', without the index column
vocabulary_df.to_csv('vocabulary.csv', index=False, header=False)
# header=False: the column names are NOT written to the file; drop that argument to keep them

In [None]:
# Store the vocabulary table in a JSON file, as a list of row records
vocabulary_df.to_json('vocabulary.json', orient='records')

In [None]:
# Write the dictionary to the JSON file directly
# NOTE: this is the same 'vocabulary.json' path used by the to_json call in
# the previous cell; opening it with 'w' replaces that file's content.
with open('vocabulary.json', 'w') as jsonfile:
    json.dump(vocabulary_dict, jsonfile)

#### Now let's create another dataframe called 'vocabulary_reverse' which, in addition to the columns from 'vocabulary', includes an extra column named 'reverse', containing the list of documents where that word is present

In [None]:
vocabulary_reverse = vocabulary_df.copy()
print(vocabulary_reverse.head())

In [None]:
# Build all posting lists in ONE pass over the documents, instead of
# re-scanning every description once per vocabulary word.
postings = defaultdict(list)
for doc_id, tokens in processed_df.description.items():
    # set(): a document is listed once per word even when the word repeats
    for token in set(tokens):
        postings[token].append(doc_id)

tqdm.pandas()
# Documents are visited in DataFrame order, so each list holds the document
# labels in that same order; a word found in no document gets an empty list.
vocabulary_reverse['reverse'] = vocabulary_reverse.Word.progress_apply(lambda item: list(postings.get(item, [])))

In [None]:
vocabulary_reverse.head()

In [None]:
vocabulary_reverse[vocabulary_reverse['reverse'].apply(lambda x: len(x) == 0)]

#### But Inverted Index has to be a dictionary so:

In [None]:
# Turn the table into a dictionary: term id -> content of the 'reverse' column
inverted_index = vocabulary_reverse.set_index('Id')['reverse'].to_dict()

# Show only the first five entries
count = 0
for key, value in inverted_index.items():
    if count >= 5:
        break
    print(f"Key: {key}, Value: {value}")
    count += 1

#### Since you do not want to compute the inverted index every time you use the Search Engine, it is worth thinking about storing it in a separate file and loading it in memory when needed.

In [None]:
with open('inverted_index.json', 'w') as file:
    json.dump(inverted_index, file)

## [2.1.2] Execute the query

 We return documents which should contain all the words in the query

#### Selecting only the rows we need to return as output

In [None]:
df_query = df[['courseName','universityName','description','url']].copy()
df_query.head()

#### extracting documents for a specific query using the function engine

In [None]:
custom_functions.engine('advance knowledge')

## [2.2] Conjunctive query & Ranking score

## [2.2.1] Inverted index

### For each word, you want the list of documents in which it is contained and the relative tfIdf score.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### The following code computes TF-IDF values for the text in the 'description' column and stores the TF-IDF matrix in the DataFrame tfidf_data, where each row corresponds to a document, each column corresponds to a word, and the values represent TF-IDF scores.

In [None]:
# The descriptions are already lists of tokens, so the tokenizer returns its
# input as is and lower-casing is disabled. token_pattern=None states that
# the default regex is not used, which avoids scikit-learn's
# "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text, token_pattern=None) # , max_df=0.1
results = tfidf.fit_transform(processed_df['description'])
# toarray() yields a NumPy array directly; going through todense().tolist()
# built a full Python list of lists first, holding the same values.
result_dense = results.toarray()
# One row per document, one column per term
tfidf_data = pd.DataFrame(result_dense, index=processed_df.index, columns=tfidf.get_feature_names_out())
print(tfidf_data.head())

#### Creating a dictionary in the format Word:[(document1, tfIdf_{term,document1}), (document2, tfIdf_{term,document2})...

In [None]:
# word -> [(document, tf-idf score), ...] for the documents with a score > 0
inverted_index_2 = {}
for word in tfidf_data.columns:
    word_scores = tfidf_data[word]
    # Filter the column once, rather than one .loc lookup per (doc, word) pair
    positive_scores = word_scores[word_scores > 0]
    inverted_index_2[word] = list(zip(positive_scores.index.tolist(), positive_scores.tolist()))

# Print the first ten entries as a preview
count = 0
for word, doc_scores in inverted_index_2.items():
    print(f"Word: {word}")
    print("Document Scores:")
    print(doc_scores)
    count += 1
    if count == 10:
        break

#### But we need to have the word's ID as the key, not the word itself, we can take this ID from the vocabulary_df. We'll create a new dictionary that meets the requirements specified in the prompt.

In [None]:
# Word -> id lookup built once from vocabulary_df, replacing a scan of the
# whole table for every single word.
word_to_id = dict(zip(vocabulary_df['Word'], vocabulary_df['Id']))

# Same content as inverted_index_2, keyed by term id; words that have no id
# in the vocabulary are skipped.
new_inverted_index = {
    word_to_id[word]: doc_scores
    for word, doc_scores in inverted_index_2.items()
    if word in word_to_id
}

#### Let's see the result

In [None]:
# Keep only the first n entries of the index for display
n = 10
partial_inverted_index = dict(list(new_inverted_index.items())[:n])
print(partial_inverted_index)

#### Compare document with cosine similarity

Given a query, we evaluate the cosine similarity between the query and the documents extracted through the engine function. We retrieve k = 10 documents: the top k ordered by cosine similarity.

In [None]:
df_query_processed= processed_df[['courseName','universityName','description','url']].copy()

In [None]:
df_query_processed.head()

In [None]:
# The descriptions are already lists of tokens, so the tokenizer returns its
# input as is and lower-casing is disabled. token_pattern=None states that
# the default regex is not used, which avoids scikit-learn's
# "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text: text, token_pattern=None) # , max_df=0.1
results = tfidf.fit_transform(df_query_processed['description'])
# toarray() yields a NumPy array directly; going through todense().tolist()
# built a full Python list of lists first, holding the same values.
result_dense = results.toarray()
# One row per document, one column per term
tfidf_data = pd.DataFrame(result_dense, index=df_query_processed.index, columns=tfidf.get_feature_names_out())

A couple of `EXAMPLES` where we leverage the function rank_documents to accomplish the mentioned task

In [None]:
functions.rank_documents("advanced knowledge")

In [None]:
functions.rank_documents("air pollution")

# [7] Algorithmic question

The problem involves helping Leonardo create a fake report on the number of hours he worked each day for the past d days. Leonardo has the total sum of hours he worked, the number of days, and HR limitations for each day.

## Implementation of the code

In [None]:
# First input line: number of days, then total hours worked
total_days, sum_hours = (int(token) for token in input().split())

# day_min_max[i] lists every hour count allowed on day i
day_min_max = []

# One input line per day: minimum and maximum hours (both inclusive)
for _ in range(total_days):
    min1, max1 = (int(token) for token in input().split())
    day_min_max.append(list(range(min1, max1 + 1)))

def find_combinations_with_sum(nested_list, target_sum):
    """
    Tell whether a report respecting the daily limits exists, and print the
    matching reports.

    Prints 'NO' when no choice of one value per day adds up to ``target_sum``;
    otherwise prints 'YES' followed by every combination (one per line,
    values separated by spaces) whose sum equals ``target_sum``.

    Parameters
    ----------
    nested_list : list of list of int
        One inner list per day, holding the consecutive hour counts allowed on
        that day in ascending order (first item = minimum, last = maximum).
    target_sum : int
        Total number of hours the report must add up to.

    Returns
    -------
    None
    """
    # A day with no allowed value (minimum above maximum) rules out any report
    if any(len(day) == 0 for day in nested_list):
        print('NO')
        return

    # Get the minimum and maximum hours for each day.
    # These are read from the arguments, not from module-level variables, so
    # the function works for whatever input it is called with.
    min_hours = [day[0] for day in nested_list]
    max_hours = [day[-1] for day in nested_list]

    # Check if the total hours are within the valid range; when they are not,
    # no combination can match, so the enumeration below is skipped entirely.
    if not (sum(min_hours) <= target_sum <= sum(max_hours)):
        print('NO')
        return
    print('YES')

    # Iterate through all combinations of hours and print those with the target sum
    for combination in product(*nested_list):
        if sum(combination) == target_sum:
            print(*combination)

# Call the function with the provided day_min_max and sum_hours
find_combinations_with_sum(day_min_max, sum_hours)


1 1
2 3
NO



## Let's combine the time complexity of individual parts to determine the overall time complexity of the provided code:



* Initializing min_hours and max_hours: O(total_days)
* Checking the total hours range: O(total_days), since it sums the two lists
* Generating combinations: O(product of the sizes of the lists in nested_list)
* Overall, the dominant factor is the third part, the generation of combinations. Therefore, the overall time complexity is O(product of the sizes of the lists in nested_list).

#### example : if we have only two working days


let's calculate the number of combinations generated by the product function for two days, each with a range of (max - min + 1) hours

day1 = max_hour - min_hour + 1 = n1

day2 = max_hour - min_hour + 1 = n2


Complexity would be : O(n1 * n2 )

## ChatGpt Implementation



Both answers are right, but we think the complexity of the code generated by ChatGPT is better, since it uses a backtracking algorithm that avoids generating all combinations.

##  **Optimality**:


The solution can be considered suboptimal in terms of time complexity, especially when the ranges for each day are large.
It generates all combinations using itertools.product, which can be inefficient for large inputs.
An improvement could be to use a backtracking algorithm that explores valid combinations more efficiently, terminating the search as soon as a valid combination is found.