### Loading Dataset

In [1]:
import datasets
import pandas as pd

from datasets import load_dataset

import os
import ray

import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()

import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
local = False

if local:
    ### NOT WORKING RN
    dataset_name = "traversaal-ai-hackathon/hotel_datasets"
    local_data_dir = "hotel_datasets.hf/"

    # Load the dataset from the local directory
    dataset = load_dataset(dataset_name, split="train", data_dir=local_data_dir)

else:
    dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
    dataset

In [92]:
save_csv = False
if save_csv:
    df = pd.DataFrame(dataset["train"])
    df.to_csv("hotel_datasets.csv")

In [3]:
dataset.save_to_disk("hotel_datasets.hf")

df = pd.DataFrame(dataset)
df.to_csv("hotel_datasets.csv")

Saving the dataset (0/1 shards):   0%|          | 0/5997 [00:00<?, ? examples/s]

### Clean Data to make two tables

In [4]:
train_dataset = dataset['train']
unique_hotel_indices = {}

for idx, name in enumerate(train_dataset['hotel_name']):
    if name not in unique_hotel_indices:
        unique_hotel_indices[name] = idx

#Just save one time for the same hotel
merge_dataset = train_dataset.select(list(unique_hotel_indices.values()))

# Delete the table 1 columns
table_2_dataset = merge_dataset.remove_columns(['review_text', 'review_title'])
table_2_dataset

Dataset({
    features: ['hotel_name', 'hotel_description', 'rate', 'tripdate', 'hotel_url', 'hotel_image', 'price_range', 'rating_value', 'review_count', 'street_address', 'locality', 'country'],
    num_rows: 150
})

In [95]:
print(type(dataset["train"]))

<class 'datasets.arrow_dataset.Dataset'>


In [5]:
# Just save our interesting three columns
reduced_dataset = train_dataset.remove_columns(['hotel_description', 'rate', 'tripdate', 'hotel_url',
                                                 'hotel_image', 'price_range', 'rating_value', 'review_count',
                                                 'street_address', 'locality', 'country'])

In [6]:
@ray.remote
def process_data_point(dp):
    text = ""
    if dp['review_text']:
        text = dp['review_text']
    return dp['hotel_name'], (dp['review_title'], text)

futures = [process_data_point.remote(dp) for dp in reduced_dataset]

results = ray.get(futures)

hotel_reviews = {}
for hotel_name, review in results:
    if hotel_name not in hotel_reviews:
        hotel_reviews[hotel_name] = []
    hotel_reviews[hotel_name].append(review)

# hotel_reviews Table 1

2024-02-10 22:53:30,265	INFO worker.py:1724 -- Started a local Ray instance.


### Chunk data

In [68]:
# Have to install langchain first
import json
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text splitter
chunk_size = 500
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)

# Chunk a sample section

first_key = next(iter(hotel_reviews))
data_review = {}
li = []

for k, v in hotel_reviews.items():
    # sample_section = v
    all_reviews = ""
    for review in v:
        all_reviews += review[0] + review[1] + " - "
    data_review[k] = all_reviews
    li.append(all_reviews)
    # count += 1

# print(count)

In [82]:
chunked_dict = {}
def chunk_string(string, chunk_size):
    return [string[i:i+chunk_size] for i in range(0, len(string), chunk_size)]

chunk_size = 2500
for key, value in data_review.items():
    chunked_dict[key] = chunk_string(value, chunk_size)

# chunked_dict is for table 1 to be summarized

In [None]:
first_review = data_review[first_key]
sample_chunk = first_review[:500]

new_key = next(iter(chunked_dict))
new_first = chunked_dict[new_key]


In [86]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

summarized_dict = {}

count = 0
for k, v in chunked_dict.items():
    final_output = ""
    for chunk in v:
        summary = summarizer(v, max_length = 100, min_length = 50, do_sample=False)
        final_output += summary[0]['summary_text']
    summarized_dict[k] = final_output
    if count == 0: 
        print(k, final_output)
        break

Romance Istanbul Hotel Great location, walking distance to all of the main attractions. All the staff is very friendly and helpful. Close to the old city and Topcapi. Very nice breakfast with different menu options every day. Spa is nice and of course location is great near old city.Great location, walking distance to all of the main attractions. All the staff is very friendly and helpful. Close to the old city and Topcapi. Very nice breakfast with different menu options every day. Spa is nice and of course location is great near old city.Great location, walking distance to all of the main attractions. All the staff is very friendly and helpful. Close to the old city and Topcapi. Very nice breakfast with different menu options every day. Spa is nice and of course location is great near old city.Great location, walking distance to all of the main attractions. All the staff is very friendly and helpful. Close to the old city and Topcapi. Very nice breakfast with different menu options ev

In [89]:
print(len(str(summarized_dict[first_key]).split(" ")))

226


In [None]:
import csv 

csv_filename = 'output.csv'  # Replace with your actual file path

# Initialize an empty dictionary to store the data
csv_data = {}

# Read the CSV file and populate the dictionary
header = True
with open(csv_filename, 'r') as csvfile:
    reader = csv.reader(csvfile)

    # Read the header and initialize keys
    # header = next(reader, None)

    # Read the remaining rows and populate the dictionary
    for row in reader:
        if header:
            header = False
            continue
        # print(row)
        csv_data[row[0]] = row[1]

# Print or use the resulting dictionary
print(csv_data)

In [108]:
documents = []
for row in table_2_dataset:
    new_hotel = {}
    name = row["hotel_name"]
    review = csv_data[name]
    new_hotel["hotel_name"] = name
    new_hotel["review"] = review

    for col, value in row.items():
        new_hotel[col] = value
    documents.append(new_hotel)
print(documents[0])

{'hotel_name': 'Romance Istanbul Hotel', 'review': 'The room was absolutely amazing, we had the Royal Suite, the restaurant had very fresh and tasty food. The breakfast was excellent and again the restaurant staff were very polite and helpful. The spa soooo relaxing and clean. In one word FABULOUS!! - Great place, highly recommended. The hotel was in a great location for the palace and blocks away from the bazaar’s. Coffee down the street along with taxi drop off and pick up. Staff was friendly and welcoming. Free welcome drink and snack. Decoration is nice and homey. The hotel was very clean and the room had everything needed to have a comfortable stay. The staff was amazing and helped with everything that was needed. The Turkish breakfast every morning was very delicious, abundant and reasonable. The tea service every afternoon was free and delightful. We couldn’t have wished for a better experience. Highly recommend this hotel. Location is very good near historical places and Hagia 

### Embedding

In [109]:
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

#### VECTOR DB Downstairs

In [110]:
from qdrant_client import models, QdrantClient

In [111]:
qdrant = QdrantClient(":memory:")


qd = qdrant.recreate_collection(
    collection_name="hotels",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
    
)

qdrant.upload_records(
    collection_name="hotels",
    records=[
        models.Record(
            id=idx, vector=encoder.encode(doc["review"]).tolist(), payload=doc
        )
        for idx, doc in enumerate(documents)
    ],
)

In [114]:
hits = qdrant.search(
    collection_name="hotels",
    query_vector=encoder.encode("Good Family San Francisco Hotel with nice view").tolist(),
    limit=5,
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'hotel_name': 'Hyatt Regency San Francisco', 'review': 'The Hyatt Regency is one of the best hotels in San Francisco due to its waterfront location while still being in downtown and across from the Embarcadero and pier. It has a great Regency club at the rooftop with a great view. The food was good, but it kept coming out in small batches. The hotel is in a great location, is clean and modern, and has great staff. The hotel rooms and gym are clean andModern. Dining options are very limited. There’s a food market for snacks and beverages and a bar on site with very limited menu. The lobby bar and restaurant has been thoroughly refreshed. It feels very intimate despite being in the middle of the cavernous lobby/atrium. The atrium itself now has lots of wooden benches, sofas, and plugs for charging. In-room renovations have likewise created a comfortable environment you might want to linger in. ompany party. We dpo  - Great Service and Cleanliness - A Property In A Fantastic Location....

In [115]:
qdrant_client = QdrantClient(
    "https://2440eff1-bd4d-4080-bffd-bdb046f95447.us-east4-0.gcp.cloud.qdrant.io",
    api_key="v_cLOKp3fNY1HsUwpGRpxy_PyY3fsmARz46M8GyWfkkRrJHe_7uehQ",
)

In [116]:
qd = qdrant_client.recreate_collection(
    collection_name="hotels",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
    
)

qdrant_client.upload_records(
    collection_name="hotels",
    records=[
        models.Record(
            id=idx, vector=encoder.encode(doc["review"]).tolist(), payload=doc
        )
        for idx, doc in enumerate(documents)
    ],
)

In [139]:
hits = qdrant_client.search(
    collection_name="hotels",
    query_vector=encoder.encode("Good Family Istanbul Hotels with nice view").tolist(),
    limit=5,
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="locality",
                match=models.MatchValue(
                    value="Istanbul",
                ),
            )
        ]
    )
)

In [140]:
total = 0
for hit in hits:
    if hit.score > 0.5:
        print(hit)
total

id=29 version=2 score=0.7665979 payload={'country': 'Türkiye', 'hotel_description': 'A new meeting point for the business world in İstanbul Basin Ekspres. The 5 star Elite World Grand İstanbul Basın Ekspres Hotel will start serving in February 2017 and you will enjoy its accommodation facilities with high quality standards for your business travels or leisure getaways. Elite World Grand İstanbul Basın Ekspres, 5 minutes distance from International İstanbul Atatürk International Airport, 12 minutes distance from congress, fair and exhibition centers such as CNR Expo, IDTM and TUYAP. Our Istanbul luxury hotel is only 25 minutes away from Istanbul\'s city center and the old city. Luxurious 401 rooms and suites, 9 meeting rooms with a capacity of 2500 people, authentic examples of Turkish and international cuisines and a superb service are offered in "The Grill Restaurant," The \'\'Coffee Company\'\' where you will be eating your daily cake and sipping your coffee with pleasure, "The One B

0

### Connecting with qdrant-client endpoint

1. Qdrant-client is running
2. Am able to curl and setup a instance called QdrantClient

In [None]:
# from qdrant_client import QdrantClient
# from sentence_transformers import SentenceTransformer

# # encoder = SentenceTransformer("all-MiniLM-L6-v2")
# # encoder = SentenceTransformer("distilbert-base-nli-mean-tokens")

# client_mem = QdrantClient(":memory:")

# qdrant_client = QdrantClient(
#     "https://2440eff1-bd4d-4080-bffd-bdb046f95447.us-east4-0.gcp.cloud.qdrant.io",
#     api_key="v_cLOKp3fNY1HsUwpGRpxy_PyY3fsmARz46M8GyWfkkRrJHe_7uehQ",
# )

In [None]:
# docs = ["Qdrant has Langchain integrations", "Qdrant also has Llama Index integrations"]
# metadata = [
#     {"source": "Langchain-docs"},
#     {"source": "Linkedin-docs"},
# ]
# ids = [42, 2]

# # Use the new add method
# embedded = client_mem._embed_documents(docs)

# client_mem.add(
#     collection_name="demo_collection",
#     documents=docs,
#     metadata=metadata,
#     ids=ids
# )

# search_result = client_mem.query(
#     collection_name="demo_collection",
#     query_text="This is a query document"
# )
# print(search_result)

In [96]:
# from transformers import BartForConditionalGeneration, BartTokenizer

# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# model.save_pretrained('./bart-large-cnn')
# tokenizer.save_pretrained('./bart-large-cnn')


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./bart-large-cnn/tokenizer_config.json',
 './bart-large-cnn/special_tokens_map.json',
 './bart-large-cnn/vocab.json',
 './bart-large-cnn/merges.txt',
 './bart-large-cnn/added_tokens.json')