In [9]:
import pandas as pd
import re
import html


def remove_html_tags(text):
    """Strip anything that looks like an HTML tag out of ``text``.

    A "tag" is the shortest run that starts at ``<`` and ends at the next
    ``>`` on the same line (``.`` does not match a newline here). Every such
    run is deleted; all other characters are returned unchanged.
    """
    return re.sub('<.*?>', '', text)


def remove_urls(text):
    """Delete every URL-like token from ``text``.

    Any run of non-whitespace characters that begins with ``http`` is
    removed; the whitespace surrounding it is left in place.
    """
    url_pattern = re.compile(r'http\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)


def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def preprocess_text(text):
    """Clean a single piece of text.

    The steps run in a fixed order: HTML tags are removed first, then URLs,
    and finally the result is converted to lower case.
    """
    without_tags = remove_html_tags(text)
    without_urls = remove_urls(without_tags)
    return without_urls.lower()


def process_parquet_file(input_parquet_file, output_parquet_file,
                         years=range(2013, 2024), samples_per_year=5000,
                         random_state=100):
    """Clean the review text, sample reviews per year and save the result.

    Reads ``input_parquet_file``, preprocesses the ``comments`` column with
    ``preprocess_text``, derives a ``year`` column from ``date``, keeps only
    rows whose year is in ``years``, draws a random sample from each year and
    writes the sampled frame to ``output_parquet_file``.

    Args:
        input_parquet_file: Path of the Parquet file to read. It must contain
            ``comments`` and ``date`` columns.
        output_parquet_file: Path the processed Parquet file is written to.
        years: Iterable of calendar years to keep (default 2013-2023).
        samples_per_year: Maximum number of rows drawn from each year. A year
            with fewer rows contributes all of its rows instead of raising.
        random_state: Seed passed to ``DataFrame.sample`` for every year, so
            each year is sampled reproducibly and independently.

    Raises:
        Exception: Wraps any error raised while reading, processing or
            writing; the original error is attached as ``__cause__``.
    """
    try:
        # Read the Parquet file into a DataFrame
        df = pd.read_parquet(input_parquet_file)

        # Apply preprocessing to the text column. Missing comments are
        # replaced with '' first: astype(str) alone would turn them into the
        # literal text 'None'/'nan', which would then be stored as a review.
        df['comments'] = df['comments'].fillna('').astype(str)
        df['comments'] = df['comments'].apply(preprocess_text)

        # Convert date to datetime and extract year, then keep only the
        # requested years.
        df['date'] = pd.to_datetime(df['date'])
        df['year'] = df['date'].dt.year
        df = df[df['year'].isin(list(years))].reset_index(drop=True)

        # Sample each year separately. Iterating over the groups (instead of
        # groupby.apply) keeps the 'year' column in the result regardless of
        # how pandas treats grouping columns inside apply, and re-using the
        # same seed per group reproduces the per-year samples exactly.
        yearly_samples = [
            group.sample(n=min(len(group), samples_per_year),
                         random_state=random_state)
            for _, group in df.groupby('year')
        ]
        if yearly_samples:
            df = pd.concat(yearly_samples, ignore_index=True)

        # Save the processed DataFrame as a new Parquet file
        df.to_parquet(output_parquet_file, index=False)
        print(f"Processed data saved as Parquet file: {output_parquet_file}")

    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Error occurred during Parquet file processing: {e}") from e
    

def main():
    """Run ``process_parquet_file`` on the hard-coded input and output paths.

    Any exception raised during processing is caught and reported with
    ``print`` rather than propagated to the caller.
    """
    source_path = "../data/paris_reviews.parquet"
    target_path = "../data/paris_reviews_preprocessed.parquet"

    try:
        process_parquet_file(source_path, target_path)
    except Exception as err:
        print(f"Error occurred during data processing: {err}")




In [2]:
# Run the pipeline only when this code executes as the main program.
if __name__ == "__main__":
    main()

Error occurred during data processing: Error occurred during Parquet file processing: expected string or bytes-like object, got 'NoneType'


In [10]:
# Path of the raw reviews file, defined at top level because main() keeps
# its own copy in a local variable.
input_parquet_file = "../data/paris_reviews.parquet"

In [11]:
# Load the raw reviews into a DataFrame for interactive inspection.
df = pd.read_parquet(input_parquet_file)

In [12]:
# (rows, columns) of the frame as loaded, before any filtering or sampling.
df.shape

(1406845, 6)

In [13]:
        # Apply preprocessing to the text column. Missing comments are
        # replaced with '' first: astype(str) alone would turn them into the
        # literal text 'None'/'nan'.
        df['comments'] = df['comments'].fillna('').astype(str)
        df['comments'] = df['comments'].apply(preprocess_text)

        # Convert date to datetime and extract year. We also consider reviews from 2013-2023 with 5000 reviews from each year.
        df['date'] = pd.to_datetime(df['date'])
        df['year'] = df['date'].dt.year
        df = df[df['year'].isin([2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023])].reset_index(drop=True)

        # Sample each year separately by iterating over the groups rather
        # than using groupby.apply, so the 'year' column is kept no matter
        # how pandas treats grouping columns inside apply. A year with fewer
        # than 5000 rows contributes all of its rows.
        yearly_samples = [
            group.sample(n=min(len(group), 5000), random_state=100)
            for _, group in df.groupby('year')
        ]
        if yearly_samples:
            df = pd.concat(yearly_samples, ignore_index=True)

In [14]:
# Display the filtered and sampled frame produced by the previous cell.
df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,year
0,599419,4592866,2013-05-14,5141358,Oksana,i booked lilian's apartment for my parents - t...,2013
1,1679711,8743133,2013-11-15,3674456,Wei Chian,it was a last minute reservation (day before) ...,2013
2,1780833,9186405,2013-12-11,1820498,Jeff,"what a wonderful flat. well-located, perfectly...",2013
3,111270,5275567,2013-06-22,4801752,Juliane,we had a wonderful time at francois' place. th...,2013
4,874864,7792037,2013-10-03,8083127,Danny,audrey is a wonderful host and her apartment i...,2013
...,...,...,...,...,...,...,...
54995,607213527992148908,810015717863921137,2023-01-22,305984574,Myriam,merci pour votre accueil.,2023
54996,24538105,828061132627903355,2023-02-16,220718628,Renata,"the stay was overall okay, get what we’ve paid...",2023
54997,54390678,815078458292521587,2023-01-29,326207448,Anna,a nice place to spend a few days in paris. jus...,2023
54998,25180580,795483123881965037,2023-01-02,191056209,Trin,guillaume is super wonderful host. we got a sh...,2023


In [15]:
# Load the preprocessed Parquet file (the output path used in main()).
ee = pd.read_parquet('../data/paris_reviews_preprocessed.parquet')

In [17]:
# Count the rows per year in the saved file.
ee.year.value_counts()

year
2013    5000
2014    5000
2015    5000
2016    5000
2017    5000
2018    5000
2019    5000
2020    5000
2021    5000
2022    5000
2023    5000
Name: count, dtype: int64