In [2]:
import requests
import gzip
import io
import pandas as pd

def download_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise exception if request was unsuccessful
        return response.content
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error occurred during data download: {e}")

def gzip_data(data):
    try:
        compressed_data = io.BytesIO()
        with gzip.GzipFile(fileobj=compressed_data, mode='wb') as f:
            f.write(data)
        return compressed_data.getvalue()
    except Exception as e:
        raise Exception(f"Error occurred during data compression: {e}")

def read_csv_data(gzipped_data):
    try:
        with gzip.GzipFile(fileobj=io.BytesIO(gzipped_data), mode='rb') as f:
            df = pd.read_csv(f)
        return df
    except Exception as e:
        raise Exception(f"Error occurred during CSV data reading: {e}")

def save_as_parquet(df, output_file):
    try:
        df.to_parquet(output_file)
        print(f"Data saved in Parquet format: {output_file}")
    except Exception as e:
        raise Exception(f"Error occurred during Parquet file saving: {e}")

def main():
    url = "http://data.insideairbnb.com/united-states/dc/washington-dc/2023-03-19/data/reviews.csv.gz"
    output_file = "data.parquet"

    try:
        # Download data
        data = download_data(url)

        # Gzip data
        gzipped_data = gzip_data(data)

        # Read in CSV format
        df = read_csv_data(gzipped_data)

        # Save as Parquet
        save_as_parquet(df, output_file)
    except Exception as e:
        print(f"Error occurred during data processing: {e}")



In [None]:
if __name__ == "__main__":
    main()


In [3]:
 url = "http://data.insideairbnb.com/united-states/dc/washington-dc/2023-03-19/data/reviews.csv.gz"
data = download_data(url)

In [4]:
gzipped_data = gzip_data(data)

In [5]:
df = read_csv_data(gzipped_data)

Exception: Error occurred during CSV data reading: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In [6]:
import requests
import gzip
import io
import pandas as pd

def download_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise exception if request was unsuccessful
        return response.content
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error occurred during data download: {e}")

def gzip_data(data):
    try:
        compressed_data = io.BytesIO()
        with gzip.GzipFile(fileobj=compressed_data, mode='wb') as f:
            f.write(data)
        return compressed_data.getvalue()
    except Exception as e:
        raise Exception(f"Error occurred during data compression: {e}")

def save_as_csv(gzipped_data, output_file):
    try:
        with gzip.GzipFile(fileobj=io.BytesIO(gzipped_data), mode='rb') as f:
            df = pd.read_csv(f)
        df.to_csv(output_file, index=False)
        print(f"Data saved in CSV format: {output_file}")
    except Exception as e:
        raise Exception(f"Error occurred during CSV file saving: {e}")

def main():
    url = "http://data.insideairbnb.com/united-states/dc/washington-dc/2023-03-19/data/reviews.csv.gz"
    output_file = "data.csv"

    try:
        # Download data
        data = download_data(url)

        # Gzip data
        gzipped_data = gzip_data(data)

        # Save as CSV
        save_as_csv(gzipped_data, output_file)
    except Exception as e:
        print(f"Error occurred during data processing: {e}")



In [7]:
data = download_data(url)

In [8]:
gzipped_data = gzip_data(data)

In [10]:
output_file = "data.csv"
save_as_csv(gzipped_data, output_file)

Exception: Error occurred during CSV file saving: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In [14]:
import requests
import gzip
import io
import pandas as pd

def download_data(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise exception if request was unsuccessful
        return response.content
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error occurred during data download: {e}")

def unzip_data(data):
    try:
        with gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') as f:
            unzipped_data = f.read()
        return unzipped_data
    except Exception as e:
        raise Exception(f"Error occurred during data unzipping: {e}")

def save_data(data, output_file):
    try:
        with open(output_file, 'wb') as f:
            f.write(data)
        print(f"Data saved: {output_file}")
    except Exception as e:
        raise Exception(f"Error occurred during data saving: {e}")

def convert_to_parquet(csv_file, parquet_file):
    try:
        df = pd.read_csv(csv_file)
        df.to_parquet(parquet_file)
        print(f"Data converted to Parquet: {parquet_file}")
    except Exception as e:
        raise Exception(f"Error occurred during Parquet conversion: {e}")

def main():
    url = "http://data.insideairbnb.com/united-states/dc/washington-dc/2023-03-19/data/reviews.csv.gz"
    output_file = "data/paris_reviews_unzipped_data.csv"
    parquet_file = "data/paris_reviews.parquet"

    try:
        # Download data
        data = download_data(url)

        # Unzip data
        unzipped_data = unzip_data(data)

        # Save data
        save_data(unzipped_data, output_file)

        # Convert to Parquet
        convert_to_parquet(output_file, parquet_file)
    except Exception as e:
        print(f"Error occurred during data processing: {e}")

if __name__ == "__main__":
    main()


Data saved: unzipped_data.csv
Data converted to Parquet: converted_data.parquet


In [21]:
df = pd.read_csv('/Users/piyush/Desktop/dsml_Portfolio/new_project/data/paris_reviews_unzipped_data.csv')

In [22]:
df.shape

(330237, 6)

In [24]:
dfp = pd.read_parquet('../data/paris_reviews.parquet')

In [26]:
dfp.shape

(330237, 6)

In [32]:
dfp.comments[257]

"I slept in Heather and Vasa's room only for 3 nights.\r<br/>I met them and they are very helpful and kind.\r<br/>The house is really few minutes walking from the most important monuments and museums.\r<br/>For my next time in DC I'll sure come back to them for my accomodation.\r<br/>\r<br/>Thanks Heather and Vasa, I really appreciate it.."