## Preparing the lab environment

- Create a `datasets` directory
- Download the dataset archive into the *datasets* directory

In [2]:
# Create the datasets directory. exist_ok=True makes the cell safe to re-run
# (the shell `mkdir` reports an error when the directory already exists) and
# avoids depending on a platform-specific shell command.
import os

os.makedirs("datasets", exist_ok=True)

A subdirectory or file datasets already exists.


In [None]:
#####!gdown 18Ulneqq0CSsuPPva9F4If-OFJkn7rj4M -O datasets/sigma.zip

### Download the dataset

In [None]:
####ls -al datasets/

### Unzip the dataset

import zipfile
import os

# Location of the downloaded archive and the directory it is unpacked into
zip_path = "datasets/sigma.zip"
extract_path = "datasets/"

# The download cell is currently commented out, so the archive may be absent.
# Skip extraction with an explicit message instead of failing with a
# FileNotFoundError when the notebook is run from top to bottom.
if os.path.isfile(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Files extracted to: {extract_path}")
else:
    print(f"Archive not found, nothing extracted: {zip_path}")



## Benchmarking Various File Formats

- csv
- pickle
- feather
- parquet

### Reading the CSV File

In [3]:
import pandas as pd
import os

In [5]:
# low_memory=False makes pandas parse the whole file in one pass. By default
# (low_memory=True) the file is processed internally in chunks, which lowers
# memory use while parsing but can lead to mixed type inference.
import os

import pandas as pd

# The original location stays the default, so existing runs are unchanged.
# Set the WHOLESALE_CSV environment variable to load the file from another
# location without editing this machine-specific absolute path.
wholesale_csv_path = os.environ.get(
    "WHOLESALE_CSV",
    r'C:\Users\lavan\OneDrive\Desktop\ISB\TERM 2\CT1\Group Assignment\Wholesale customers data.csv',
)

train_df = pd.read_csv(wholesale_csv_path, low_memory=False)


In [6]:
# Show the dtype pandas inferred for each column of the loaded frame.
train_df.dtypes

Channel             int64
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
dtype: object

In [7]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


In [8]:
# (number of rows, number of columns) of the loaded frame.
train_df.shape

(440, 8)

In [9]:
# Preview the first five rows.
train_df.head(5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


### Optimizing data types

In [None]:
# While reading the data file we can enforce our own column dtypes, i.e. define what the datatype of each column has to be.
# Memory usage goes down because the last few columns need fewer bytes once their datatypes are revised and enforced.
#dtypes = {
   # "channelGrouping": "str",
    #"date": "int16",
    #"device": "str",
    #"fullVisitorId": "str",
    #"geoNetwork": "str",
    #"sessionId": "str",
    #"socialEngagementType": "str",
    #"totals": "str",
    #"trafficSource": "str",
    #"visitId": "uint16",
    #"visitNumber": "int16",
    #"visitStartTime": "uint16"
#}

In [None]:
####train_new_df = pd.read_csv("./datasets/train.csv", dtype = dtypes)

In [None]:
####train_new_df.info()

### Creating other file formats

In [None]:
# `train_new_df` is only created by the dtype-optimisation cell, which is
# commented out, so the name is undefined on a fresh kernel. Persist the frame
# that was actually loaded (`train_df`) instead.
train_df.to_pickle("./datasets/train.pkl")

In [None]:
import os

# Print the size, in megabytes, of every regular file inside "datasets"
for filename in os.listdir("datasets"):
    file_path = os.path.join("datasets", filename)
    if not os.path.isfile(file_path):
        continue  # ignore sub-directories and other non-file entries
    file_size = os.path.getsize(file_path)  # size in bytes
    file_size_mb = file_size / (1024 * 1024)  # bytes -> MB
    print(f"{filename} -> {file_size_mb:.2f} MB")




In [None]:
#this is not compatible with windows, so used the command above
#ls -al datasets/

In [None]:
# Scratch cell: re-prints one file size using `file_size` and `filename` as
# left behind by the file-listing loop in the cell above, so that cell must
# have been run first. Note that `filename` is the last directory entry listed
# while `file_size` belongs to the last regular file; the two differ if the
# final entry is not a file.
file_size
file_size_mb = file_size / (1024 * 1024)  # Convert bytes to MB
file_size_mb
print(f"{filename} -> {file_size_mb:.2f} MB")

In [None]:
# `train_new_df` is only created by the dtype-optimisation cell, which is
# commented out, so the name is undefined on a fresh kernel. Write the frame
# that was actually loaded (`train_df`) instead.
train_df.to_parquet("./datasets/train.parquet")

In [None]:
import os

# Report each regular file in "datasets" together with its size in MB
for filename in os.listdir("datasets"):
    file_path = os.path.join("datasets", filename)
    if not os.path.isfile(file_path):
        continue  # directories are not measured
    file_size = os.path.getsize(file_path)  # bytes on disk
    file_size_mb = file_size / (1024 * 1024)  # megabytes
    print(f"{filename} -> {file_size_mb:.2f} MB")


In [None]:
#this is not compatible with windows, so used the command above
#!ls -al datasets/

In [None]:
# `train_new_df` is only created by the dtype-optimisation cell, which is
# commented out, so the name is undefined on a fresh kernel. Write the frame
# that was actually loaded (`train_df`) instead.
train_df.to_feather("./datasets/train.feather")

In [None]:
import os

# Walk the "datasets" folder and print "<name> -> <size> MB" for each file
for filename in os.listdir("datasets"):
    file_path = os.path.join("datasets", filename)
    if not os.path.isfile(file_path):
        continue  # skip anything that is not a regular file
    file_size = os.path.getsize(file_path)  # raw size in bytes
    file_size_mb = file_size / (1024 * 1024)  # converted to MB
    print(f"{filename} -> {file_size_mb:.2f} MB")


In [None]:
#this is not compatible with windows, so used the command above
#!ls -al datasets/

In [None]:
### Comparing file sizes

In [None]:
##file_size = os.path.getsize('d:/file.jpg')

In [None]:
# Paths of the four serialisations of the training data being compared
filenames = [
    './datasets/train.csv',
    './datasets/train.pkl',
    './datasets/train.feather',
    './datasets/train.parquet',
]

# On-disk size of each file, in bytes, in the same order as `filenames`
all_filesizes = list(map(os.path.getsize, filenames))

# One row per file: its path ("formats") and its size in bytes ("filesize")
filescompare_df = pd.DataFrame({"formats": filenames, "filesize": all_filesizes})

In [None]:
# Folder whose files are being measured
folder_path = "datasets/"

# Collect one record (file name, size in MB rounded to 2 decimals) per file
file_info = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue  # only regular files are measured
    file_size = os.path.getsize(file_path)  # bytes
    file_size_mb = file_size / (1024 * 1024)  # megabytes
    file_info.append({"Filename": filename, "Size_MB": round(file_size_mb, 2)})

# Tabulate the records and print the table
filescompare_df = pd.DataFrame(file_info)
print(filescompare_df)

In [None]:
# Display the file-size comparison table.
filescompare_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
import os
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

# Folder whose files are compared
folder_path = "datasets/"

# One record per regular file: name, size in MB (2 decimals) and extension
file_info = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        file_size = os.path.getsize(file_path)  # bytes
        file_size_mb = file_size / (1024 * 1024)  # megabytes
        file_format = os.path.splitext(filename)[1][1:]  # extension without the leading dot
        file_info.append({"Filename": filename, "Size_MB": round(file_size_mb, 2), "Formats": file_format})

# Passing `columns` keeps the three expected columns present even when the
# folder contains no files.
filescompare_df = pd.DataFrame(file_info, columns=["Filename", "Size_MB", "Formats"])

# Bar chart of size per format; the title and axis labels let the figure
# stand on its own when the notebook is skimmed.
plt.figure(figsize=(15, 5))
ax = sn.barplot(data=filescompare_df, x='Formats', y='Size_MB')
ax.set(title="File size by format", xlabel="File format", ylabel="Size (MB)")
plt.show()


In [None]:
# Same bar chart as the previous cell. The column names must match the frame
# built there ('Formats' / 'Size_MB'); the 'formats' / 'filesize' columns no
# longer exist once `filescompare_df` is rebuilt from the folder listing,
# which is why the earlier version of this cell failed.
plt.figure( figsize = (15, 5) )
sn.barplot( data = filescompare_df,
           x = 'Formats',
           y = 'Size_MB');

### Read time benchmarking of different formats

In [None]:
%%time

# Load the pickle file written earlier in the notebook (timed by %%time).
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
train_pkl_df = pd.read_pickle("./datasets/train.pkl")

In [None]:
%%time

# Load the feather file written earlier in the notebook (timed by %%time).
train_feather_df = pd.read_feather("./datasets/train.feather")

In [None]:
# (rows, columns) of the frame read back from feather.
train_feather_df.shape

In [None]:
%%time

# Load the parquet file written earlier in the notebook (timed by %%time).
train_parquet_df = pd.read_parquet("./datasets/train.parquet")

In [None]:
import time

def _timed_read(reader, path):
    """Call ``reader(path)`` and return ``(result, elapsed_seconds)``."""
    started = time.perf_counter()
    result = reader(path)
    return result, time.perf_counter() - started


# Load each format once, recording the wall-clock read time in seconds
train_df, time_csv = _timed_read(pd.read_csv, "./datasets/train.csv")
train_pkl_df, time_pkl = _timed_read(pd.read_pickle, "./datasets/train.pkl")
train_feather_df, time_feather = _timed_read(pd.read_feather, "./datasets/train.feather")
train_parquet_df, time_parquet = _timed_read(pd.read_parquet, "./datasets/train.parquet")

In [None]:
# `filescompare_df` holds one row per file found in datasets/, in
# directory-listing order, so a fixed four-element list can mismatch in length
# or order. Look up each row's read time from its 'Formats' (file extension)
# value instead; rows with any other extension get NaN.
read_time_by_format = {
    "csv": time_csv,
    "pkl": time_pkl,
    "feather": time_feather,
    "parquet": time_parquet,
}
filescompare_df['read_time'] = filescompare_df['Formats'].map(read_time_by_format)

In [None]:
# Display the comparison table, now including the read_time column.
filescompare_df

In [None]:
# Bar chart of read time per file format. The x column is 'Formats', the name
# used when `filescompare_df` is built from the folder listing; the lowercase
# 'formats' column does not exist in that frame.
plt.figure( figsize = (15, 5) )
sn.barplot( data = filescompare_df,
           x = 'Formats',
           y = 'read_time');

## Frameworks

- datatable (https://datatable.readthedocs.io/en/latest/index.html)
- dask

In [None]:
!pip install datatable

In [None]:
# Import datatable under the alias `dt` and print the installed version.
import datatable as dt
print(dt.__version__)

In [None]:
%%time

# Read the CSV with datatable's fread (timed by the %%time magic on this cell).
train_dtable = dt.fread("./datasets/train.csv")

In [None]:
# NOTE(review): this previews the pandas frame `train_df`, not the datatable
# frame `train_dtable` read in the previous cell — confirm which was intended.
train_df.head(5)

In [None]:
# Convert the datatable frame into a pandas DataFrame.
train_dtable_df = train_dtable.to_pandas()

In [None]:
# Summarise the converted frame (columns, non-null counts, dtypes, memory).
train_dtable_df.info()

In [None]:
!pip install "dask[complete]"

In [None]:
import pandas as pd
import dask.dataframe as dd

In [None]:
%%time

# Read the parquet file through dask; .compute() evaluates the lazy dask
# dataframe and returns the materialised result (timed by %%time).
train_dask_df = dd.read_parquet("./datasets/train.parquet").compute()

In [None]:
# Summarise the frame loaded through dask.
train_dask_df.info()

In [None]:
# Preview the first five rows of the frame loaded through dask.
train_dask_df.head(5)

In [None]:
# (rows, columns) of the frame loaded through dask.
train_dask_df.shape

In [None]:
# Distinct values of the `channelGrouping` column.
# NOTE(review): `channelGrouping` is not among the columns reported by
# `train_df.dtypes` earlier in the notebook (Channel, Region, Fresh, ...) —
# confirm that train.parquet actually contains this column.
train_dask_df.channelGrouping.unique()