In [None]:
!pip install pandas==2.1.4
!pip install modin[ray]==0.31.0 ray==2.35.0
!pip install vaex

In [None]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import ray
import time
import os
import yaml
import vaex

In [None]:
# Define file path
file_path = '/content/Airline Dataset.csv'


def _timed_read(reader, path):
    """Call ``reader(path)`` and return ``(result, elapsed_seconds)``.

    ``time.perf_counter`` is used because it is monotonic and has the highest
    available resolution; ``time.time`` is a wall clock that can jump when the
    system clock is adjusted.
    """
    started = time.perf_counter()
    result = reader(path)
    return result, time.perf_counter() - started


# Start Ray before any measurement so that engine start-up is not counted as
# Modin read time. `ignore_reinit_error` keeps this cell safe to re-run.
ray.init(ignore_reinit_error=True)

# Reading with Pandas
df_pandas, pandas_time = _timed_read(pd.read_csv, file_path)

# Reading with Dask
# NOTE: dd.read_csv is lazy - it only samples the file to infer dtypes and
# builds a task graph, so this figure is NOT comparable to the eager reads.
# The frame is deliberately left lazy to avoid holding another full copy of
# the data in memory.
df_dask, dask_time = _timed_read(dd.read_csv, file_path)

# Reading with Modin (Ray backend)
df_modin, modin_time = _timed_read(mpd.read_csv, file_path)

# Display the times
print(f"Pandas read time: {pandas_time} seconds")
print(f"Dask read time: {dask_time} seconds")
print(f"Modin (Ray) read time: {modin_time} seconds")

In [None]:
def clean_column_names(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    and every run of characters outside ``[A-Za-z0-9]`` is replaced by a
    single underscore.

    Parameters
    ----------
    df : DataFrame-like
        Any object whose ``columns`` attribute is an assignable pandas Index.

    Returns
    -------
    The same ``df`` object, with its ``columns`` reassigned.
    """
    cleaned = (
        df.columns.astype(str)
        # Strip first: after the substitution no whitespace is left to strip.
        .str.strip()
        # regex=True is required: since pandas 2.0 the default is a literal
        # match, which would leave the labels untouched.
        .str.replace(r'[^A-Za-z0-9]+', '_', regex=True)
    )
    df.columns = cleaned
    return df

# Run the same column clean-up over the pandas, Dask and Modin frames, in that
# order, and rebind each name to the returned frame.
df_pandas, df_dask, df_modin = [
    clean_column_names(frame) for frame in (df_pandas, df_dask, df_modin)
]

In [None]:
# Describe the expected layout: the field separator plus the column names
# taken from the pandas frame.
schema = dict(separator=',', columns=list(df_pandas.columns))

# Serialise the schema and persist it as schema.yaml
with open('schema.yaml', 'w') as schema_file:
    schema_file.write(yaml.dump(schema))

In [None]:
# Read the stored schema back from disk
with open('schema.yaml', 'r') as schema_file:
    schema_yaml = yaml.safe_load(schema_file)

# Compare the frame's columns with the schema: first the count, then the
# names position by position.
expected_columns = schema_yaml['columns']
actual_columns = list(df_pandas.columns)
assert len(actual_columns) == len(expected_columns), "Column count does not match!"
assert actual_columns == expected_columns, "Column names do not match!"


In [None]:
# Destination of the exported file
output_file_path = '/content/output_file.txt.gz'

# Export the pandas frame as gzip-compressed, pipe-delimited text without the
# index column.
df_pandas.to_csv(
    path_or_buf=output_file_path,
    sep='|',
    compression='gzip',
    index=False,
)

print("File written and compressed.")

In [None]:
# Dimensions of the pandas frame
num_rows = df_pandas.shape[0]
num_columns = df_pandas.shape[1]

# Size of the written file on disk, converted from bytes to MB
file_size = os.stat(output_file_path).st_size / (1024 * 1024)

print(f"Total number of rows: {num_rows}")
print(f"Total number of columns: {num_columns}")
print(f"File size: {file_size:.2f} MB")
