- To use code from another notebook or file, we can:
* Way 1: import the notebook as a module
1. !pip install ipynb
2. from ipynb.fs.full.<notebook_name> import *  (notebook name without the .ipynb extension)
* Way 2: execute the notebook with %run (needs nbformat)
1. pip install nbformat nbclient
2. %run <filename>.ipynb
* Way 3: run directly: %run <filename>.ipynb

In [None]:
!pip install ipynb

In [None]:
# import package from other .ipynb file
from ipynb.fs.full._1_allLibs import * 

In [None]:
# import other .py file to .ipynb file
import os
import sys
# Make the parent of the current working directory importable (assumed to be
# the project root that holds the local packages — TODO confirm).
_parent_dir = os.path.dirname(os.path.abspath(''))
if _parent_dir not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(_parent_dir)
sys.path

In [None]:
import pandas as pd
from util.logger.logger import logger
from app import ic
import cProfile
import pstats
ic('Hello')
logger.info('Hello')

Read a large CSV/Excel file in chunks

In [3]:
file_path = '/home/user/pandas-polar-dask/data/time_series_covid19_recovered_global_narrow.csv'
chunk_size =500

In [5]:

# read csv by using context manager
@logger.catch
def readLargerFileCSV(file_path: str = '', chunk_size: int = 500) -> pd.DataFrame:
    '''
        Read a CSV file chunk by chunk and merge the chunks into one DataFrame.

        Every chunk is kept until the final concatenation, so the whole file
        still ends up in memory.

        Arguments:
            - file_path (str): location of the CSV file, passed to pd.read_csv
            - chunk_size (int): number of rows per chunk
        Return:
            - df: all chunks concatenated, re-indexed from 0 (ignore_index=True)
    '''
    # The reader is used as a context manager so it is closed once drained.
    with pd.read_csv(file_path, chunksize=chunk_size) as chunk_reader:
        frames = list(chunk_reader)
    return pd.concat(frames, ignore_index=True)

Measure performance (execution time)

In [None]:
%timeit readLargerFileCSV(file_path)

In [13]:
cProfile.run('readLargerFileCSV(file_path)', 'csv_profile.prof')

In [None]:
p = pstats.Stats('csv_profile.prof')
p.sort_stats('cumtime').print_stats(20)

In [None]:
%timeit pd.read_csv(file_path)

In [None]:
cProfile.run('pd.read_csv(file_path)', 'read_csv.prof')

In [None]:
p = pstats.Stats('read_csv.prof')
p.sort_stats('cumtime').print_stats(20)

In [18]:
from tqdm import tqdm

@logger.catch
def readLargerFileCSV2(file_path: str = '', chunk_size: int = 500) -> pd.DataFrame:
    '''
    Load a CSV file chunk by chunk and stitch the chunks into one DataFrame.

    Args:
        file_path (str): Path to the CSV file.
        chunk_size (int): Number of rows to read per chunk.

    Returns:
        pd.DataFrame: All chunks concatenated, re-indexed from 0.
    '''
    # The chunk reader is handed straight to pd.concat, which consumes it.
    # Wrap it in tqdm(chunk_reader, desc="Reading CSV in chunks") if a
    # progress bar is wanted.
    chunk_reader = pd.read_csv(file_path, chunksize=chunk_size)
    return pd.concat(chunk_reader, ignore_index=True)


In [None]:
%timeit readLargerFileCSV2(file_path)

In [20]:
from rich.progress import Progress
import pandas as pd

@logger.catch
def readLargerFileCSV3(file_path: str = '', chunk_size: int = 500) -> pd.DataFrame:
    '''
    Read a CSV file in chunks while showing a rich progress bar.

    Args:
        file_path (str): Path to the CSV file. It must be a real path, because
            the file is opened a second time to count its lines.
        chunk_size (int): Number of rows to read per chunk.

    Returns:
        pd.DataFrame: All chunks concatenated, re-indexed from 0.
    '''
    # Count lines up front so the bar can show a percentage. Binary mode avoids
    # decoding errors and the context manager closes the handle right away.
    # This is only an estimate: quoted fields containing newlines inflate it.
    with open(file_path, 'rb') as handle:
        total_lines = sum(1 for _ in handle) - 1  # minus header
    # Ceiling division, so a row count that is an exact multiple of chunk_size
    # does not add a phantom chunk that keeps the bar below 100%.
    # max(1, ...) keeps the total positive for header-only or empty files.
    total_chunks = max(1, -(-total_lines // chunk_size))

    chunks = []
    # Both context managers are released even if parsing fails part-way.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        with Progress() as progress:
            task = progress.add_task("[cyan]Reading CSV...", total=total_chunks)
            for chunk in reader:
                chunks.append(chunk)
                progress.update(task, advance=1)

    return pd.concat(chunks, ignore_index=True)


In [25]:
def readCSVWithChunk(file_path: str = '', chunk_size: int = chunk_size):
    '''
    Lazily yield a CSV file one chunk at a time.

    Args:
        file_path (str): Path to the CSV file, passed to pd.read_csv.
        chunk_size (int): Number of rows per chunk. The default is the value
            the module-level ``chunk_size`` had when this function was defined.

    Yields:
        Each chunk produced by the pandas chunk reader, in file order.
    '''
    # The context manager closes the reader when the file is exhausted and
    # also when the consumer stops iterating early and the generator is closed.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            yield chunk


In [31]:


# Usage
def genChunk():
    '''
    Walk through every chunk of the CSV at ``file_path`` and print its shape.

    Uses the module-level ``file_path`` and the ``readCSVWithChunk`` generator
    with its default chunk size. Prints one line per chunk (numbered from 1)
    and a final confirmation line. Returns nothing.
    '''
    # A for loop handles StopIteration itself; enumerate supplies the counter.
    for i, chunk in enumerate(readCSVWithChunk(file_path), start=1):
        print(f"Chunk {i} has shape:", chunk.shape)
    # The check mark was stored as mojibake ("âœ…"); restored to the real glyph.
    print("✅ All chunks have been processed.")




In [None]:
%timeit genChunk()

In [None]:
df = pd.read_csv(file_path)

In [None]:
df.info()

In [None]:
df.memory_usage(index=False, deep=True) / df.shape[0]

In [None]:
df.memory_usage(index=False, deep=True).sum()

In [None]:
import psutil

def calc_chunksize(df, share=0.3):
    """Estimate optimal chunksize (in records) for writing large dfs with df.to_csv.

    Args:
        df: DataFrame whose average in-memory row size is measured.
        share: Fraction of the currently available memory to budget
            (0.3 means 30%).

    Returns:
        int: Number of rows of this average size that fit in the budget.
    """
    print('df shape', df.shape)
    # Approximate record size in bytes: deep memory usage (index included)
    # divided by the number of rows.
    row_size = df.memory_usage(index=True, deep=True).sum() / df.shape[0]
    # ".2f" gives two decimals; the original "2f" only set a minimum width.
    # MB keeps six decimals because a single row is a tiny fraction of a MB.
    print(f'Avg row size: {row_size:.2f} bytes ({row_size / 1024 / 1024:.6f} MB)')
    # Memory budget in bytes: the chosen share of what psutil reports as available.
    avail_mem = psutil.virtual_memory().available * share
    return int(avail_mem / row_size)

calc_chunksize(df)

In [None]:
row_size = df.memory_usage(index=True, deep=True).sum() / df.shape[0]
row_size #bytes

In [None]:
memorySize = 500 *1024 *1024 # assume that is maximum memory for df
availableChunkSize = memorySize/ row_size
availableChunkSize

In [None]:
from sys import getsizeof
getsizeof(df)/ len(df) # avg size per row by bytes

In [None]:

import pandas as pd
import concurrent.futures 
def process_chunk(chunk):
    '''
    Placeholder worker: report the size of one chunk and return a status string.

    Args:
        chunk: Any sized object (here a DataFrame chunk); only len() is used.

    Returns:
        str: The fixed marker "Chunk processed".
    '''
    record_count = len(chunk)
    # Real per-chunk processing would go here
    print(f"Processing {record_count} records")
    status = "Chunk processed"
    return status

# Chunk reader over the CSV: each iteration yields up to 10,000 rows
chunk_iter = pd.read_csv(file_path, chunksize=10000)

# Use ThreadPoolExecutor to process chunks in parallel (at most 4 worker threads)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Submit each chunk to the executor to be processed in parallel.
    # The comprehension drains the whole reader here, so every chunk has been
    # read and submitted before the result loop below starts.
    futures = [executor.submit(process_chunk, chunk) for chunk in chunk_iter]
    # as_completed yields futures as they finish, not in submission order;
    # result() re-raises any exception raised inside process_chunk.
    for future in concurrent.futures.as_completed(futures):
        print(future.result())