In [7]:

!pip install pandas

Collecting pandas
  Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ---------------------------------------- 60.9/60.9 kB ? eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---- ----------------------------------- 1.2/11.0 MB 26.7 MB/s eta 0:00:01
   --------------- ------------------------ 4.4/11.0 MB 55.8 MB/s eta 0:00:01
   --------------------------- ------------ 7.4/11.0 MB 53.0 MB/s eta 0:00:01
   ---------------------------------- ----- 9.6/11.0 MB 55.9 MB/s eta 0:00:01
   ----


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [79]:

import pandas as pd

def load_and_clean_csv(path, primary_key):
    """
    Loads a CSV file, cleans column names, and checks for duplicate primary keys.

    Parameters:
        path (str): Path to the CSV file.
        primary_key (str): Column name to check for duplicates.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    # Load the CSV
    df = pd.read_csv(path, header=0, skiprows=0)

    # Clean column names: strip, lowercase, replace spaces with underscores
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    df = df[df[primary_key].notna()]

    # Check for duplicates in the primary key
    if df[primary_key].duplicated().any():
        print(f"Warning: Duplicate values found in primary key column '{primary_key}'")
        print(df[df[primary_key].duplicated(keep=False)])

    return df


books = load_and_clean_csv('../Data/03_Library Systembook.csv', primary_key='id')
customers = load_and_clean_csv('../Data/03_Library SystemCustomers.csv', primary_key='customer_id')
print(books.head())



    id                                     books book_checkout book_returned  \
0  1.0                       Catcher in the Rye   "20/02/2023"    25/02/2023   
1  2.0          Lord of the rings the two towers  "24/03/2023"    21/03/2023   
2  3.0  Lord of the rings the return of the kind  "29/03/2023"    25/03/2023   
3  4.0                                The hobbit  "02/04/2023"    25/03/2023   
4  5.0                                     Dune   "02/04/2023"    25/03/2023   

  days_allowed_to_borrow  customer_id  
0                2 weeks          1.0  
1                2 weeks          2.0  
2                2 weeks          3.0  
3                2 weeks          4.0  
4                2 weeks          5.0  


In [88]:

import pandas as pd

#function to load and clean data
def load_and_clean_csv(path, primary_key):
    """
    Loads a CSV file, cleans column names, and checks for duplicate primary keys and removed null primary keys.

    Parameters:
        path (str): Path to the CSV file.
        primary_key (str): Column name to check for duplicates.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    # Load the CSV
    df = pd.read_csv(path, header=0, skiprows=0)

    # Clean column names: strip, lowercase, replace spaces with underscores
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    df = df[df[primary_key].notna()]

    # Check for duplicates in the primary key
    if df[primary_key].duplicated().any():
        print(f"Warning: Duplicate values found in primary key column '{primary_key}'")
        print(df[df[primary_key].duplicated(keep=False)])

    return df


books = load_and_clean_csv('../Data/03_Library Systembook.csv', primary_key='id')
customers = load_and_clean_csv('../Data/03_Library SystemCustomers.csv', primary_key='customer_id')


# Convert 'maximum_days_to_borrow' to number of days
def parse_borrow_period(period):
    if pd.isna(period):
        return None
    period = str(period).lower().strip()
    if 'week' in period:
        num = int(period.split()[0])
        return num * 7
    elif 'day' in period:
        num = int(period.split()[0])
        return num
    return None


books_ideal = (
    books
        .assign(
            book_pk=lambda d: d['id'].astype('Int64'),
            book_name=lambda d: d['books'],
            book_checkout_date=lambda d: pd.to_datetime(
                d['book_checkout'].str.replace('"', '').str.strip(), 
                format='%d/%m/%Y', 
                errors='coerce'
            ),
            book_returned_date=lambda d: pd.to_datetime(d['book_returned'], format='%d/%m/%Y', errors='coerce'),
            maximum_days_to_borrow=lambda d: d['days_allowed_to_borrow'], 
            book_customer_fk=lambda d: d['customer_id'].astype('Int64'),
            
        )
        [['book_pk', 'book_name', 'book_checkout_date','book_returned_date','maximum_days_to_borrow','book_customer_fk']]
)


# Apply the parsing function
books_ideal['max_borrow_days'] = books_ideal['maximum_days_to_borrow'].apply(parse_borrow_period)

# Calculate the actual borrowing duration
books_ideal['borrow_duration'] = (books_ideal['book_returned_date'] - books_ideal['book_checkout_date']).dt.days

# Check if the Return Date is before the Checkout Date
books_ideal['returned_before_checkout'] = books_ideal['borrow_duration'] < 0

# Check if the book was returned late
books_ideal['returned_overdue'] = books_ideal['borrow_duration'] > books_ideal['max_borrow_days']






customers_ideal = (
    customers
        .assign(
            customer_pk=lambda d: d['customer_id'].astype('Int64'),
                    )
        [['customer_pk','customer_name']]
)

print(customers_ideal.head())

books_with_customers = (
    books_ideal
        .merge(customers_ideal, left_on='book_customer_fk', right_on='customer_pk', how='left')
        .drop(columns=['book_customer_fk']) 
)


# Group by Customer_pk and count the number of books borrowed

books_per_customer = (
    books_with_customers
        .groupby(['customer_pk', 'customer_name'])
        .agg(
            total_books_borrowed=('book_pk', 'count'),
            first_checkout_date=('book_checkout_date', 'min'),
            last_return_date=('book_returned_date', 'max')
        )
        .reset_index()
)


print(books_ideal.head())


   customer_pk   customer_name
0            1        Jane Doe
1            2      John Smith
2            3      Dan Reeves
4            5  William Holden
5            6   Jaztyn Forest
   book_pk                                 book_name book_checkout_date  \
0        1                       Catcher in the Rye          2023-02-20   
1        2          Lord of the rings the two towers         2023-03-24   
2        3  Lord of the rings the return of the kind         2023-03-29   
3        4                                The hobbit         2023-04-02   
4        5                                     Dune          2023-04-02   

  book_returned_date maximum_days_to_borrow  book_customer_fk  \
0         2023-02-25                2 weeks                 1   
1         2023-03-21                2 weeks                 2   
2         2023-03-25                2 weeks                 3   
3         2023-03-25                2 weeks                 4   
4         2023-03-25                2 

In [56]:

!pip install sqlalchemy
!pip install sqlite3  


Collecting sqlalchemy
  Downloading sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Downloading greenlet-3.2.3-cp312-cp312-win_amd64.whl.metadata (4.2 kB)
Collecting typing-extensions>=4.6.0 (from sqlalchemy)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Downloading sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ------------------------------------ --- 1.9/2.1 MB 40.3 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 33.4 MB/s eta 0:00:00
Downloading greenlet-3.2.3-cp312-cp312-win_amd64.whl (297 kB)
   ---------------------------------------- 0.0/297.8 kB ? eta -:--:--
   --------------------------------------- 297.8/297.8 kB 19.2 MB/s eta 0:00:00
Downloading typing_extensions-4.14.1-py3-none-any.whl (43 kB)
   ---------------------------------------- 0.0/43.9 kB ? eta -:--:--
   ------------------------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Could not find a version that satisfies the requirement sqlite3 (from versions: none)
ERROR: No matching distribution found for sqlite3

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
from sqlalchemy import create_engine

# Create a SQLite database in the current directory
engine = create_engine('sqlite:///library.db')


books_with_customers.to_sql('books_with_customers', con=engine, if_exists='replace', index=False)
books_per_customer.to_sql('books_per_customer', con=engine, if_exists='replace', index=False)


8

In [60]:

import pandas as pd
from sqlalchemy import create_engine

# Connect to the SQLite database created earlier
engine = create_engine('sqlite:///library.db')

# Example: Read the entire books_per_customer table
df = pd.read_sql('SELECT * FROM books_per_customer', con=engine)

# Example: Filter customers with more than 3 books
df_filtered = pd.read_sql('''
    SELECT * FROM books_per_customer
    WHERE total_books_borrowed > 2
''', con=engine)

print(df_filtered.head())


   customer_pk customer_name  total_books_borrowed  \
0            1      Jane Doe                     3   
1            2    John Smith                     5   
2            3    Dan Reeves                     3   

          first_checkout_date            last_return_date  
0  2023-02-20 00:00:00.000000  2023-05-01 00:00:00.000000  
1  2023-03-24 00:00:00.000000  2023-06-20 00:00:00.000000  
2  2023-03-29 00:00:00.000000  2023-06-04 00:00:00.000000  


In [89]:
from app_copy import parse_borrow_period


In [90]:
parse_borrow_period()

TypeError: parse_borrow_period() missing 1 required positional argument: 'period'