In [1]:
import io, yaml, logging, sys

from logging.handlers import RotatingFileHandler

import pandas as pd
from contextlib import redirect_stderr
from decimal import Decimal

In [3]:
def get_logger():
    """Return the shared 'crisp_app_logger' logger, configuring it on first use.

    On the first call the logger is set to INFO and given two handlers that
    share one pipe-delimited format:

    * a RotatingFileHandler writing to 'crisp_app.log' in the current working
      directory (rotates at 10 MiB, keeps 5 backups);
    * a StreamHandler writing to sys.stdout.

    Later calls return the same logger without attaching further handlers.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger('crisp_app_logger')

    logger.setLevel(logging.INFO)

    # logging.getLogger hands back the same object for the same name, so any
    # handlers attached by an earlier call are still present. Attaching them
    # again (e.g. when a notebook cell is re-run) would emit every record
    # once per extra handler and keep additional file handles open.
    if logger.handlers:
        return logger

    log_formatter = logging.Formatter("%(asctime)s|%(name)s|%(levelname)s|%(message)s")

    # 10 MiB = 10 * 1024 * 1024 = 10,485,760 bytes
    rotating_file_handler = RotatingFileHandler('crisp_app.log', maxBytes=10*1024*1024, backupCount=5)
    rotating_file_handler.setFormatter(log_formatter)
    logger.addHandler(rotating_file_handler)

    console_handler = logging.StreamHandler(sys.stdout)

    # TODO: set logging level to INFO
    # The logger itself is set to INFO above, so DEBUG records are dropped
    # before they ever reach this handler.
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(log_formatter)
    logger.addHandler(console_handler)

    return logger

In [4]:
def create_new_col(raw_df, key, value):
    """Add column `key` to `raw_df`, derived from `value`, and return the frame.

    The frame is modified in place; the same object is returned.

    Args:
        raw_df: DataFrame to add the column to.
        key: name of the column to create (or overwrite).
        value: a list of strings, or a single string (treated as a
            one-element list).

    Behaviour:
        * If every entry of `value` is an existing column of `raw_df`, the new
          column is those columns cast to str and joined row-wise with '-'.
        * Otherwise the entries are concatenated (no separator) into one
          constant string that fills the whole column.

    Returns:
        The same DataFrame, with column `key` set.
    """
    # A bare string would otherwise select a Series below, and Series.apply
    # forwards axis=1 to str.join, which raises TypeError.
    parts = [value] if isinstance(value, str) else list(value)

    if not pd.Series(parts).isin(raw_df.columns).all():
        # At least one entry is not a column name: treat the entries as literal text.
        raw_df[key] = ''.join(parts)

    else:
        raw_df[key] = raw_df[parts].astype(str).apply('-'.join, axis=1)

    return raw_df

In [5]:
# Obtain the notebook-wide logger from get_logger() and emit one INFO record
# as a smoke test of the configured handlers.
logger = get_logger()

logger.info('initial log message')

2024-03-02 05:04:20,464|crisp_app_logger|INFO|initial log message


In [9]:
# Load the transformation settings (renames, derived columns, dtypes, column
# selection) from the YAML config file into a plain dict.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable project directory.
CONFIG_PATH = "/Users/peterphyall/Documents/profdev/crisp-take-home/crisp_config.yml"

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as config_file:
    config_dict = yaml.safe_load(config_file)

config_dict

{'renamed_cols': {'Order Number': 'OrderID',
  'Product Number': 'ProductId',
  'Product Name': 'ProductName',
  'Count': 'Quantity'},
 'new_cols': {'Unit': ['kg'], 'OrderDate': ['Year', 'Month', 'Day']},
 'dtype_cols': {'int': ['OrderID'],
  'str': ['ProductId', 'ProductName', 'Unit'],
  'decimal': ['Quantity'],
  'datetime': ['OrderDate']},
 'str_dtype_cols_manipulation': {'proper_case': ['ProductName']},
 'select_cols': ['OrderID',
  'OrderDate',
  'ProductId',
  'ProductName',
  'Quantity',
  'Unit']}

In [10]:
from contextlib import redirect_stderr, closing
import io

# Read the input CSV while capturing whatever is written to stderr during the
# parse. With on_bad_lines='warn' pandas reports malformed rows instead of
# raising; this cell relies on those reports arriving on stderr so they can be
# forwarded to the application log.
# (A fixture containing bad lines, bad_lines_dummy_file_crisp.csv, can be
# substituted for the path below to exercise the warning branch.)
input_csv_path = '/Users/peterphyall/Documents/profdev/junk-drawer/create_dummy_csv_file/34_kb_dummy_file_crisp.csv'

f = io.StringIO()

with redirect_stderr(f):
    raw_df = pd.read_csv(input_csv_path, on_bad_lines='warn')

# Read the captured text once; only log when something was actually reported.
captured_stderr = f.getvalue()

if captured_stderr:
    logger.warning(f"Reading in line - bad line(s): \n{captured_stderr}")

In [11]:
# Display the frame as loaded from the CSV (the notebook renders a truncated preview).
raw_df

Unnamed: 0,Order Number,Year,Month,Day,Product Number,Product Name,Count,Extra Col1,ExtraCol2,Unnamed: 9
0,20917578,2019,10,18,P-10002,Arugola,500.00,Lorem,,
1,29289481,2016,1,15,P-10002,Arugola,5250.50,,,
2,83721963,2015,9,27,P-10002,Arugola,5250.50,,,
3,42200703,2019,6,9,P-10001,Arugola,5250.50,Lorem,Ipsum,
4,11395301,2017,7,12,P-10002,Iceberg lettuce,5250.50,,Ipsum,
...,...,...,...,...,...,...,...,...,...,...
595,46710632,2016,10,18,P-10001,Arugola,5250.50,Lorem,,
596,31783293,2017,9,20,P-10002,Arugola,5250.50,Lorem,Ipsum,
597,54492626,2018,3,27,P-10002,Iceberg lettuce,5250.50,Lorem,Ipsum,
598,89125914,2016,4,25,P-10002,Arugola,500.00,,Ipsum,


In [12]:
# Per-column summary of the raw frame: non-null counts and inferred dtypes.
raw_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order Number    600 non-null    int64  
 1   Year            600 non-null    int64  
 2   Month           600 non-null    int64  
 3   Day             600 non-null    int64  
 4   Product Number  600 non-null    object 
 5   Product Name    600 non-null    object 
 6   Count           600 non-null    object 
 7   Extra Col1      313 non-null    object 
 8   ExtraCol2       303 non-null    object 
 9   Unnamed: 9      0 non-null      float64
dtypes: float64(1), int64(4), object(5)
memory usage: 47.0+ KB


In [13]:
# Config-driven transformation of raw_df into transformed_df.
# NOTE: raw_df is reassigned/modified by this cell; later cells read the
# transformed columns from raw_df as well as from transformed_df.

# transformation, step 1: create new target cols
for new_col, source_values in config_dict['new_cols'].items():
    raw_df = create_new_col(raw_df, new_col, source_values)

# transformation, step 2: rename target cols
raw_df = raw_df.rename(columns=config_dict['renamed_cols'])

# transformation, step 3: convert target cols' dtypes
# Keys are matched by substring ('int', 'str', 'datetime', 'decimal'); each
# maps to a list of column names, so raw_df[cols] is a DataFrame.
for dtype_name, cols in config_dict['dtype_cols'].items():
    if 'int' in dtype_name or 'str' in dtype_name:
        raw_df[cols] = raw_df[cols].astype(dtype_name)

    elif 'datetime' in dtype_name:
        raw_df[cols] = raw_df[cols].apply(pd.to_datetime)

    elif 'decimal' in dtype_name:
        # Strip thousands separators, then build exact Decimal values from the text.
        raw_df[cols] = raw_df[cols].astype(str).apply(
            lambda col: col.str.replace(',', '', regex=False).map(Decimal)
        )

    else:
        # Surface config typos instead of silently leaving the columns untouched.
        logger.warning(f"Unsupported dtype key '{dtype_name}' in config; columns left unchanged: {cols}")

# transformation, step 4: manipulate str dtype target cols
for manipulation, cols in config_dict['str_dtype_cols_manipulation'].items():
    if 'proper_case' in manipulation:
        raw_df[cols] = raw_df[cols].apply(lambda col: col.str.title())

    else:
        logger.warning(f"Unsupported string manipulation '{manipulation}' in config; columns left unchanged: {cols}")

# transformation, step 5: select target cols
# .copy() makes the result an independent frame, so later writes to
# transformed_df do not trigger SettingWithCopyWarning.
transformed_df = raw_df[config_dict['select_cols']].copy()

In [14]:
# Display the transformed frame (selected columns only; the notebook renders a truncated preview).
transformed_df

Unnamed: 0,OrderID,OrderDate,ProductId,ProductName,Quantity,Unit
0,20917578,2019-10-18,P-10002,Arugola,500.00,kg
1,29289481,2016-01-15,P-10002,Arugola,5250.50,kg
2,83721963,2015-09-27,P-10002,Arugola,5250.50,kg
3,42200703,2019-06-09,P-10001,Arugola,5250.50,kg
4,11395301,2017-07-12,P-10002,Iceberg Lettuce,5250.50,kg
...,...,...,...,...,...,...
595,46710632,2016-10-18,P-10001,Arugola,5250.50,kg
596,31783293,2017-09-20,P-10002,Arugola,5250.50,kg
597,54492626,2018-03-27,P-10002,Iceberg Lettuce,5250.50,kg
598,89125914,2016-04-25,P-10002,Arugola,500.00,kg


In [15]:
# Per-column summary of the transformed frame, to check the dtypes after conversion.
transformed_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   OrderID      600 non-null    int64         
 1   OrderDate    600 non-null    datetime64[ns]
 2   ProductId    600 non-null    object        
 3   ProductName  600 non-null    object        
 4   Quantity     600 non-null    object        
 5   Unit         600 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 28.3+ KB


In [16]:
# Spot-check the Python type of the first 'Quantity' value. The column only
# exists on raw_df after the rename in the transformation cell, and .info()
# reports it as dtype 'object', so the element type is inspected directly.
type(raw_df['Quantity'].iloc[0])

decimal.Decimal

In [30]:
import requests

# Upload the input CSV and the YAML config to the locally running app as a
# multipart form. The files are opened in a with-block so both handles are
# closed once the request has been sent, and a timeout stops the cell from
# hanging forever if the server is not responding.
# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
upload_data_path = '/Users/peterphyall/Documents/profdev/crisp-take-home/bad_lines_dummy_file_crisp.csv'
upload_config_path = '/Users/peterphyall/Documents/profdev/crisp-take-home/crisp_config.yml'

with open(upload_data_path, 'rb') as data_file, open(upload_config_path, 'rb') as config_file:
    files = {'input_data_file': data_file,
             'crisp_config_yaml_file': config_file}
    response = requests.post('http://127.0.0.1:5000/file/upload', files=files, timeout=30)

print(response.text)

<html> 
    <head> 
       <title>File Upload Success - Crisp App</title> 
    </head> 
    <body>
      <h2>Input File Upload Success</h2>
       <p>Input data file name: bad_lines_dummy_file_crisp.csv</p> 
       <p>Input config file name: crisp_config.yml</p> 
    </body> 
 </html>


In [34]:
# Ask the locally running app to transform the previously uploaded file and
# print the HTML it returns. The timeout keeps the cell from blocking
# indefinitely if the server does not answer.
response = requests.post('http://127.0.0.1:5000/data/transform', timeout=30)

print(response.text)

<!DOCTYPE html>
<html lang="en"></html>
<head>
    <meta charset="UTF-8">
    <title>Transformed Output - Crisp App</title>
</head>
<body>
    <p>Input data file dimensions (rows, columns): (8, 12)</p> 
    <p>Transformed data dimensions (rows, columns): (8, 6)</p>
    <h2>First 20 results from transformed data</h2>
    
    <table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>OrderID</th>
      <th>OrderDate</th>
      <th>ProductId</th>
      <th>ProductName</th>
      <th>Quantity</th>
      <th>Unit</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1000</td>
      <td>2018-01-01</td>
      <td>P-10001</td>
      <td>Arugola</td>
      <td>5250.50</td>
      <td>kg</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1001</td>
      <td>2017-12-12</td>
      <td>P-10002</td>
      <td>Iceberg Lettuce</td>
      <td>500.00</td>
      <td>kg</td>
    </tr>
    <tr>
      <th>2</th>
      <td>1002</td>
      <t