In [20]:
import os
import sys
import json
import pickle
import timeit
import logging
import subprocess
import numpy as np
import pandas as pd
import re
import yaml

In [21]:
# Route log records to stdout at INFO level so the logging.info() messages
# emitted by the functions below are shown together with the printed results.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [22]:
# Load the JSON configuration; the path is relative to the current working
# directory, so the notebook must be started from the directory that
# contains `src/`.
with open('src/config.json','r') as f:
    config = json.load(f) 

# Module-level paths read by the functions defined in later cells.
# NOTE(review): os.path.join() with a single argument returns that argument
# unchanged, so these are simply the configured strings.
prod_deployment_path = os.path.join(config['prod_deployment_path']) 
data_path=os.path.join(config['output_folder_path']) 
test_data_path=os.path.join(config['test_data_path']) 

In [23]:
def model_predictions(X_df):
    """Predict with the deployed model.

    Loads ``trainedmodel.pkl`` from ``prod_deployment_path`` and calls its
    ``predict`` method on ``X_df``.

    Args:
        X_df: Feature data passed unchanged to ``model.predict``.

    Returns:
        Whatever ``model.predict(X_df)`` returns.

    Raises:
        FileNotFoundError: If ``trainedmodel.pkl`` does not exist in
            ``prod_deployment_path``.
    """
    logging.info("Loading deployed model")
    model_file = os.path.join(prod_deployment_path, 'trainedmodel.pkl')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    # SECURITY: unpickling executes arbitrary code embedded in the file;
    # only load model artifacts produced by this project's own pipeline.
    with open(model_file, 'rb') as model_fh:
        model = pickle.load(model_fh)
    logging.info("Running predictions on data")
    y_pred = model.predict(X_df)
    return y_pred

In [24]:
def dataframe_summary():
    """Compute per-column summary statistics for ``finaldata.csv``.

    Reads ``finaldata.csv`` from ``data_path``, removes the ``exited``
    column, keeps only numeric columns and computes mean, median and
    standard deviation for each of them.

    Returns:
        dict: ``{column: {'mean': ..., 'median': ..., 'std': ...}}`` in the
        column order of the numeric frame.
    """
    logging.info("Loading and preparing finaldata.csv")
    csv_file = os.path.join(data_path, 'finaldata.csv')
    numeric_df = (
        pd.read_csv(csv_file)
        .drop(columns=['exited'])
        .select_dtypes('number')
    )
    logging.info("Calculating statistics for data")
    return {
        name: {
            'mean': numeric_df[name].mean(),
            'median': numeric_df[name].median(),
            'std': numeric_df[name].std(),
        }
        for name in numeric_df.columns
    }

In [25]:
def missing_percentage():
    """Report the percentage of missing values per column of ``finaldata.csv``.

    Reads ``finaldata.csv`` from ``data_path`` and, for every column,
    divides the count of NA cells by the number of rows and scales to
    percent.

    Returns:
        dict: ``{column: {'percentage': value}}`` for every column in the file.
    """
    logging.info("Loading and preparing finaldata.csv")
    frame = pd.read_csv(os.path.join(data_path, 'finaldata.csv'))
    logging.info("Calculating missing data percentage")
    row_count = frame.shape[0]
    na_share = frame.isna().sum() / row_count * 100
    report = {}
    for column, share in na_share.items():
        report[column] = {'percentage': share}
    return report

In [26]:
def _ingestion_timing():
    """Time a single run of ``ingestion.py`` in a child Python process.

    The script path is relative to the current working directory. The
    child's stdout/stderr are captured rather than shown.

    Returns:
        float: Elapsed wall-clock seconds for the child process, measured
        with ``timeit.default_timer``.
    """
    # Prefer the interpreter running this code so the script sees the same
    # environment/packages; fall back to PATH lookup if it is unknown.
    interpreter = sys.executable or 'python'
    starttime = timeit.default_timer()
    completed = subprocess.run([interpreter, 'ingestion.py'], capture_output=True)
    timing = timeit.default_timer() - starttime
    if completed.returncode != 0:
        # A failed run (e.g. script not found) exits almost immediately and
        # would otherwise be reported as a suspiciously fast timing.
        logging.warning(
            "ingestion.py exited with code %s; timing is not meaningful: %s",
            completed.returncode,
            completed.stderr.decode(errors='replace').strip(),
        )
    return timing

In [27]:
def _training_timing():
    """Time a single run of ``training.py`` in a child Python process.

    The script path is relative to the current working directory. The
    child's stdout/stderr are captured rather than shown.

    Returns:
        float: Elapsed wall-clock seconds for the child process, measured
        with ``timeit.default_timer``.
    """
    # Prefer the interpreter running this code so the script sees the same
    # environment/packages; fall back to PATH lookup if it is unknown.
    interpreter = sys.executable or 'python'
    starttime = timeit.default_timer()
    completed = subprocess.run([interpreter, 'training.py'], capture_output=True)
    timing = timeit.default_timer() - starttime
    if completed.returncode != 0:
        # A failed run (e.g. script not found) exits almost immediately and
        # would otherwise be reported as a suspiciously fast timing.
        logging.warning(
            "training.py exited with code %s; timing is not meaningful: %s",
            completed.returncode,
            completed.stderr.decode(errors='replace').strip(),
        )
    return timing


In [28]:
def execution_time(n_runs=20):
    """Measure the mean run time of the ingestion and training scripts.

    Each script is executed ``n_runs`` times via ``_ingestion_timing`` and
    ``_training_timing`` and the arithmetic mean of the timings is reported.

    Args:
        n_runs: Number of repetitions per script. Defaults to 20, the
            count that was previously hard-coded.

    Returns:
        list[dict]: ``[{'ingest_time_mean': float}, {'train_time_mean': float}]``
        with means in seconds.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (a mean over zero runs
            is undefined).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    logging.info("Calculating time for ingestion.py")
    ingestion_time = [_ingestion_timing() for _ in range(n_runs)]

    logging.info("Calculating time for training.py")
    training_time = [_training_timing() for _ in range(n_runs)]

    # Convert to built-in floats so the result is plain JSON-friendly data.
    ret_list = [
        {'ingest_time_mean': float(np.mean(ingestion_time))},
        {'train_time_mean': float(np.mean(training_time))}
    ]

    return ret_list

In [29]:
def outdated_packages_list(request_file='request.txt'):
    """List outdated installed packages that appear in a requirements file.

    Runs ``pip list --outdated --format=json`` with the current interpreter
    and keeps the entries whose name is listed in ``request_file``. Names
    are compared after normalization (case-insensitive; runs of ``-``,
    ``_`` and ``.`` are treated as equal), and the name is taken from the
    start of each requirement line, so specifiers such as ``>=``, extras
    and environment markers do not prevent a match.

    Args:
        request_file: Path to a requirements-style text file, one
            requirement per line.

    Returns:
        pandas.DataFrame with columns ``package``, ``current_version`` and
        ``latest_version``. The frame is filtered to the requested packages,
        or unfiltered if ``request_file`` does not exist. If pip fails or
        its output cannot be parsed, an empty list ``[]`` is returned
        instead (kept for backward compatibility).
    """
    logging.info("Checking outdated dependencies")
    columns = ['package', 'current_version', 'latest_version']

    def _canonical(name):
        # Normalize a distribution name so 'Pygments', 'pygments' and
        # 'typing_extensions' / 'typing-extensions' compare equal.
        return re.sub(r'[-_.]+', '-', name).lower()

    # Step 1: ask pip (of the running interpreter, not whatever `pip` is on
    # PATH) for outdated packages in a machine-readable format.
    result = subprocess.run(
        [sys.executable or 'python', '-m', 'pip', 'list', '--outdated', '--format=json'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding='utf-8'
    )

    # Check if the subprocess ran successfully
    if result.returncode != 0:
        logging.error(f"Error running pip list: {result.stderr}")
        return []

    # Step 2: parse the JSON payload instead of slicing a text table.
    try:
        entries = json.loads(result.stdout.strip() or '[]')
    except json.JSONDecodeError as err:
        logging.error(f"Error running pip list: unparsable output ({err})")
        return []

    # Passing `columns` guarantees the expected schema even when nothing is
    # outdated; otherwise the 'package' lookup below raises KeyError.
    outdated_deps_df = pd.DataFrame(
        [
            {
                'package': entry['name'],
                'current_version': entry['version'],
                'latest_version': entry['latest_version'],
            }
            for entry in entries
        ],
        columns=columns,
    )

    # Step 3: collect the package names listed in the requirements file.
    try:
        with open(request_file, 'r') as f:
            requested_packages = set()
            for line in f:
                # A requirement starts with the project name; comment lines
                # (#...) and option lines (-r, -e, --index-url) do not match.
                name_match = re.match(r'[A-Za-z0-9][A-Za-z0-9._-]*', line.strip())
                if name_match:
                    requested_packages.add(_canonical(name_match.group(0)))
    except FileNotFoundError:
        logging.error(f"File {request_file} not found.")
        return outdated_deps_df

    # Step 4: keep only outdated packages that are requested; the original
    # row index (position in pip's output) is preserved.
    is_requested = outdated_deps_df['package'].map(_canonical).isin(requested_packages)
    requested_outdated_deps = outdated_deps_df[is_requested]

    return requested_outdated_deps

In [30]:
# Check the packages listed in requirements.txt (overriding the function's
# default file name) and display the resulting table as the cell output.
outdated_packages_list(request_file='requirements.txt')

INFO:root:Checking outdated dependencies


Unnamed: 0,package,current_version,latest_version
0,anyio,4.6.2,4.8.0
2,debugpy,1.8.11,1.8.12
3,jupyterlab,4.1.3,4.3.4
4,mistune,3.0.2,3.1.0
5,nbconvert,7.16.4,7.16.5
6,notebook,7.1.3,7.3.2
7,numpy,2.0.1,2.2.2
9,pydantic,2.10.4,2.10.5
10,Pygments,2.18.0,2.19.1
13,pytest,6.2.5,8.3.4


In [31]:
if __name__ == '__main__':
    # Run every diagnostic in turn and print its result.

    logging.info("Loading and preparing testdata.csv")
    test_df = pd.read_csv(os.path.join(test_data_path, 'testdata.csv'))
    # Features only: drop the 'corporation' and 'exited' columns.
    X_df = test_df.drop(columns=['corporation', 'exited'])

    predictions = model_predictions(X_df)
    print("Model predictions on testdata.csv:", predictions, end='\n\n')

    # Reports whose results are JSON-serialised share one print routine.
    for title, report_fn in (
        ("Summary statistics", dataframe_summary),
        ("Missing percentage", missing_percentage),
        ("Execution time", execution_time),
    ):
        print(title)
        print(json.dumps(report_fn(), indent=4), end='\n\n')

    print("Outdated Packages")
    print(outdated_packages_list(request_file='requirements.txt'))

INFO:root:Loading and preparing testdata.csv
INFO:root:Loading deployed model
INFO:root:Running predictions on data
Model predictions on testdata.csv: [0 1 1 1 1]

Summary statistics
INFO:root:Loading and preparing finaldata.csv
INFO:root:Calculating statistics for data
{
    "lastmonth_activity": {
        "mean": 165.65384615384616,
        "median": 73.0,
        "std": 284.0332293669447
    },
    "lastyear_activity": {
        "mean": 1502.923076923077,
        "median": 955.0,
        "std": 2192.6449584568304
    },
    "number_of_employees": {
        "mean": 26.884615384615383,
        "median": 14.0,
        "std": 31.353885785435814
    }
}

Missing percentage
INFO:root:Loading and preparing finaldata.csv
INFO:root:Calculating missing data percentage
{
    "corporation": {
        "percentage": 0.0
    },
    "lastmonth_activity": {
        "percentage": 0.0
    },
    "lastyear_activity": {
        "percentage": 0.0
    },
    "number_of_employees": {
        "percentage": 

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


INFO:root:Calculating time for training.py
[
    {
        "ingest_time_mean": 0.028965954098384827
    },
    {
        "train_time_mean": 0.027144949964713304
    }
]

Outdated Packages
INFO:root:Checking outdated dependencies
            package current_version latest_version
0             anyio           4.6.2          4.8.0
2           debugpy          1.8.11         1.8.12
3        jupyterlab           4.1.3          4.3.4
4           mistune           3.0.2          3.1.0
5         nbconvert          7.16.4         7.16.5
6          notebook           7.1.3          7.3.2
7             numpy           2.0.1          2.2.2
9          pydantic          2.10.4         2.10.5
10         Pygments          2.18.0         2.19.1
13           pytest           6.2.5          8.3.4
14             pytz          2024.1         2024.2
17      referencing          0.35.1         0.36.1
18            scipy          1.15.0         1.15.1
21        starlette          0.41.3         0.45.2
22    