### Step 1: Data Ingestion

##### Reading data and compiling the datasets together

In [2]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime
import warnings
import logging
warnings.filterwarnings('ignore')

# Timestamped, INFO-level log lines for the whole notebook
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)

# Read config.json and pick out the folders used by the ingestion step
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Folder scanned for CSV inputs
input_folder_path = config['input_folder_path']
# Folder that receives the merged finaldata.csv
output_folder_path = config['output_folder_path']


#############Function for data ingestion
def merge_multiple_dataframe(input_path=None, output_path=None):
    """Merge every CSV file in a folder into one de-duplicated CSV.

    All files ending in ``.csv`` directly inside the input folder are read,
    stacked row-wise, stripped of exact duplicate rows, and written to
    ``finaldata.csv`` in the output folder (without the index column).

    Args:
        input_path: Folder to scan for CSV files. Defaults to the
            notebook-level ``input_folder_path`` read from config.json.
        output_path: Folder that receives ``finaldata.csv``. Defaults to the
            notebook-level ``output_folder_path`` read from config.json.

    Returns:
        None. The result is the file written to disk.
    """
    # Fall back to the config-derived globals so the no-argument call keeps working
    source_dir = input_folder_path if input_path is None else input_path
    target_dir = output_folder_path if output_path is None else output_path

    # os.listdir order is arbitrary; sort so the merged row order is reproducible
    csv_names = sorted(
        name for name in os.listdir(source_dir) if name.endswith('.csv')
    )

    # Read everything first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and growing a frame inside a loop is quadratic anyway
    frames = [pd.read_csv(os.path.join(source_dir, name)) for name in csv_names]

    if frames:
        merged = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep the "empty frame" result instead
        merged = pd.DataFrame()

    # Drop exact duplicate rows (first occurrence wins) and renumber the index
    merged = merged.drop_duplicates().reset_index(drop=True)

    merged.to_csv(os.path.join(target_dir, 'finaldata.csv'), index=False)
# Run the ingestion now so finaldata.csv is written to the output folder
merge_multiple_dataframe()

##### Step 2: Training, Scoring, and Deploying an ML Model

In [3]:
from flask import Flask, session, jsonify, request
import pandas as pd
import numpy as np
import pickle
import os
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import json
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)

# ---- Configuration: read config.json and derive the values used below ----
logging.info('Loading config.json')
with open('config.json','r') as config_file:
    config = json.load(config_file)

dataset_csv_path = os.path.join(config['output_folder_path'])
model_path = os.path.join(config['output_model_path'])
target = config['target']
test_data_path = config['test_data_path']

# ---- Training ----
logging.info('** Training the model ** using train_model()')

# Read the ingested dataset; df_original is kept untouched for reference
logging.info(f'Loading dataset from {dataset_csv_path}')
ingested_csv = os.path.join(dataset_csv_path, 'finaldata.csv')
df_original = pd.read_csv(ingested_csv)

# Working frame without the 'corporation' column, then features vs. target.
# Despite the log message, no hold-out split is made here: the split below is
# disabled and the model is fitted on every row of finaldata.csv.
logging.info('Splitting dataset into train and test')
df = df_original.drop(columns='corporation')
X, y = df.drop(columns=target), df[target]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit a default-parameter logistic regression (fit returns the estimator itself)
logging.info('Training model')
model = LogisticRegression().fit(X, y)

# Persisting the model to trainedmodel.pkl is currently disabled:
# logging.info(f'Writing model to {model_path}/trainedmodel.pkl')
# pickle.dump(model, open(os.path.join(model_path, 'trainedmodel.pkl'), 'wb'))

11/08/2022 08:18:27 AM Loading config.json
11/08/2022 08:18:27 AM ** Training the model ** using train_model()
11/08/2022 08:18:27 AM Loading dataset from ingesteddata
11/08/2022 08:18:27 AM Splitting dataset into train and test
11/08/2022 08:18:27 AM Training model


In [6]:
# Read the test set from the configured test-data folder
# NOTE(review): the recorded output of this cell shows a FileNotFoundError for
# a differently named file ('test_data.csv') - re-run to refresh the output.
logging.info('Loading test data')
test_csv = os.path.join(test_data_path, 'testdata.csv')
df_test = pd.read_csv(test_csv).drop(columns='corporation')

# Same preparation as for training: features on one side, target on the other
X_test, y_test = df_test.drop(columns=target), df_test[target]

# Predict labels for the test rows with the model fitted earlier
logging.info('Predicting on test data')
y_pred = model.predict(X_test)

# F1 of the predictions against the true labels (kept in `f1`, not displayed)
logging.info('Calculating f1 score')
f1 = metrics.f1_score(y_true=y_test, y_pred=y_pred)

11/08/2022 08:20:05 AM Loading test data


FileNotFoundError: [Errno 2] No such file or directory: 'testdata/test_data.csv'

In [13]:
# Pair each column label with that column's mean; the bare list is the cell's output
list(zip(df.columns, (df[name].mean() for name in df.columns)))


[('a', 2.0), ('b', 5.0)]