In [86]:
import pandas as pd
import numpy as np
import random

import seaborn as sns
from matplotlib import pyplot as pyplot
%matplotlib inline

import pickle
from sklearn.feature_extraction import DictVectorizer

In [None]:
# I/O locations for the batch-scoring run, relative to the working directory.
input_file = "./tracked/test.csv"   # CSV holding the rows to score
output_file = "output/test.csv"     # CSV intended to receive the predictions

In [87]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Location of the CSV; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file, read with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

In [88]:
def preprocessor(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise a raw customer DataFrame before feature extraction.

    Steps, in order:

    1. Column names are lower-cased and spaces are replaced by underscores.
       This runs first so the column lookups below succeed whatever casing
       the CSV header uses.
    2. ``totalcharges`` is parsed as numeric; values that cannot be parsed
       become 0.
    3. ``seniorcitizen`` values 0/1 are mapped to ``'no'``/``'yes'``.
    4. The values of every text column are lower-cased and their spaces are
       replaced by underscores.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after normalisation.

    Raises
    ------
    KeyError
        If the frame has no ``totalcharges`` or ``seniorcitizen`` column once
        the column names have been normalised.
    """
    # Normalise the header before touching any column by name.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

    # Unparseable entries (e.g. blank strings) are coerced to NaN, then set to 0.
    df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)
    df['seniorcitizen'] = df['seniorcitizen'].replace({0: 'no', 1: 'yes'})

    # Select text columns by dtype; this covers both object-dtype columns and
    # pandas' dedicated string dtypes.
    string_columns = [
        col for col in df.columns
        if pd.api.types.is_string_dtype(df[col].dtype)
    ]
    for col in string_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

    return df

In [89]:
# Restore the trained model object from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load model files
# that come from a trusted source.
with open('../notebooks/tracked/model.pkl', "rb") as f_in:
    model = pickle.load(f_in)

In [82]:
def prepareDictionaries(df: pd.DataFrame):
    """Turn the feature columns of ``df`` into one dictionary per row.

    Only the categorical and numerical feature columns listed below are kept;
    any other column in ``df`` is left out of the result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains every feature column named below.

    Returns
    -------
    list of dict
        One ``{column: value}`` mapping per row, in row order.

    Raises
    ------
    KeyError
        If any of the listed feature columns is missing from ``df``.
    """
    categorical_features = [
        'gender',
        'seniorcitizen',
        'partner',
        'dependents',
        'phoneservice',
        'multiplelines',
        'internetservice',
        'onlinesecurity',
        'onlinebackup',
        'deviceprotection',
        'techsupport',
        'streamingtv',
        'streamingmovies',
        'contract',
        'paperlessbilling',
        'paymentmethod',
    ]
    numerical_features = ['tenure', 'monthlycharges', 'totalcharges']

    feature_frame = df[[*categorical_features, *numerical_features]]
    return feature_frame.to_dict(orient='records')

In [83]:
def dictionaryVectorizer(dictDF: list, dv=None):
    """Convert row dictionaries into the numeric matrix fed to the model.

    Parameters
    ----------
    dictDF : list of dict
        One feature dictionary per row (the output of ``prepareDictionaries``).
    dv : fitted vectoriser, optional
        An already-fitted object exposing ``transform`` (e.g. the
        ``DictVectorizer`` fitted at training time). When given it is used
        as-is and is NOT re-fitted, so the output columns match the layout
        the model was trained on.

    Returns
    -------
    The matrix returned by the vectoriser's ``transform``.

    Notes
    -----
    When ``dv`` is omitted, a fresh ``DictVectorizer(sparse=False)`` is fitted
    on ``dictDF`` itself (the original behaviour). The resulting columns then
    depend on the categories present in this particular batch and are not
    guaranteed to line up with the training features — prefer passing the
    training-time vectoriser.
    """
    if dv is not None:
        # Reuse the training-time feature mapping; never fit on scoring data.
        return dv.transform(dictDF)

    # Backward-compatible fallback: fit on the data being transformed.
    fallback_dv = DictVectorizer(sparse=False)
    return fallback_dv.fit_transform(dictDF)

In [90]:
# Load the rows to score from the configured input path. read_dataframe
# requires the filename argument; calling it with none raises TypeError.
df = read_dataframe(input_file)
df = preprocessor(df)

# One feature dictionary per row, restricted to the model's feature columns.
dicts = prepareDictionaries(df)

# Numeric feature matrix passed to the model.
model_input = dictionaryVectorizer(dicts)

In [91]:
# Run the loaded model on the vectorised rows; the predictions are stored
# further down as the `churn` column of the result table.
y_pred = model.predict(model_input)

In [94]:
def generate_msisdn(n):
    """Generate ``n`` random placeholder MSISDN strings.

    Each value is the prefix ``'2517'`` followed by a random 9-digit number,
    giving a 13-character string of digits.

    Parameters
    ----------
    n : int
        How many numbers to generate. The count comes only from this
        argument; the function does not depend on any notebook global.

    Returns
    -------
    list of str
        ``n`` generated numbers. They are drawn independently, so duplicates
        are possible, and the result changes from run to run unless the
        ``random`` module is seeded beforehand.
    """
    return [
        '2517' + str(random.randint(100000000, 999999999))
        for _ in range(n)
    ]

In [98]:
# Start from an empty frame; the result columns are added in the next cell.
# (Passing the input path string to the DataFrame constructor raises
# ValueError — the constructor expects data, not a filename.)
df_result = pd.DataFrame()

In [99]:
# Assemble the result table: a generated identifier per prediction, a few
# descriptive columns copied from the preprocessed input, and the prediction.
df_result['msisdn'] = generate_msisdn(len(y_pred))
for _copied_column in ('gender', 'tenure', 'totalcharges', 'monthlycharges'):
    df_result[_copied_column] = df[_copied_column]
df_result['churn'] = y_pred

In [104]:
# Preview the first rows of the assembled result table.
df_result.head()

Unnamed: 0,msisdn,gender,tenure,totalcharges,monthlycharges,churn
0,2517817157409,female,41,3320.75,79.85,0
1,2517183813093,female,66,6471.85,102.4,0
2,2517231995574,female,12,524.35,45.0,0
3,2517827538847,female,5,249.95,50.6,0
4,2517438899251,female,10,660.05,65.9,0


In [105]:
!mkdir output

In [106]:
# Write to the configured destination rather than repeating the path literal.
# NOTE(review): the DataFrame index is written as the first, unnamed column —
# confirm the downstream consumer expects it.
df_result.to_csv(output_file)

In [107]:
# Model versions must also be delivered with the data that is shipped to the database.

In [None]:
def load_model(run_id):
    """Load a logged model (unfinished placeholder).

    NOTE(review): as written this function cannot succeed; see the inline
    notes. ``run_id`` is accepted but not used anywhere in the body.
    """
    # Placeholder text rather than a real model location; it is not built
    # from ``run_id`` yet.
    logged_model = 'Path for the specific ML model and RUNID while assuming experiments will be tracked by MLflow'
    # NOTE(review): ``model`` is assigned in this function, so Python treats it
    # as a local name; reading ``model.pyfunc`` before that assignment raises
    # UnboundLocalError. Presumably ``mlflow.pyfunc.load_model`` was intended —
    # confirm, and note that mlflow is not imported in the visible cells.
    model = model.pyfunc.load_model(logged_model)
    return model

In [None]:
def apply_model(input_file, model, run_id, output_file):
    df = read_dataframe(input_file)
    dicts = prepareDictionaries(df)
    