In [1]:
import pandas
import numpy
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import pickle
import logging

# Configure root logging at INFO first, then create this notebook's logger.
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

In [2]:
# Record the versions of the core libraries this notebook was run with
for version_format, library in (
    ("pandas==%s", pandas),
    ("numpy==%s", numpy),
    ("scikit-learn==%s", sklearn),
):
    logger.info(version_format, library.__version__)

INFO:__main__:pandas==1.1.5
INFO:__main__:numpy==1.19.5
INFO:__main__:scikit-learn==0.22


In [3]:
# Load the training data from train.csv, then log its shape and a preview
# of the first rows.
# NOTE(review): "Trainning" in the second log message is a typo; it is left
# unchanged here because it is runtime text.
training_data = pandas.read_csv("train.csv")
logger.info("training_data.shape: %s", training_data.shape)
logger.info("Sample Trainning Data: \n %s", training_data.head())

INFO:__main__:training_data.shape: (571, 12)
INFO:__main__:Sample Trainning Data: 
    PassengerId  Pclass                              Name     Sex   Age  SibSp  \
0          145       2        Andrew, Mr. Edgardo Samuel    male  18.0      0   
1          531       2          Quick, Miss. Phyllis May  female   2.0      1   
2          387       3   Goodwin, Master. Sidney Leonard    male   1.0      5   
3           94       3           Dean, Mr. Bertram Frank    male  26.0      1   
4          753       3  Vande Velde, Mr. Johannes Joseph    male  33.0      0   

   Parch     Ticket    Fare Cabin Embarked  Survived  
0      0     231945  11.500   NaN        S         0  
1      1      26360  26.000   NaN        S         1  
2      2    CA 2144  46.900   NaN        S         0  
3      2  C.A. 2315  20.575   NaN        S         0  
4      0     345780   9.500   NaN        S         0  


In [4]:
# Column roles: the label to predict, then the categorical and numeric inputs
target_variable = "Survived"
categorical_predictors = ["Sex", "Cabin", "Embarked"]
numeric_predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

In [5]:
# Keep only the predictor columns and the target; every other column is dropped
training_data = training_data[
    [*numeric_predictors, *categorical_predictors, target_variable]
]

In [6]:
# Count missing values per column; the resulting Series is the cell's output
training_data.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex           0
Cabin       426
Embarked      2
Survived      0
dtype: int64

In [7]:
logger.info("Replacing Nulls")
# Rebind to the frame returned by replace() instead of mutating in place:
# training_data was produced by a column selection, and in-place edits on such
# a frame are what pandas' view-versus-copy warnings guard against.
training_data = training_data.replace(to_replace=[None], value=numpy.nan)
# Force the numeric predictors to numeric dtype; values that cannot be parsed
# become NaN (errors="coerce").
training_data[numeric_predictors] = training_data[numeric_predictors].apply(
    pandas.to_numeric, errors="coerce"
)

INFO:__main__:Replacing Nulls


In [8]:
logger.info("Setting 'y_train' and 'X_train'")
# The label column becomes y_train; every remaining column becomes X_train
y_train = training_data["Survived"]
X_train = training_data.drop(columns="Survived")

INFO:__main__:Setting 'y_train' and 'X_train'


In [9]:
logger.info("Setting up numeric transformer Pipeline")
# Numeric columns: mean imputation followed by StandardScaler
numeric_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

INFO:__main__:Setting up numeric transformer Pipeline


In [10]:
logger.info("Setting up categorical transformer Pipeline")
# Categorical columns: fill gaps with the constant "missing", then one-hot
# encode with handle_unknown="ignore"
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

INFO:__main__:Setting up categorical transformer Pipeline


In [11]:
logger.info("Initializing preprocessor")
# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_predictors),
        ("cat", categorical_transformer, categorical_predictors),
    ]
)


INFO:__main__:Initializing preprocessor


In [12]:
logger.info("Initializing model pipeline")
# Chain the preprocessing step and a seeded random forest into one estimator
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

INFO:__main__:Initializing model pipeline


In [13]:
logger.info("Fitting model")
# Fit the whole pipeline (preprocessor + classifier) on the training split.
# As the last expression, the fitted pipeline's repr is the cell's output.
model.fit(X_train, y_train)

INFO:__main__:Fitting model


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [14]:
logger.info("Model fitting complete. Writing RFC_model.pkl")
# Serialize the fitted pipeline to RFC_model.pkl in the working directory
with open("RFC_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

INFO:__main__:Model fitting complete. Writing RFC_model.pkl


In [15]:
# Evaluate the fitted pipeline on the test set (test.csv)
test_data = pandas.read_csv("test.csv")
test_data = test_data[numeric_predictors+categorical_predictors+[target_variable]]

test_accuracy = model.score(
    test_data[numeric_predictors+categorical_predictors], test_data[target_variable]
)
# The score is computed on test.csv, so the message names test data (it
# previously said "training data") and the colon now precedes the value.
logger.info("Accuracy on test data: %f", test_accuracy)

INFO:__main__:Accuracy on training data 0.811189: 


In [16]:
# Score unlabeled records from predict.csv; only the predictor columns are
# handed to the model
new_data = pandas.read_csv("predict.csv")
new_scores = model.predict(
    new_data[[*numeric_predictors, *categorical_predictors]]
)

In [17]:
import pandas
import numpy
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import pickle
import logging

# Configure root logging at INFO first, then create the module logger used by
# the functions below.
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)


# modelop.init
def begin():
    """Load the pickled model and set the column-role globals used for scoring.

    Reads ``RFC_model.pkl`` from the current working directory into the global
    ``model`` and assigns the globals ``numeric_predictors``,
    ``categorical_predictors`` and ``target_variable``.

    Raises:
        FileNotFoundError: if ``RFC_model.pkl`` does not exist.
    """
    global model, numeric_predictors, categorical_predictors, target_variable

    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load model artifacts from a trusted source.
    # The context manager closes the file handle, which a bare open() leaked.
    with open("RFC_model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
    logger.info("'RFC_model.pkl' file loaded to global variable 'model'")

    numeric_predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
    categorical_predictors = ["Sex", "Cabin", "Embarked"]
    target_variable = "Survived"
    logger.info("Variable roles assigned")


# modelop.score
def predict(scoring_data):
    """Score one record or a batch of records with the global ``model``.

    Requires ``begin()`` to have run first, since it reads the globals
    ``model``, ``numeric_predictors`` and ``categorical_predictors``.

    Args:
        scoring_data: either a single record (a dict mapping column name to
            value) or a pandas DataFrame of records. The input is not modified.

    Yields:
        For a single record: that record as a dict with an added
        ``"Prediction"`` key. For a DataFrame: a list of such dicts, one per
        row.
    """
    # A DataFrame used to be wrapped in a one-element list like a single
    # record, which does not produce a frame of rows; handle it as a batch.
    is_batch = isinstance(scoring_data, pandas.DataFrame)
    if is_batch:
        # Copy so the "Prediction" column is not added to the caller's frame
        scoring_df = scoring_data.copy()
    else:
        scoring_df = pandas.DataFrame([scoring_data])

    logger.info("scoring_data is of shape: %s", scoring_df.shape)

    scoring_df["Prediction"] = model.predict(
        scoring_df[numeric_predictors + categorical_predictors]
    )
    records = scoring_df.to_dict(orient="records")
    yield records if is_batch else records[0]


# modelop.metrics
def metrics(metrics_df):
    """Yield the accuracy of the global ``model`` on labeled data.

    Reads the globals ``model``, ``numeric_predictors`` and
    ``categorical_predictors``.

    Args:
        metrics_df: pandas DataFrame containing the predictor columns and the
            ``Survived`` label column.

    Yields:
        A dict with the single key ``"ACCURACY"`` holding the value returned
        by ``model.score``.
    """
    logger.info("metrics_df is of shape: %s", metrics_df.shape)

    # Separate the label from the inputs, then keep only the predictor columns
    features = metrics_df.drop(columns="Survived")
    labels = metrics_df["Survived"]
    predictor_columns = numeric_predictors + categorical_predictors
    accuracy = model.score(features[predictor_columns], labels)

    yield {"ACCURACY": accuracy}


# modelop.train
def train(training_df):
    """Fit the random-forest pipeline and pickle it to ``outputDir/RFC_model.pkl``.

    Args:
        training_df: pandas DataFrame containing the predictor columns
            (Pclass, Age, SibSp, Parch, Fare, Sex, Cabin, Embarked) and the
            ``Survived`` target. The caller's frame is not modified.

    Returns:
        None. The fitted pipeline is written to disk; the module-level
        ``model`` global is not updated by this function.

    Raises:
        KeyError: if a required column is missing from ``training_df``.
    """
    # Local import: only needed here to make sure the output directory exists
    import os

    logger.info("train_df is of shape: %s", training_df.shape)

    numeric_predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
    categorical_predictors = ["Sex", "Cabin", "Embarked"]
    target_variable = "Survived"

    # Explicit copy: the edits below must act on an independent frame, not on
    # a slice of the caller's data (avoids SettingWithCopyWarning).
    training_df = training_df.loc[
        :, numeric_predictors + categorical_predictors + [target_variable]
    ].copy()

    logger.info("Replacing Nulls")
    training_df = training_df.replace(to_replace=[None], value=numpy.nan)
    # Values that cannot be parsed as numbers become NaN (errors="coerce")
    training_df[numeric_predictors] = training_df[numeric_predictors].apply(
        pandas.to_numeric, errors="coerce"
    )

    logger.info("Setting 'y_train' and 'X_train'")
    X_train = training_df.drop(target_variable, axis=1)
    y_train = training_df[target_variable]

    logger.info("Setting up numeric transformer Pipeline")
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
        ]
    )

    logger.info("Setting up categorical transformer Pipeline")
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    logger.info("Initializing preprocessor")
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_predictors),
            ("cat", categorical_transformer, categorical_predictors),
        ]
    )

    logger.info("Initializing model pipeline")
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", RandomForestClassifier(random_state=42)),
        ]
    )

    logger.info("Fitting model")
    model.fit(X_train, y_train)

    # pickle file should be written to outputDir/
    logger.info("Model fitting complete. Writing 'RFC_model.pkl' to outputDir/")
    # Create the directory if it is missing so the write cannot fail on a
    # fresh working directory; a no-op when it already exists.
    os.makedirs("outputDir", exist_ok=True)
    with open("outputDir/RFC_model.pkl", "wb") as f:
        pickle.dump(model, f)

    logger.info("Training Job Complete!")

In [18]:
# Reload the CSVs with all their columns as inputs for exercising the
# functions; files are read in the order listed
predict_data, test_data, training_data = (
    pandas.read_csv(csv_path)
    for csv_path in ("predict.csv", "test.csv", "train.csv")
)

In [19]:
# Initialize scoring state: loads RFC_model.pkl into the global `model` and
# sets the predictor/target name globals
begin()

INFO:__main__:'RFC_model.pkl' file loaded to global variable 'model'
INFO:__main__:Variable roles assigned


In [20]:
# predict() is a generator, so next() pulls the single result it yields
next(predict(predict_data))

INFO:__main__:scoring_data is of shape: (143, 11)


[{'PassengerId': 871,
  'Pclass': 3,
  'Name': 'Balkic, Mr. Cerin',
  'Sex': 'male',
  'Age': 26.0,
  'SibSp': 0,
  'Parch': 0,
  'Ticket': '349248',
  'Fare': 7.8958,
  'Cabin': nan,
  'Embarked': 'S',
  'Prediction': 0},
 {'PassengerId': 499,
  'Pclass': 1,
  'Name': 'Allison, Mrs. Hudson J C (Bessie Waldo Daniels)',
  'Sex': 'female',
  'Age': 25.0,
  'SibSp': 1,
  'Parch': 2,
  'Ticket': '113781',
  'Fare': 151.55,
  'Cabin': 'C22 C26',
  'Embarked': 'S',
  'Prediction': 1},
 {'PassengerId': 662,
  'Pclass': 3,
  'Name': 'Badt, Mr. Mohamed',
  'Sex': 'male',
  'Age': 40.0,
  'SibSp': 0,
  'Parch': 0,
  'Ticket': '2623',
  'Fare': 7.225,
  'Cabin': nan,
  'Embarked': 'C',
  'Prediction': 0},
 {'PassengerId': 173,
  'Pclass': 3,
  'Name': 'Johnson, Miss. Eleanor Ileen',
  'Sex': 'female',
  'Age': 1.0,
  'SibSp': 1,
  'Parch': 1,
  'Ticket': '347742',
  'Fare': 11.1333,
  'Cabin': nan,
  'Embarked': 'S',
  'Prediction': 0},
 {'PassengerId': 60,
  'Pclass': 3,
  'Name': 'Goodwin, Mast

In [21]:
# metrics() is a generator; next() returns the dict holding "ACCURACY"
next(metrics(test_data))

INFO:__main__:metrics_df is of shape: (143, 12)


{'ACCURACY': 0.8111888111888111}

In [22]:
# Refit the pipeline on the training set; train() writes the result to
# outputDir/RFC_model.pkl
train(training_data)

INFO:__main__:train_df is of shape: (571, 12)
INFO:__main__:Replacing Nulls
INFO:__main__:Setting 'y_train' and 'X_train'
INFO:__main__:Setting up numeric transformer Pipeline
INFO:__main__:Setting up categorical transformer Pipeline
INFO:__main__:Initializing preprocessor
INFO:__main__:Initializing model pipeline
INFO:__main__:Fitting model
INFO:__main__:Model fitting complete. Writing 'RFC_model.pkl' to outputDir/
INFO:__main__:Training Job Complete!
