In [1]:
# install missing packages
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader

Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/df/79/c80174d711ee26ee5da55a9cc3e248f1ec7a0188b5e4d6bbbbcd09b974b0/plotly-5.17.0-py2.py3-none-any.whl.metadata
  Using cached plotly-5.17.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Obtaining dependency information for tenacity>=6.2.0 from https://files.pythonhosted.org/packages/f4/f1/990741d5bb2487d529d20a433210ffa136a367751e454214013b441c4575/tenacity-8.2.3-py3-none-any.whl.metadata
  Using cached tenacity-8.2.3-py3-none-any.whl.metadata (1.0 kB)
Using cached plotly-5.17.0-py2.py3-none-any.whl (15.6 MB)
Using cached tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.17.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
Collecting py-cpuinfo
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo


In [2]:
# check system details
import os
import psutil
import cpuinfo

def _as_gb(num_bytes):
    """Convert a byte count into units of 1024**3 bytes for display."""
    return num_bytes / (1024 ** 3)


# Memory: totals come from psutil's snapshot of virtual memory.
ram_info = psutil.virtual_memory()
print(f"Total RAM: {_as_gb(ram_info.total):.2f} GB")
print(f"Available RAM: {_as_gb(ram_info.available):.2f} GB")
print(f"Used RAM: {_as_gb(ram_info.used):.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")

# Processor: logical core count and the clock speed reported by py-cpuinfo.
print(f"CPU Cores: {os.cpu_count()}")
print(f"CPU Speed: {cpuinfo.get_cpu_info()['hz_actual_friendly']}")

# Disk: usage of the filesystem that holds the current working directory.
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {_as_gb(disk_info.total):.2f} GB")
print(f"Available Disk: {_as_gb(disk_info.free):.2f} GB")
print(f"Used Disk: {_as_gb(disk_info.used):.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 14.14 GB
Used RAM: 1.07 GB
Percentage Usage Of RAM: 8.6%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 15.07 GB
Used Disk: 9.92 GB
Percentage Usage Of Disk: 39.7%


In [3]:
# import requirements
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from wrangle import prepare
from lm_classifier import Classification

In [4]:
# Load the raw Titanic data set into a DataFrame.
# The path is relative, so "titanic.csv" must sit in the kernel's working directory.
titanic = pd.read_csv("titanic.csv")

In [5]:
# Prepare the data for machine learning with the project's `prepare` helper.
# The recorded output shows it logging its wrangling steps (removing columns,
# extracting titles from Name, transforming Ticket and Cabin, replacing missing
# values, shuffling the rows) followed by a long series of plots, presumably
# because plots=True is passed here.
# NOTE: this rebinds `titanic` to the prepared frame, so re-running this cell on
# its own passes already-prepared data back into prepare(); re-run the loading
# cell first to start again from the raw CSV.
titanic = prepare(df=titanic, name="Data Preparation", path=None, plots=True)

Data Wrangling:
> Removing Unnecessary Columns
> Getting Titles From Name
> Transforming Ticket
> Replacing Missing Values
> Transforming Cabin
> Shuffling The Data
0.01 Seconds
Plotting:
> Plotting Correlations
> Age vs. Fare
> Plotting Age
> Plotting Fare
> Plotting Survived
> Plotting Pclass
> Plotting Sex
> Plotting SibSp
> Plotting Parch
> Plotting Ticket
> Plotting Cabin
> Plotting Embarked
> Plotting Title
> Survived vs. Pclass
> Survived vs. Sex
> Survived vs. SibSp
> Survived vs. Parch
> Survived vs. Ticket
> Survived vs. Cabin
> Survived vs. Embarked
> Survived vs. Title
> Pclass vs. Sex
> Pclass vs. SibSp
> Pclass vs. Parch
> Pclass vs. Ticket
> Pclass vs. Cabin
> Pclass vs. Embarked
> Pclass vs. Title
> Sex vs. SibSp
> Sex vs. Parch
> Sex vs. Ticket
> Sex vs. Cabin
> Sex vs. Embarked
> Sex vs. Title
> SibSp vs. Parch
> SibSp vs. Ticket
> SibSp vs. Cabin
> SibSp vs. Embarked
> SibSp vs. Title
> Parch vs. Ticket
> Parch vs. Cabin
> Parch vs. Embarked
> Parch vs. Title
> Ticke

In [6]:
# Separate the target from the features, then keep the last 20% of the rows
# (re-indexed from zero) as the testing data.
target = "Survived"
holdout_rows = int(0.2 * titanic.shape[0])

y = titanic[[target]]
X = titanic.drop(columns=target)

testX = X.tail(holdout_rows).reset_index(drop=True)
testy = y.tail(holdout_rows).reset_index(drop=True)

In [7]:
# Build the model: reuse a previously saved pipeline when one can be loaded,
# otherwise train and validate a new one, then predict on the testing rows.
print("\n---- Titanic Classification Analysis ----\n")
model = Classification(
    name="Lasso Without Feature Engineering", 
    path=None,
    rename=False, 
    time=False, 
    binary=True, 
    imputation=True, 
    variance=True,
    scale=True,
    atwood=False,
    binning=False,
    reciprocal=False, 
    interaction=False, 
    selection=False,
    plots=True,
)
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell, and report why the
    # saved pipeline could not be used instead of hiding the failure.
    print(f"Could not use a saved pipeline ({type(error).__name__}: {error}); building a new one.")
    model.validate(X, y)  # build the machine learning pipeline
    predictions = model.predict(testX)
    # These summary attributes are only reported for a freshly validated model.
    print(f"Accuracy: {model.accuracy}")
    print(f"F1: {model.f1}")
    print(f"In Control: {model.in_control}")
    print("Confusion Matrix:")
    print(model.confusion)


---- Titanic Classification Analysis ----

Model Training:
> Transforming The Training Data
> Transforming Categorical Features
> Filling In Missing Values
> Removing Constant Features
> Scaling Features
> Training Lasso
0.97 Seconds
Model Performance:
> Transforming The Testing Data
> Scoring The Model
2.97 Seconds
Model Indicators:
> Extracting Important Features
0.08 Seconds
Model Prediction:
> Transforming The New Data
> Getting Predictions
0.03 Seconds
Model Monitoring:
> Computing Feature Drift
0.19 Seconds
Accuracy: 0.8654494382022472
F1: 0.8463620209637581
In Control: 100.0%
Confusion Matrix:
    0   1
0  87  10
1  14  67


In [8]:
# Model diagnostics: list the first ten indicators, then up to ten features
# whose drift p-value is below 0.05 (or "None" when no feature qualifies).
print("Model Indicators:")
for rank, indicator in enumerate(model.indicators["Indicator"][:10], start=1):
    print(f"{rank}. {indicator}")
print(" ")
print("Feature Drift:")
drifted = model.drift.loc[model.drift["pvalue"] < 0.05, "Feature"]
if drifted.shape[0] == 0:
    print("None")
else:
    for rank, feature in enumerate(drifted[:10], start=1):
        print(f"{rank}. {feature}")

Model Indicators:
1. Title_Master.
2. SibSp_5
3. Parch_4
4. SibSp_8
5. Sex_male
6. Fare
7. Parch_6
8. Title_Rev.
9. SibSp_0
10. Ticket_4
 
Feature Drift:
None


In [9]:
# Score the predictions against the true labels of the testing rows.
observed = testy.iloc[:, 0].to_numpy()  # first (only) column as a 1-D array

accuracy = accuracy_score(y_true=observed, y_pred=predictions)
f1 = f1_score(y_true=observed, y_pred=predictions)

print(f"Accuracy: {accuracy}")
print(f"F1: {f1}")

Accuracy: 0.8651685393258427
F1: 0.8481012658227849


In [10]:
# Show the confusion matrix: rows are the true classes, columns the predicted ones.
ytest = testy.iloc[:, 0].to_numpy()
# Sorted set of every class that appears in either the predictions or the truth.
labels = np.union1d(predictions, ytest)
counts = confusion_matrix(y_true=ytest, y_pred=predictions, labels=labels)
confusion = pd.DataFrame(data=counts, index=labels, columns=labels)
# Widen the console display so the printed matrix is not wrapped.
pd.set_option("display.width", 1000)
print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
    0   1
0  87  10
1  14  67


In [11]:
# Save the machine learning pipeline through the model's own dump() method so a
# later run can reuse it (the model-building cell tries model.load() first).
# NOTE(review): the storage location is decided inside Classification
# (path=None was passed when the model was constructed) — confirm where it is written.
model.dump()

In [12]:
# Refit the model to include the test data; the recorded output shows the
# pipeline steps being re-run and the Lasso model being retrained.
# NOTE: this runs after model.dump(), so the saved pipeline is presumably the
# pre-refit one unless refit() persists the model itself — TODO confirm.
# After this cell the testing rows are part of the training data, so re-running
# the scoring cells above would no longer measure held-out performance.
model.refit(testX, testy)

Model Retraining:
> Transforming The Updated Data
> Transforming Categorical Features
> Filling In Missing Values
> Removing Constant Features
> Scaling Features
> Training Lasso
1.47 Seconds
Model Indicators:
> Extracting Important Features
0.08 Seconds
