<a href="https://colab.research.google.com/github/nathantthai/SFL-SDE/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
__author__ = "Nathan Thai"
__email__ = "nathantthai@gmail.com"
__phone__ = "832-528-7224"

# Section 1

---
# Section 2
### Database & Python ETL

Importing modules

In [None]:
import pandas as pd
import os
!pip install pymongo
import pymongo # often, this will need to be pip installed in, see line above
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymongo
  Downloading pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.1/492.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.3.3


Setting up MongoDB with Python

In [None]:
# Connection parameters for MongoDB (editable through the Colab form fields).
# The URI selects X.509 certificate authentication (authMechanism=MONGODB-X509),
# so no password is embedded here; the client certificate is read from the path below.
provided_URI = "mongodb+srv://cluster0.rqb9xp1.mongodb.net/?authSource=%24external&authMechanism=MONGODB-X509&retryWrites=true&w=majority" #@param {type:'string'}
path_to_certificate = "/content/X509-cert-359057758107592091.pem" #@param {type:'string'}

# Create the MongoClient: a client-side connection handle to the cluster
# (not a MongoDB server instance). TLS is enabled and the certificate/key file
# above is presented for authentication; ServerApi('1') requests Stable API version 1.
client = MongoClient(provided_URI,
                     tls=True,
                     tlsCertificateKeyFile=path_to_certificate,
                     server_api=ServerApi('1'))

# Handle to the target database.
# NOTE(review): per the PyMongo docs, databases and collections are created lazily
# on first write, so nothing is created on the server by this lookup — confirm.
database = client['SFL_SDE_evaluation_nathan']

# Handle to the collection that will receive the CSV rows.
collection = database['SRDataEngineerChallenge_DATASET']



Reading in the CSV with pandas




In [None]:
#@markdown ###Enter CSV path
csv_url = '/content/SRDataEngineerChallenge_DATASET.csv' #@param {type:"string"}

# Load the whole CSV into a DataFrame; column names come from the file's header row.
df = pd.read_csv(csv_url)

# Preview the first five rows to confirm the columns were parsed as expected.
# NOTE(review): the preview contains names, e-mail addresses and IPs — clear the
# output before sharing if this is not synthetic data.
df.head()


Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Margaretta,Laughtisse,mlaughtisse0@mediafire.com,Genderfluid,34.148.232.131
1,2,Vally,Garment,vgarment1@wisc.edu,Bigender,15.158.123.36
2,3,Tessa,Curee,tcuree2@php.net,Bigender,132.209.143.225
3,4,Arman,Heineking,aheineking3@tuttocitta.it,Male,157.110.61.233
4,5,Roselia,Trustie,rtrustie4@ft.com,Non-binary,49.55.218.81


Converting the dataframe into a list of dictionaries

In [None]:
# Columns that make up one MongoDB document, addressed by NAME.
# The previous version took the keys positionally from df.columns while reading the
# values by attribute name, so an extra leading column in the CSV (e.g. a saved
# index column) would silently shift every key. It also shadowed the built-in `id`.
RECORD_COLUMNS = ("id", "first_name", "last_name", "email", "gender", "ip_address")


def dataframe_to_records(frame, columns=RECORD_COLUMNS):
    """Convert the selected columns of a DataFrame into a list of dicts, one per row.

    Example of the returned structure:
        [{'id': 1,
          'first_name': 'Margaretta',
          'last_name': 'Laughtisse',
          'email': 'mlaughtisse0@mediafire.com',
          'gender': 'Genderfluid',
          'ip_address': '34.148.232.131'}]

    Args:
        frame: DataFrame holding at least the columns named in ``columns``.
        columns: Column names to keep, in the key order of each resulting dict.

    Returns:
        A list with one ``{column_name: value}`` dict per row of ``frame``
        (an empty list when ``frame`` has no rows).

    Raises:
        KeyError: If any requested column is missing from ``frame``.
    """
    missing = [name for name in columns if name not in frame.columns]
    if missing:
        raise KeyError(f"DataFrame is missing expected column(s): {missing}")

    # to_dict(orient="records") builds the same list-of-dicts as a manual
    # itertuples() loop, without per-row Python code in the notebook.
    return frame.loc[:, list(columns)].to_dict(orient="records")


# List of row dictionaries, ready to be passed to insert_many().
df_in_listFormat = dataframe_to_records(df)


Inserting the list of documents into the MongoDB collection

In [None]:
# Bulk-insert every row dictionary into the collection in a single call and
# show the resulting InsertManyResult as the cell output.
insert_result = collection.insert_many(df_in_listFormat)
insert_result


<pymongo.results.InsertManyResult at 0x7f789496f9a0>

---
# Section 3
### ML API

Importing modules, e.g. PyTorch, PyTorch Lightning, and the MNIST dataset

In [None]:

!pip install lightning
!pip install torch

import lightning as L
import torch as T
import torch.nn.functional as F

from torch import nn
from torchmetrics.functional import accuracy
from torch.utils.data import DataLoader, random_split
from torchvision import transforms

from torchvision.datasets import MNIST, FashionMNIST #Importing the MNIST Dataset from torchvision. It is a Hello-World dataset and any ML dataloader library should have them

# Global configuration
# Directory where datasets are downloaded/read; taken from the PATH_DATASETS
# environment variable, falling back to the current working directory.
DATASET_PATH = os.environ.get("PATH_DATASETS", ".")
# Shape of one input sample as (channels, width, height): a single-channel
# (grayscale) 28 x 28 image.
INPUT_DIM = (1, 28, 28) # 28 by 28 grayscale input, 1 channel eg. (c, w, h)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightning
  Downloading lightning-2.0.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics<2.0,>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deepdiff<8.0,>=5.7.0
  Downloading deepdiff-6.3.0-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi<0.89.0,>=0.69.0
  Downloading f

Setting up the model

In [None]:
class Model(L.LightningModule):
    """Small fully connected classifier trained with negative log-likelihood loss.

    The network flattens each input to ``channels * width * height`` features and
    passes it through two hidden ReLU layers (each followed by 10% dropout) before
    a final linear layer with ``num_classes`` outputs.

    Args:
        channels: Number of channels in one input sample.
        width: Width of one input sample.
        height: Height of one input sample.
        num_classes: Number of target classes (size of the output layer).
        hidden_size: Width of both hidden layers.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-3):
        super().__init__()

        # Keep the construction parameters on the module so later steps
        # (metrics, optimizer) read them instead of repeating literals.
        self.width = width
        self.height = height
        self.channels = channels
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels * width * height, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over dim 1) for ``x``."""
        return F.log_softmax(self.model(x), dim=1)

    def training_step(self, batch, batch_idx=None):
        """Compute the training loss for one ``(inputs, targets)`` batch.

        ``batch_idx`` is accepted (and unused) so the method matches the documented
        ``training_step(batch, batch_idx)`` hook signature; the default keeps
        calls that pass only ``batch`` working.
        """
        x, y = batch
        log_probs = self(x)  # forward() already applies log_softmax
        return F.nll_loss(log_probs, y)

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one ``(inputs, targets)`` batch."""
        x, y = batch
        log_probs = self(x)
        loss = F.nll_loss(log_probs, y)
        preds = T.argmax(log_probs, dim=1)
        # Use the configured class count rather than a hard-coded 10 so the metric
        # stays correct when the model is built with a different num_classes.
        acc = accuracy(preds, y, task="multiclass", num_classes=self.num_classes)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.learning_rate``."""
        return T.optim.Adam(self.parameters(), lr=self.learning_rate)

Setting up the dataloader to feed into training

In [None]:
class MNISTModule(L.LightningDataModule):
    """Data module that downloads MNIST and serves train/val/test DataLoaders.

    Args:
        data_dir: Directory where the MNIST files are downloaded to and read from.
        batch_size: Batch size used by all three DataLoaders (previously a
            literal 64 repeated in each loader).
    """

    def __init__(self, data_dir: str = DATASET_PATH, batch_size: int = 64):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                # 0.1307 / 0.3081 are the commonly cited mean and standard deviation
                # of the MNIST training pixels — TODO confirm against the dataset.
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
        )

        # Exposed so the model can be constructed from the data module's attributes.
        self.dims = INPUT_DIM
        self.num_classes = 10 # 10 digits

    def prepare_data(self):
        """Download the MNIST train and test files into ``data_dir``."""
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets needed for ``stage`` ("fit", "test", or None for both)."""
        # Training/validation datasets: the training file is split 55,000 / 5,000.
        if stage == "fit" or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Test dataset comes from the separate MNIST test file.
        if stage == "test" or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        """Return the DataLoader over the training split."""
        return DataLoader(self.mnist_train, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return the DataLoader over the validation split."""
        return DataLoader(self.mnist_val, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the DataLoader over the test set."""
        return DataLoader(self.mnist_test, batch_size=self.batch_size)

In [None]:
# The data module goes first: it carries the input dimensions and class count.
datamodule = MNISTModule()

# Build the classifier from the data module's attributes
# (dims unpacks into channels, width, height).
model = Model(*datamodule.dims, num_classes=datamodule.num_classes)

# One-epoch trainer; accelerator and device selection are left to Lightning.
trainer = L.Trainer(max_epochs=1, accelerator="auto", devices="auto")

# Train the model on the data served by the data module.
trainer.fit(model, datamodule=datamodule)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 91705733.29it/s]


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 29991506.27it/s]


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 27023223.29it/s]


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 7169939.32it/s]


Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw



INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 55.1 K
-------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.220     Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 55.1 K
-------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.220     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=1` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Exporting the trained model to a TorchScript file for deployment

In [None]:
# Destination of the exported TorchScript file.
TORCHSCRIPT_PATH = '/content/model.pt'

# Convert the trained LightningModule to TorchScript and write it to disk.
saved_model = model.to_torchscript()
T.jit.save(saved_model, TORCHSCRIPT_PATH)

In [None]:
# Raw MNIST test split (no transform applied), downloaded into ./data if absent.
test_set = MNIST('./data', train=False, download=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 102673316.12it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 98404300.43it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 27601847.85it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 10438645.90it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [None]:
from fastapi import FastAPI
app = FastAPI()

# POST /predict endpoint.
# NOTE(review): this handler is an unfinished stub. Its body is a bare expression
# with no effect, so the function returns None and no model is loaded or run.
# TODO: define the request payload, load the exported TorchScript model, run
# inference, and return the prediction.
# NOTE(review): the un-annotated `test_set` parameter is presumably treated by
# FastAPI as a required query parameter — confirm; it also shares its name with the
# module-level MNIST `test_set`, which it shadows inside the function.
@app.post('/predict')
def predict(test_set):
  # Bare expression: evaluates the argument and discards it (nothing is returned).
  test_set
