In [1]:
!pip install pgeocode

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m


In [1]:
import pandas as pd

In [2]:
import pandas as pd
import os
import pandas as pd
from torch.utils.data import Dataset
from torchvision.io import read_image
import pgeocode
import datetime
import pickle
import torch
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
def one_hot_feat(feat, feat_list, val):
    """Build a one-hot vector marking where ``feat + str(val)`` sits in ``feat_list``.

    Args:
        feat: Prefix prepended to the value to form the lookup label
            (for example ``'ship_id_'``).
        feat_list: Sequence of labels (list, NumPy array, pandas Index, ...).
        val: Raw value; it is converted with ``str`` before the lookup.

    Returns:
        A 1-D float array of length ``len(feat_list)`` that is all zeros
        except for a 1 at the position of the first matching label.

    Raises:
        IndexError: If the label does not occur in ``feat_list``.
    """
    label = feat + str(val)
    # Compare against an array: with a plain list, ``feat_list == label`` is a
    # single scalar False and the lookup could never succeed.
    labels = np.asarray(feat_list)
    hits = np.where(labels == label)[0]
    if len(hits) == 0:
        raise IndexError(f"label {label!r} not found in feat_list")

    out_array = np.zeros(len(feat_list))
    out_array[hits[0]] = 1

    return out_array

In [4]:
def get_accept_days(row):
    """Whole days between payment and the acceptance scan timestamp.

    ``row['payment_datetime']`` and ``row['acceptance_scan_timestamp']`` are
    both parsed with ``datetime.fromisoformat``; the ``.days`` component of
    (acceptance - payment) is returned, so it can be negative.
    """
    parse_iso = datetime.datetime.fromisoformat
    paid_at = parse_iso(row['payment_datetime'])
    accepted_at = parse_iso(row['acceptance_scan_timestamp'])
    return (accepted_at - paid_at).days

def get_dist(row):
    """Distance between ``row['item_zip']`` and ``row['buyer_zip']`` via pgeocode.

    A new ``pgeocode.GeoDistance('US')`` is created on every call.  When the
    lookup result is NaN, the sentinel value 10000 is returned instead.
    """
    distance = pgeocode.GeoDistance('US').query_postal_code(row['item_zip'], row['buyer_zip'])
    return 10000 if np.isnan(distance) else distance

def get_delivery_days(row):
    """Days from the payment date to the delivery date, never below zero.

    Only the first 10 characters of ``row['payment_datetime']`` are parsed,
    while ``row['delivery_date']`` is parsed in full.  A negative difference
    is reported as 0.
    """
    from_iso = datetime.datetime.fromisoformat
    paid_on = from_iso(row['payment_datetime'][:10])
    delivered_on = from_iso(row['delivery_date'])
    return max((delivered_on - paid_on).days, 0)

def clean_dec_hand_days(days):
    """Return ``days`` unchanged, or the sentinel -1000 when it is NaN."""
    return -1000 if np.isnan(days) else days

def get_zip(zip):
    """Return the first five characters of a ZIP code value.

    Args:
        zip: The ZIP value, normally a string such as ``'12345-6789'``.  The
            parameter name shadows the builtin ``zip`` inside this function;
            it is kept so existing keyword callers continue to work.

    Returns:
        ``zip[:5]`` when the value can be sliced; otherwise the value itself,
        unchanged (e.g. ``None``, ints, float NaN, NumPy scalars).
    """
    try:
        return zip[:5]
    except (TypeError, IndexError):
        # Only the errors a failed slice raises are handled here (TypeError
        # for plain Python objects, IndexError for NumPy scalars), so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return zip

In [5]:
class EbayDataset(Dataset):
    """Dataset that turns raw delivery rows into scaled feature vectors.

    ``__init__`` derives a frame of numeric features plus four raw categorical
    columns from ``delivery_df`` and loads the pickled one-hot label lists and
    the fitted scaler from ``data_dir``.  ``__getitem__`` one-hot encodes the
    categoricals for a single row, appends them to the numeric features, and
    passes the vector through the scaler.

    Attributes set by ``__init__``:
        cols: Names of the numeric feature columns, in feature order.
        ship_cols, cat_cols, pack_cols, b2c_cols: Unpickled label lists that
            ``one_hot_feat`` searches for ``'<prefix><value>'`` labels.
        scaler: Unpickled object whose ``transform`` is applied per sample.
        data: The preprocessed frame, including a ``delivery_days`` column.
    """

    def __init__(self, delivery_df, test_set=0,train_size=300000, quiz_set=0,
                 data_dir="/Users/pamelakatali/Downloads/Ebay_ML/data/"):
        """Preprocess ``delivery_df`` and load the encoding artefacts.

        Args:
            delivery_df: Frame with the raw columns read below
                (``payment_datetime``, ``item_zip``, ``shipment_method_id``, ...).
                It is not modified.
            test_set: Accepted for compatibility; not used.
            train_size: Accepted for compatibility; not used.
            quiz_set: When 0, ``delivery_days`` is computed from the rows via
                ``get_delivery_days``; otherwise it is filled with zeros.
            data_dir: Directory holding the ``*.pkl`` artefacts.  Defaults to
                the path that was previously hard-coded.
        """
        train_df = delivery_df

        train_preprocess = pd.DataFrame([])

        train_preprocess['accept_days'] = train_df.apply(get_accept_days, axis=1)
        train_preprocess['dec_handling_days'] = train_df['declared_handling_days'].apply(clean_dec_hand_days)
        train_preprocess['shipping_fee'] = train_df['shipping_fee']

        train_preprocess['carrier_min_estimate'] = train_df['carrier_min_estimate']
        train_preprocess['carrier_max_estimate'] = train_df['carrier_max_estimate']
        train_preprocess['carrier_diff_estimate'] = train_df['carrier_max_estimate'] - train_df['carrier_min_estimate']

        # Truncate the ZIP codes into local Series instead of writing them back
        # into the caller's frame.
        dist = pgeocode.GeoDistance('US')
        item_zip = train_df['item_zip'].apply(get_zip)
        buyer_zip = train_df['buyer_zip'].apply(get_zip)

        train_preprocess['shipping_dist'] = dist.query_postal_code(item_zip.astype(str).values, buyer_zip.astype(str).values)
        # Distances that could not be resolved come back as NaN; use 10000 as a sentinel.
        train_preprocess['shipping_dist'] = train_preprocess['shipping_dist'].fillna(10000)

        train_preprocess['item_price'] = train_df['item_price']
        train_preprocess['quantity'] = train_df['quantity']

        train_preprocess['weight'] = train_df['weight']
        train_preprocess['weight_units'] = train_df['weight_units']

        # Snapshot the numeric feature columns before the categorical ones are
        # added; this fixes the order of the numeric part of each vector.
        self.cols = list(train_preprocess.columns)

        # Raw categorical values, one-hot encoded per sample in __getitem__.
        train_preprocess['ship_id'] = train_df['shipment_method_id']
        train_preprocess['cat_id'] = train_df['category_id']
        train_preprocess['pack_size'] = train_df['package_size']
        train_preprocess['b2c_c2c'] = train_df['b2c_c2c']

        self.ship_cols = self._load_pickle(os.path.join(data_dir, "ship_id_cols.pkl"))
        self.cat_cols = self._load_pickle(os.path.join(data_dir, "cat_id_cols.pkl"))
        self.pack_cols = self._load_pickle(os.path.join(data_dir, "pack_size_cols.pkl"))
        self.b2c_cols = self._load_pickle(os.path.join(data_dir, "b2c_c2c_cols.pkl"))

        # The scaler is loaded once (it used to be unpickled twice).
        self.scaler = self._load_pickle(os.path.join(data_dir, "train_minmax_scaler_2.pkl"))

        if quiz_set == 0:
            train_preprocess['delivery_days'] = train_df.apply(get_delivery_days, axis=1)
        else:
            # No target is derived for the quiz set; use zeros as a placeholder.
            train_preprocess['delivery_days'] = np.zeros(len(train_preprocess))
        self.data = train_preprocess

    @staticmethod
    def _load_pickle(path):
        """Unpickle the file at ``path`` and close the handle afterwards."""
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at trusted artefact files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def __len__(self):
        """Number of preprocessed rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, delivery_days)`` for the row at position ``idx``.

        ``features`` is a 1-D array: the numeric columns followed by the
        ship / category / package-size / b2c_c2c one-hot blocks, after
        ``self.scaler.transform``.
        """
        sample = self.data.iloc[idx]

        ship = one_hot_feat('ship_id_', self.ship_cols, sample['ship_id'])
        cat = one_hot_feat('cat_id_', self.cat_cols, sample['cat_id'])
        pack = one_hot_feat('pack_size_', self.pack_cols, sample['pack_size'])
        b2c_c2c = one_hot_feat('b2c_c2c_', self.b2c_cols, sample['b2c_c2c'])

        one_hot = np.concatenate((ship, cat, pack, b2c_c2c), axis=0)

        x = np.concatenate((sample[self.cols].values.astype(np.float64), one_hot), axis=0)
        # Let the vector length determine the shape instead of hard-coding 78;
        # the scaler expects a 2-D (1, n_features) input.
        x = self.scaler.transform(x.reshape(1, -1))
        x = x.reshape(-1)
        return x, sample['delivery_days']

In [6]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

class MLP(nn.Module):
    """Fully connected network: 78 -> 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 1.

    Each of the seven hidden stages applies Linear, then ReLU, then
    BatchNorm1d; the final Linear layer has no activation.  Linear weights are
    Xavier-uniform initialised and biases start at zero.  Submodules are
    registered as ``w1``..``w8`` and ``bn1``..``bn7`` (in the order
    w1, bn1, w2, bn2, ..., w8), which determines the state-dict keys.
    """

    # Input width followed by the width of every hidden stage.
    _WIDTHS = (78, 512, 256, 128, 64, 32, 16, 8)

    def __init__(self):
        super().__init__()
        stage_shapes = zip(self._WIDTHS, self._WIDTHS[1:])
        for idx, (n_in, n_out) in enumerate(stage_shapes, start=1):
            setattr(self, f"w{idx}", self._dense(n_in, n_out))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(n_out))

        # Output head: a single unit with no batch norm after it.
        self.w8 = self._dense(self._WIDTHS[-1], 1)

    @staticmethod
    def _dense(n_in, n_out):
        """Create a Linear layer with Xavier-uniform weights and zero bias."""
        layer = nn.Linear(n_in, n_out, bias=True)
        nn.init.xavier_uniform_(layer.weight)
        nn.init.zeros_(layer.bias)
        return layer

    def forward(self, x):
        """Run ``x`` through the seven hidden stages and the output layer."""
        for idx in range(1, 8):
            dense = getattr(self, f"w{idx}")
            norm = getattr(self, f"bn{idx}")
            x = norm(F.relu(dense(x)))
        return self.w8(x)

In [7]:
# Build the network, restore the saved weights onto the CPU, and put the
# model into evaluation mode.
model = MLP()
checkpoint_path = "/Users/pamelakatali/Downloads/Ebay_ML/data/mlp_small/mlp_mid_train_0_epoch.pt"
cpu_state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(cpu_state)
model.eval()

MLP(
  (w1): Linear(in_features=78, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w4): Linear(in_features=128, out_features=64, bias=True)
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w5): Linear(in_features=64, out_features=32, bias=True)
  (bn5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w6): Linear(in_features=32, out_features=16, bias=True)
  (bn6): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (w7): Linear(in_features=16, out_features=8, bias=True)
  (bn7): BatchNorm1d(8, eps=1e-05, momen

In [10]:
import numpy as np
import pickle

data_dir = "/Users/pamelakatali/Downloads/Ebay_ML/data/"

chunk_size = 50000
quiz_filename = data_dir+"eBay_ML_Challenge_Dataset_2021_quiz.tsv.gz"

current_loss = 0.0

# Per-batch results are collected in lists and concatenated once at the end,
# instead of re-copying the growing tensors on every batch.
target_batches = []
output_batches = []

# Inference only: disable autograd so the stored outputs do not each keep a
# computation graph alive.
with torch.no_grad():
    for chunk in pd.read_csv(quiz_filename, sep='\t', chunksize=chunk_size):
        # Preprocess this chunk of quiz rows (targets are zero placeholders).
        dataset = EbayDataset(chunk, train_size=chunk_size, quiz_set=1)

        # shuffle must be False: the predictions are later attached to the
        # quiz rows by position, so they have to come out in file order.
        quizloader = torch.utils.data.DataLoader(dataset, batch_size=10000, shuffle=False, num_workers=0)

        # Iterate over the DataLoader for the quiz data
        for inputs, targets in quizloader:
            outputs = model(inputs.float())
            output_batches.append(outputs)
            target_batches.append(targets)

# Fall back to empty tensors when the file produced no batches.
targets_all = torch.cat(target_batches, 0) if target_batches else torch.tensor([])
outputs_all = torch.cat(output_batches, 0) if output_batches else torch.tensor([])

In [11]:
#inputs_all, targets_all = next(iter(quiz_loader))
#outputs_all = model(inputs_all.float())

#y_pred = sftmx(outputs_all.detach()) #np.round(outputs.detach().numpy())
#y_pred = np.argmax(y_pred, axis=1)

# Detach the model outputs, flatten them to one value per row, and round to
# whole numbers.
y_pred = np.round(
    outputs_all.detach().numpy().reshape(len(outputs_all))
)


In [12]:
# Display the rounded predictions as a quick sanity check.
y_pred

array([4., 4., 4., ..., 5., 4., 4.], dtype=float32)

In [13]:
# Drop references to objects that are no longer needed so their memory can be
# reclaimed.
dataset_quiz = None
quiz_loader = None
inputs_all = None
outputs_all = None
# The inference loop binds its objects to `dataset` and `quizloader`, so those
# are the names that actually hold the last chunk; clear them as well.
dataset = None
quizloader = None


In [14]:
import datetime
def get_delivery_date_quiz(row):
    """Turn a predicted day count into a delivery date.

    Args:
        row: Mapping with ``'payment_datetime'`` (ISO string; only its first
            10 characters are parsed) and ``'mlp_pred'`` (number of days).

    Returns:
        ``datetime.datetime`` equal to the payment date (midnight) plus
        ``mlp_pred`` days.
    """
    payment_date = datetime.datetime.fromisoformat(row['payment_datetime'][:10])
    # timedelta only accepts built-in int/float; NumPy scalars such as float32
    # are rejected, so coerce the prediction first.
    predicted_days = float(row['mlp_pred'])
    return payment_date + datetime.timedelta(days=predicted_days)

In [15]:
# Read the whole quiz table and attach the rounded day predictions as a new
# `mlp_pred` column (matched to rows by position).
quiz_path = data_dir+"eBay_ML_Challenge_Dataset_2021_quiz.tsv.gz"
quiz_df = pd.read_csv(quiz_path, sep="\t").assign(mlp_pred=y_pred)


In [16]:
# Convert the predicted day counts into delivery dates.  The day counts are
# re-attached from `y_pred` before the helper runs so this cell can be
# re-executed: once `mlp_pred` holds dates, applying the helper to that column
# directly would fail.
quiz_df['mlp_pred'] = quiz_df.assign(mlp_pred=y_pred).apply(get_delivery_date_quiz, axis=1)

In [17]:
# Write the (record_number, mlp_pred) pairs as a tab-separated file with no
# header or index; compression is inferred from the file name.
submission_path = data_dir+'eBay_ML_Challenge_Dataset_2021_quiz_mlp_pred.tsv.gz'
submission = quiz_df[['record_number', 'mlp_pred']]
submission.to_csv(submission_path, sep='\t', header=False, index=False, compression='infer')

In [None]:
Chunk: 2440
Mini-batch: 48800
Train loss: 0.794
Test loss: 0.8074281215667725

In [None]:
# Disabled snippet kept inside a string literal so it does not execute.
# It reads the training TSV, shuffles the rows (sample(frac=1)), keeps the
# first 90% as the training set and the remaining rows as the dev set, and
# writes both back out as compressed TSV files.
'''
import pandas as pd
train_set = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/Ebay/data/eBay_ML_Challenge_Dataset_2021_train.tsv.gz",sep='\t')
train_set

train_set = train_set.sample(frac=1)

train_len = int(len(train_set) * 0.9)
test_set = train_set[train_len:]
train_set = train_set[:train_len]

test_set.to_csv('/content/drive/MyDrive/Colab_Notebooks/Ebay/data/ebay_dev.tsv.gz',sep='\t',compression='infer')
train_set.to_csv('/content/drive/MyDrive/Colab_Notebooks/Ebay/data/ebay_train.tsv.gz',sep='\t',compression='infer')
'''