### TEAM NAME : Brute_force
### MEMBER1 : Khushal Rathi
### MEMBER 2 : Siddharth Nilol 


## INSTALLING LIBRARIES

In [2]:
!pip install --quiet  catboost optuna lightgbm scikit-optimize


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import RepeatedStratifiedKFold
from catboost import Pool, CatBoostClassifier


## LOADING DATA

In [4]:
## Adjust the directories accordingly before loading data
train_full=pd.read_csv('drive/MyDrive/Cascade cup/train.csv')
test_full=pd.read_csv('drive/MyDrive/Cascade cup/test.csv')


## DATA PREPROCESSING AND FEATURE ENGINEERING

In [5]:
order_id = test_full['order_id']
y_train=train_full['cancelled']

train_full.drop(columns=['order_id','pickup_time','delivered_time','reassignment_reason','cancelled_time'],axis=1,inplace=True)
test_full.drop(columns=['order_id','reassignment_reason'],axis=1,inplace=True)

train_full.loc[:, 'order_time'] = pd.to_datetime(train_full['order_time'], format='%Y-%m-%d %H:%M:%S')
train_full.loc[:, 'allot_time'] = pd.to_datetime(train_full['allot_time'], format='%Y-%m-%d %H:%M:%S')
train_full.loc[:, 'accept_time'] = pd.to_datetime(train_full['accept_time'], format='%Y-%m-%d %H:%M:%S')
train_full.loc[:, 'order_date'] = pd.to_datetime(train_full['order_date'], format='%Y-%m-%d %H:%M:%S')

test_full.loc[:, 'order_time'] = pd.to_datetime(test_full['order_time'], format='%Y-%m-%d %H:%M:%S')
test_full.loc[:, 'allot_time'] = pd.to_datetime(test_full['allot_time'], format='%Y-%m-%d %H:%M:%S')
test_full.loc[:, 'accept_time'] = pd.to_datetime(test_full['accept_time'], format='%Y-%m-%d %H:%M:%S')
test_full.loc[:, 'order_date'] = pd.to_datetime(test_full['order_date'], format='%Y-%m-%d %H:%M:%S')

train_full.loc[:, 'order_day'] = train_full['order_time'].apply(lambda x: x.day)
train_full.loc[:, 'order_month'] = train_full['order_time'].apply(lambda x: x.month)

test_full.loc[:, 'order_day'] = test_full['order_time'].apply(lambda x: x.day)
test_full.loc[:, 'order_month'] = test_full['order_time'].apply(lambda x: x.month)


In [6]:
x_train = train_full.drop(['cancelled'],axis=1)
y_train=train_full['cancelled']
x_test=test_full


In [7]:
def func1(reassignment_method):
  if(reassignment_method=='auto'):
    return 1
  elif(reassignment_method=='manual'):
    return 2
  else:
    return 3

def func2(reassigned_order):
  if(reassigned_order==1):
    return 1
  else:
    return 2

df1=train_full.apply(lambda row : func1(row['reassignment_method']), axis = 1)
df2=train_full.apply(lambda row : func2(row['reassigned_order']), axis = 1)

df1=pd.DataFrame(df1)
df1.columns=['reassignment_method']

df2=pd.DataFrame(df2)
df2.columns=['reassigned_order']

x_train.loc[:,'reassignment_method']=df1
x_train.loc[:,'reassigned_order']=df2

df1=test_full.apply(lambda row : func1(row['reassignment_method']), axis = 1)
df2=test_full.apply(lambda row : func2(row['reassigned_order']), axis = 1)

df1=pd.DataFrame(df1)
df1.columns=['reassignment_method']

df2=pd.DataFrame(df2)
df2.columns=['reassigned_order']

x_test.loc[:,'reassignment_method']=df1
x_test.loc[:,'reassigned_order']=df2


In [8]:
x_train.loc[:, 'avg_mile_distance'] = (((x_train['first_mile_distance'] + x_train['last_mile_distance'])/2).apply(lambda x:x))
x_train.loc[:, 'diff1'] = ((x_train['allot_time'] - x_train['order_time']).apply(lambda x: x.total_seconds()))
x_train.loc[:, 'diff2'] = ((x_train['accept_time'] - x_train['allot_time']).apply(lambda x: x.total_seconds()))

x_test.loc[:, 'avg_mile_distance'] = (((x_test['first_mile_distance'] + x_test['last_mile_distance'])/2).apply(lambda x:x))
x_test.loc[:, 'diff1'] = ((x_test['allot_time'] - x_test['order_time']).apply(lambda x: x.total_seconds()))
x_test.loc[:, 'diff2'] = ((x_test['accept_time'] - x_test['allot_time']).apply(lambda x: x.total_seconds()))


In [9]:
orders_train=x_train[['rider_id','order_date','undelivered_orders']].fillna(0)
orders_test=x_test[['rider_id','order_date','undelivered_orders']].fillna(0)
orders_train['days_diff']=(x_train['order_time']-x_train['order_time'].iloc[0]).dt.days
orders_test['days_diff']=(x_test['order_time']-x_test['order_time'].iloc[0]).dt.days


In [10]:
groups=orders_train.groupby(['rider_id'])
key = groups.groups.keys()
order_train={}

for i in key:
    df=groups.get_group(i)
    order_train[df['rider_id'].iloc[0]]=df['undelivered_orders'].iloc[0]

def func1(undelivered_order,rider_id):
    if(undelivered_order>order_train[rider_id]):
      order_train[rider_id]=undelivered_order
      return 1
    else:
      return 0

df1 = orders_train.apply(lambda row : func1(row['undelivered_orders'],row['rider_id']), axis = 1)
df_orders = pd.DataFrame(df1)
df_orders.columns=['orders_change']
orders_train.loc[:,'orders_change']=df_orders

groups=orders_train.groupby(['rider_id'])
key = groups.groups.keys()
list_train=[]
dates_train={}

for i in key:
    df=groups.get_group(i)
    dates_train[df['rider_id'].iloc[0]]=df['order_date'].unique()
    rows_where_order_change=df[df['orders_change']==1]
    for index, row in rows_where_order_change.iterrows():
        list_train.append([row['rider_id'],row['order_date']])

list_train1=[]

for i in list_train:
  for j in range(len(dates_train[i[0]])):
    if(dates_train[i[0]][j]==i[1]):
      list_train1.append([i[0],dates_train[i[0]][j-1]])

orders_train.loc[:,'order_change_final']=0
for i in list_train1:
  rows=orders_train[(orders_train['rider_id']==i[0])& (orders_train['order_date']==i[1])]
  for index, row in rows.iterrows():
    orders_train.at[index,'order_change_final']=1



In [11]:
groups=orders_test.groupby(['rider_id'])
key = groups.groups.keys()
order_test={}

for i in key:
    df=groups.get_group(i)
    order_test[df['rider_id'].iloc[0]]=df['undelivered_orders'].iloc[0]

def func1(undelivered_order,rider_id):
    if(undelivered_order>order_test[rider_id]):
      order_test[rider_id]=undelivered_order
      return 1
    else:
      return 0

df1 = orders_test.apply(lambda row : func1(row['undelivered_orders'],row['rider_id']), axis = 1)
df_orders = pd.DataFrame(df1)
df_orders.columns=['orders_change']
orders_test.loc[:,'orders_change']=df_orders

groups=orders_test.groupby(['rider_id'])
key = groups.groups.keys()
list_test=[]
dates_test={}

for i in key:
    df=groups.get_group(i)
    dates_test[df['rider_id'].iloc[0]]=df['order_date'].unique()
    rows_where_order_change=df[df['orders_change']==1]
    for index, row in rows_where_order_change.iterrows():
        list_test.append([row['rider_id'],row['order_date']])

list_test1=[]

for i in list_test:
  for j in range(len(dates_test[i[0]])):
    if(dates_test[i[0]][j]==i[1]):
      list_test1.append([i[0],dates_test[i[0]][j-1]])


orders_test.loc[:,'order_change_final']=0
for i in list_test1:
  rows=orders_test[(orders_test['rider_id']==i[0])& (orders_test['order_date']==i[1])]
  for index, row in rows.iterrows():
    orders_test.at[index,'order_change_final']=1


In [12]:
x_train['order_change_final']=orders_train['order_change_final']
x_test['order_change_final']=orders_test['order_change_final']


In [13]:
x_train.drop(columns=['order_time','allot_time','accept_time','order_date'],axis=1,inplace=True)
x_test.drop(columns=['order_time','allot_time','accept_time','order_date'],axis=1,inplace=True)


In [14]:
riders={}

for index, row in x_train.iterrows():
  if row['rider_id'] in riders.keys():
    riders[row['rider_id']]+=1
  else:
    riders[row['rider_id']]=1
  
for index, row in x_test.iterrows():
  if row['rider_id'] in riders.keys():
    riders[row['rider_id']]+=1
  else:
    riders[row['rider_id']]=1


In [15]:
x_train['riders'] = x_train['rider_id'].apply(lambda x: riders[x])
x_test['riders'] = x_test['rider_id'].apply(lambda x: riders[x])


## MODEL TRAINING

In [16]:
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train, y_train,train_size=0.8, test_size=0.2,random_state=42) 

from catboost import Pool, CatBoostClassifier

train_label = Y_train
eval_label = Y_valid

train_dataset = Pool(data=X_train,label=train_label)
eval_dataset = Pool(data=X_valid,label=eval_label)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=1000,learning_rate=0.01,depth=8,loss_function='Logloss',
                           eval_metric='AUC',use_best_model=True,random_seed=42
                          ,l2_leaf_reg=8,early_stopping_rounds=100)

model.fit(train_dataset,plot=True,eval_set=eval_dataset)
pred_val=model.predict(X_valid)
pred_test=model.predict(x_test)
pred_probab_cat=model.predict_proba(x_test)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8929456	best: 0.8929456 (0)	total: 145ms	remaining: 2m 25s
1:	test: 0.9258365	best: 0.9258365 (1)	total: 232ms	remaining: 1m 55s
2:	test: 0.9260263	best: 0.9260263 (2)	total: 335ms	remaining: 1m 51s
3:	test: 0.9288127	best: 0.9288127 (3)	total: 410ms	remaining: 1m 42s
4:	test: 0.9272901	best: 0.9288127 (3)	total: 496ms	remaining: 1m 38s
5:	test: 0.9275245	best: 0.9288127 (3)	total: 578ms	remaining: 1m 35s
6:	test: 0.9261876	best: 0.9288127 (3)	total: 673ms	remaining: 1m 35s
7:	test: 0.9250331	best: 0.9288127 (3)	total: 743ms	remaining: 1m 32s
8:	test: 0.9250509	best: 0.9288127 (3)	total: 830ms	remaining: 1m 31s
9:	test: 0.9278561	best: 0.9288127 (3)	total: 911ms	remaining: 1m 30s
10:	test: 0.9327554	best: 0.9327554 (10)	total: 1s	remaining: 1m 30s
11:	test: 0.9313910	best: 0.9327554 (10)	total: 1.09s	remaining: 1m 29s
12:	test: 0.9317338	best: 0.9327554 (10)	total: 1.18s	remaining: 1m 29s
13:	test: 0.9301855	best: 0.9327554 (10)	total: 1.25s	remaining: 1m 28s
14:	test: 0.929

In [17]:
pred_probab_cat=pd.DataFrame(pred_probab_cat)
pred_probab_cat.columns=['probab_0','probab_1']
pred_probab_cat.drop(columns=['probab_0'],inplace=True)
pred_probab_cat.columns=['cancelled']
result = pd.concat([order_id, pred_probab_cat], axis=1, join='inner')


## FINAL SUBMISSION

In [18]:
## Adjust the directories accordingly before saving data
result.to_csv('drive/MyDrive/Cascade cup/final/final_sub.csv',index=False)
