# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import copy
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Data input

In [None]:
# Load the raw data and the training labels from the Kaggle input folder.
train_labels = pd.read_csv('/kaggle/input/marketcohackaton/train_labels.csv')
raw = pd.read_csv('/kaggle/input/marketcohackaton/data.csv')

# Work on an independent deep copy so that `raw` is never modified.
data = raw.copy(deep=True)

In [None]:
# Display the first rows of the working copy.
data.head()

# Data wrangling

In [None]:
# Keep only rows with a positive quantity sold and a non-negative cost.
# Both conditions are combined into a single mask and the result is copied, so
# the column assignment below writes to an independent frame rather than to a
# slice of the original (avoids pandas' SettingWithCopyWarning).
valid_rows = (data['qty_sold'] > 0) & (data['cost'] >= 0)
data = data.loc[valid_rows].copy()

# 'date' is still a string at this point; take its second '-'-separated field
# and keep it as a string (presumably the month of a YYYY-MM-DD date -- TODO
# confirm the date layout).
data['month'] = data['date'].str.split('-').str[1]

# Feature Engineering

## 1. Average price per month for an individual client

In [None]:
# Mean 'price' per (client, month), attached back onto every row of `data`.
month_keys = ['client_id', 'month']
month_avg = (
    data.groupby(month_keys)['price']
    .mean()
    .rename('month_avg')
    .reset_index()
)
data = data.merge(month_avg, on=month_keys)

In [None]:
# Wide table: one row per client and one 'AvgMonth<month>' column per month,
# with 0 where the client has no rows for that month.
MonthAvg = (
    pd.pivot_table(data, values='month_avg', index=['client_id'],
                   columns=['month'], fill_value=0)
    .rename_axis(None, axis=1)
    .add_prefix('AvgMonth')   # prefix only the month columns ...
    .reset_index()            # ... then bring client_id back as a plain column
)
MonthAvg.head()

In [None]:
# Display (rows, columns) of the monthly feature table.
MonthAvg.shape

## 2. Week number of each purchase

In [None]:
# Parse 'date' into real datetimes so the .dt accessor can be used.
data['date'] = pd.to_datetime(data['date'])

# ISO week of the year, shifted so that ISO week 23 becomes week 0.
# isocalendar().week is an unsigned (UInt32) column; cast it to a signed
# nullable integer before subtracting so that weeks earlier than 23 become
# negative numbers instead of wrapping around to huge positive values.
data['week_number'] = data['date'].dt.isocalendar().week.astype('Int64') - 23

# Number of distinct purchases per (client, week), attached back onto every
# row of `data`.
visits_week = (
    data.groupby(['client_id', 'week_number'])['purchase_id']
    .agg(visits_week='nunique')
    .reset_index()
)
data = data.merge(visits_week, on=['client_id', 'week_number'])

## 3. Number of visits a client does to the market within a given week

In [None]:
# Wide table: one row per client and one 'visits_week<n>' column per week
# number, with 0 for weeks in which the client has no rows.
WeekVisits = (
    pd.pivot_table(data, values='visits_week', index=['client_id'],
                   columns=['week_number'], fill_value=0)
    .rename_axis(None, axis=1)
    .add_prefix('visits_week')
    .reset_index()
)
WeekVisits.head()

In [None]:
# Display (rows, columns) of the weekly-visits feature table.
WeekVisits.shape

## 4. Average price per week for an individual client

In [None]:
# Mean 'price' per (client, week), attached back onto every row of `data`.
week_keys = ['client_id', 'week_number']
average_price_week = (
    data.groupby(week_keys)['price']
    .mean()
    .rename('avg_price_week')
    .reset_index()
)
data = data.merge(average_price_week, on=week_keys)

In [None]:
# Wide table: one row per client and one 'avgprice_week<n>' column per week
# number, with 0 for weeks in which the client has no rows.
PricesWeekly = (
    pd.pivot_table(data, values='avg_price_week', index=['client_id'],
                   columns=['week_number'], fill_value=0)
    .rename_axis(None, axis=1)
    .add_prefix('avgprice_week')
    .reset_index()
)
PricesWeekly.head()

In [None]:
# Display (rows, columns) of the weekly-price feature table.
PricesWeekly.shape

## 5. Sum of Discount Sales per week

In [None]:
# Per-row amount: (cost - price) multiplied by the quantity sold.
data['discount_item'] = data['qty_sold'] * (data['cost'] - data['price'])

# Negative amounts are floored at 0; everything else is kept unchanged.
data['discount_item_corr'] = data['discount_item'].clip(lower=0)

# Total floored amount per (client, week).
week_discount = (
    data.groupby(['client_id', 'week_number'])['discount_item_corr']
    .sum()
    .rename('discount_week')
    .reset_index()
)

# Wide table: one row per client, one 'discount_week<n>' column per week number.
WeekDiscount = (
    pd.pivot_table(week_discount, values='discount_week', index=['client_id'],
                   columns=['week_number'], fill_value=0)
    .rename_axis(None, axis=1)
    .add_prefix('discount_week')
    .reset_index()
)
WeekDiscount.head()

In [None]:
# Day of the week of each row's date, as returned by pandas' .dt.weekday.
data['week_day'] = data['date'].dt.weekday

In [None]:
# Display the first rows of `data` with the columns added so far.
data.head()

In [None]:
# Distinct item codes per (week, client) ...
unique_items = (
    data.groupby(['week_number', 'client_id'])['item_code']
    .nunique()
    .rename('unique_items_ave')
    .reset_index()
)
# ... averaged over the weeks in which each client appears.
UniqueItemsAve = unique_items.groupby('client_id', as_index=False)['unique_items_ave'].mean()
UniqueItemsAve

In [None]:
# Distinct category codes per (week, client) ...
unique_cat = (
    data.groupby(['week_number', 'client_id'])['category_code']
    .nunique()
    .rename('unique_cat_ave')
    .reset_index()
)
# ... averaged over the weeks in which each client appears.
UniqueCatAve = unique_cat.groupby('client_id', as_index=False)['unique_cat_ave'].mean()
UniqueCatAve

## 6. Max and average number of days between a customer's store visits

In [None]:
# One row per (date, client): every distinct date on which a client has rows.
# .copy() makes this an independent frame, so adding the 'days' column below
# does not trigger pandas' SettingWithCopyWarning.
visit_day_cust = data.drop_duplicates(['date', 'client_id'], keep='last').copy()

# Absolute number of days between each visit date and the same client's
# previous visit date (NaN for a client's first date). The result is aligned
# back to the unsorted frame by index.
previous_visit = visit_day_cust.sort_values('date').groupby('client_id')['date'].shift()
visit_day_cust['days'] = (previous_visit - visit_day_cust['date']).dt.days.abs()

# Missing values (including the first-visit gaps) are recorded as 0.
visit_day_cust = visit_day_cust.fillna(0)

In [None]:
# Largest and mean gap (in days) between visits, per client.
gap_by_client = visit_day_cust.groupby('client_id')['days']
MaxVisitDays = gap_by_client.max().rename('max_days').reset_index()
AveVisitDays = gap_by_client.mean().rename('mean_days').reset_index()

## 7. Favorite item for a client

In [None]:
def _most_frequent(values):
    """Return the entry that heads `values.value_counts()`."""
    return values.value_counts().index[0]


# Most frequently occurring item code for each client.
favorities = (
    data.groupby(['client_id'])['item_code']
    .agg(favority=_most_frequent)
    .reset_index()
)

# Final processed data

In [None]:
# Join every per-client feature table on 'client_id'. A key-based merge is used
# instead of a positional pd.concat(axis=1) so that one client's features can
# never land on another client's row if the tables differ in length or order.
# how='outer' keeps every client that appears in any table.
feature_tables = [MonthAvg, WeekVisits, PricesWeekly, UniqueItemsAve,
                  MaxVisitDays, AveVisitDays, favorities]
all_data = feature_tables[0]
for feature_table in feature_tables[1:]:
    all_data = all_data.merge(feature_table, on='client_id', how='outer')

# Attach the features to the labelled training clients (inner join: labelled
# clients without any feature row are dropped).
df = train_labels.merge(all_data, on='client_id')

In [None]:
# Display the column names of the merged training frame.
df.columns

In [None]:
# Feature matrix: everything in the merged frame except the label column.
df_train = df.drop(columns=['target_visit'])
# Take the label from the same merged frame so that features and target are
# guaranteed to have the same rows in the same order, even when the merge with
# the feature table dropped some labelled clients.
target = df['target_visit']

# Train & Val split

In [None]:

# Hold out 20% of the rows for validation; the seed is fixed for repeatability.
X_train, X_validation, y_train, y_validation = train_test_split(
    df_train, target, test_size=0.2, random_state=45
)

In [None]:
# Give all four splits a fresh 0..n-1 index, discarding the shuffled one.
X_train, X_validation, y_train, y_validation = (
    part.reset_index(drop=True)
    for part in (X_train, X_validation, y_train, y_validation)
)

# LightGBM experiment

In [None]:
# Hyper-parameter candidates explored by the grid search.
param_grid = dict(
    max_depth=[3, 4, 5, 7, 10],
    learning_rate=[1, 0.1, 0.01, 0.05],
)

# LightGBM classifier with default settings, tuned by 3-fold cross-validated
# grid search scored on F1, using all available cores.
clf = lgb.LGBMClassifier()
grid_cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split; the returned estimator is bound
# to `_` so the notebook does not print it.
_ = grid_cv.fit(X_train, y_train)

In [None]:
# Weighted F1 of the grid-search model on the validation split.
y_pred = grid_cv.predict(X_validation)
f1_score(y_true=y_validation, y_pred=y_pred, average='weighted')

# XGBoost experiment

In [None]:
# XGBoost baseline: a single classifier with a binary logistic objective and
# otherwise default hyper-parameters, fitted on the training split.
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

xgb_cl.fit(X_train, y_train)

In [None]:
# Weighted F1 of the XGBoost model on the validation split.
y_pred = xgb_cl.predict(X_validation)
f1_score(y_true=y_validation, y_pred=y_pred, average='weighted')

# Final Training with the whole data

In [None]:
# Same search space as in the LightGBM experiment, rebuilt for the final fit.
param_grid = dict(
    max_depth=[3, 4, 5, 7, 10],
    learning_rate=[1, 0.1, 0.01, 0.05],
)

# Fresh LightGBM classifier wrapped in a 3-fold, F1-scored grid search.
clf = lgb.LGBMClassifier()
grid_cv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)

In [None]:
# Display the installed XGBoost version.
xgb.__version__

In [None]:
# Remove the label column so that `df` holds only the model inputs.
df = df.drop('target_visit', axis=1)

In [None]:
# Run the grid search on the full training frame (no validation hold-out).
# NOTE(review): `df` and `target` must have the same rows in the same order.
_ = grid_cv.fit(df, target)

In [None]:
# Location of the test-label file in the Kaggle input folder.
TEST_LABEL_INPUT = '/kaggle/input/marketcohackaton/test_labels.csv'
# Load the test labels into a DataFrame.
test_labels = pd.read_csv(TEST_LABEL_INPUT)

In [None]:
# Display (rows, columns) of the test labels.
test_labels.shape

In [None]:
# Display the test-label frame.
test_labels

In [None]:
# Attach the engineered features to the test clients. A left join keeps every
# row of test_labels, in its original order, so the predictions can later be
# written back to test_labels row-for-row; test clients that have no feature
# row get NaN feature values instead of being silently dropped.
test_set = test_labels.merge(all_data, on='client_id', how='left')

In [None]:
# Display (rows, columns) of the merged test set.
test_set.shape

In [None]:
# Display the first rows of the merged test set.
test_set.head()

In [None]:
# Display the test-label shape again, for comparison with the merged test set.
test_labels.shape

In [None]:
# Display the merged test-set shape again, for comparison with test_labels.
test_set.shape

In [None]:
# Predict with the best grid-search model and store the result as the
# 'target_visit' column of test_labels.
# NOTE(review): this requires test_set to have exactly as many rows as
# test_labels, in the same order -- confirm the merge that builds test_set
# does not drop or reorder clients.
test_labels['target_visit'] = grid_cv.predict(test_set)

# Data output - Prediction

In [None]:
# Output file for the predictions (relative to the working directory).
PATH='ada_final.csv'
# Write test_labels, including the predicted column, without the row index.
test_labels.to_csv(PATH, index=False)