# site-meta


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

## Data preprocessing

In [None]:
# Read the site-meta training table from the working directory and preview it.
train = pd.read_csv(filepath_or_buffer="train_site_meta.csv")
train

Unnamed: 0.1,Unnamed: 0,user-id,site-id,recency,frequency,monetary,target
0,0,user_1,2,,,,0
1,1,user_2,4,,,,0
2,2,user_2,5,,,,0
3,3,user_2,6,,,,0
4,4,user_2,7,,,,0
...,...,...,...,...,...,...,...
1356552,1356552,user_127755,194,,,,0
1356553,1356553,user_127755,212,,,,0
1356554,1356554,user_127755,65,,,,0
1356555,1356555,user_127755,213,,,,0


In [None]:
# Drop the leftover CSV index column and keep only rows that have a recency value.
# - errors='ignore' makes the cell safe to re-run after the column is already gone.
# - .copy() detaches the result from the original frame, so later column
#   assignments do not raise SettingWithCopyWarning.
train = (
    train
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .dropna(subset=['recency'])
    .copy()
)
train

Unnamed: 0,user-id,site-id,recency,frequency,monetary,target
13,user_3,21,3.0,1.0,5.0,1
18,user_3,22,3.0,1.0,5.0,1
23,user_4,30,2.0,2.0,3.0,1
25,user_4,21,1.0,1.0,5.0,1
28,user_4,29,1.0,2.0,2.0,1
...,...,...,...,...,...,...
1356543,user_127755,7,1.0,4.0,3.0,0
1356545,user_127755,8,1.0,1.0,5.0,0
1356547,user_127755,85,1.0,1.0,4.0,0
1356550,user_127755,58,5.0,5.0,4.0,0


In [None]:
# Encode the string user ids as integer codes.
# `assign` builds a new frame instead of writing into `train` in place, which
# avoids the SettingWithCopyWarning raised when `train` is a filtered slice.
label_encoder = LabelEncoder()
train = train.assign(**{'user-id': label_encoder.fit_transform(train['user-id'])})
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user-id'] = label_encoder.fit_transform(train['user-id'])


Unnamed: 0,user-id,site-id,recency,frequency,monetary,target
13,38021,21,3.0,1.0,5.0,1
18,38021,22,3.0,1.0,5.0,1
23,46232,30,2.0,2.0,3.0,1
25,46232,21,1.0,1.0,5.0,1
28,46232,29,1.0,2.0,2.0,1
...,...,...,...,...,...,...
1356543,23482,7,1.0,4.0,3.0,0
1356545,23482,8,1.0,1.0,5.0,0
1356547,23482,85,1.0,1.0,4.0,0
1356550,23482,58,5.0,5.0,4.0,0


In [None]:
# Separate the label from the predictors, then hold out 20% of the rows for evaluation.
y = train['target']
X = train.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Model training

In [None]:
# Two-step pipeline: min-max scaling followed by a seeded random forest.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the
# 'classifier' step of the pipeline.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

In [None]:
# Exhaustive 5-fold search over `param_grid`, using every available core.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [None]:
# Evaluate the refitted best pipeline on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision/recall/F1 (this is the report saved as the cell output);
# the overall accuracy stays the cell's displayed value.
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.66      0.69      0.67     46546
           1       0.50      0.47      0.48     30967

    accuracy                           0.60     77513
   macro avg       0.58      0.58      0.58     77513
weighted avg       0.60      0.60      0.60     77513



# exchange-sessions


In [None]:
import zipfile
from google.colab import drive

# Mount Google Drive and unpack the training archive into the current working directory.
drive.mount('/content/drive')

# The context manager closes the archive handle once extraction is done
# (the original left the ZipFile open for the life of the kernel).
with zipfile.ZipFile('/content/drive/MyDrive/train_exchange_sessions.zip', 'r') as z:
    z.extractall()



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the session log, recode the negative class from 0 to -1, and discard the
# leftover CSV index column.
df = (
    pd.read_csv("/content/train_exchange_sessions.csv")
    .assign(target=lambda frame: frame['target'].replace({0: -1}))
    .drop(columns=['Unnamed: 0'])
)
df.head(5)

Unnamed: 0,user-id,landed-at,sites,accepted-site-id,accepted-at,clicks,target
0,user_6,1698263812,"[169, 214, 215, 216, 217, 218, 1, 219, 220, 22...",207.0,1698264000.0,"[{'clicked-at': 1698263944, 'site-id': 207}]",-1
1,user_6,1658944504,"[46, 21, 29, 78, 180, 158, 42, 74, 1, 119, 22,...",1.0,1658945000.0,"[{'clicked-at': 1658944723, 'site-id': 1}]",-1
2,user_6,1646471462,"[326, 71, 2, 1, 21, 171, 31, 74, 80, 325, 335,...",,,,-1
3,user_6,1644170436,"[326, 71, 2, 1, 21, 48, 336, 74, 75, 79, 92, 8...",,,"[{'clicked-at': 1644170563, 'site-id': 21}]",-1
4,user_6,1630352621,"[21, 31, 74, 83, 90, 146, 88, 98, 80, 180, 46,...",21.0,1630353000.0,"[{'clicked-at': 1630352688, 'site-id': 21}]",-1


In [None]:
def _list_length(cell):
    """Return the number of items in a list-valued CSV cell.

    Cells read back from CSV are strings such as "[1, 2, 3]", so they are
    parsed with ``ast.literal_eval`` before counting. Anything that is not a
    list or tuple after parsing (e.g. a missing value) counts as zero items.
    """
    if isinstance(cell, str):
        cell = ast.literal_eval(cell)
    return len(cell) if isinstance(cell, (list, tuple)) else 0


# One-hot encode the accepted site directly on the column.
# - `prefix` gives string labels, so the feature list has one label type only.
# - Sessions with no accepted site get an all-zero row instead of NaN after the join.
# - This replaces `.sum(level=0)`, which no longer exists in pandas 2.x.
site_columns = pd.get_dummies(df['accepted-site-id'], prefix='site', dtype=float)
df = df.join(site_columns)

df['number_of_sites'] = df['sites'].apply(_list_length)
# 'accepted-at' can be missing (see the preview rows), which would leave NaN here
# and carry it through scaling into the model.
# NOTE(review): 0 is used as the placeholder - confirm this is the desired imputation.
df['time_spent'] = (df['accepted-at'] - df['landed-at']).fillna(0)
df['number_of_clicks'] = df['clicks'].apply(_list_length)

In [None]:
# Model inputs: the three engineered counts plus every one-hot site column.
features = ['number_of_sites', 'time_spent', 'number_of_clicks', *site_columns.columns]

# Standardise the feature columns in place on `df`.
scaler = StandardScaler()
df[features] = scaler.fit(df[features]).transform(df[features])

In [None]:
# Build one feature sequence (sessions x features) and one label per user.
sequences = []
targets = []

# A single groupby pass replaces re-masking the whole frame once per user.
# sort=False keeps users in order of first appearance, the same order that
# iterating over `.unique()` produced.
# NOTE(review): rows are taken in file order; confirm the sessions are sorted
# the way the sequence model expects.
for user_id, user_df in df.groupby('user-id', sort=False):
    sequences.append(user_df[features].values)
    # The label is read from the user's last row.
    targets.append(user_df['target'].iloc[-1])

In [None]:
# Pad or truncate every user's sequence to a fixed number of sessions and build
# the label vector. The inputs are `sequences` and `targets` from the per-user
# loop; the original referenced `X` and `y`, which this notebook never defines.
max_sequence_length = 5
X_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', dtype='float32')
y = np.asarray(targets)

In [None]:
# Two stacked LSTM layers with dropout, ending in a single tanh unit so the
# output lies in [-1, 1] like the -1/1 targets.
# The feature dimension comes from the padded input tensor; the original read
# it from `X`, which is not the array fed to the network.
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(max_sequence_length, X_padded.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='tanh'))

# NOTE(review): with -1/1 labels the built-in 'accuracy' metric may not match the
# sign-based accuracy computed after prediction - confirm before relying on it.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [None]:
# Hold out 20% of the users for evaluation, then train on the remainder.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    random_state=42,
    test_size=0.2,
)

model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

In [None]:
# Score the held-out users and map each output to a class by its sign
# (non-negative -> 1, everything else -> -1).
predictions = model.predict(X_test)
classified_predictions = np.where(predictions.flatten() >= 0, 1, -1).tolist()

# Fraction of held-out users whose predicted class equals the true label.
sum(map(lambda pair: pair[0] == pair[1], zip(classified_predictions, y_test))) / len(y_test)

0.658372