In [1]:
import pandas as pd
import re

In [2]:
# Read the training JSON (outer keys become the index) and turn that index
# into a regular column.
train = (
    pd.read_json('train.json', orient='index')
    .reset_index()
)

In [3]:
# (rows, columns) of the frame as loaded from train.json.
train.shape

(127755, 3)

In [54]:
# Peek at the first row.
# NOTE(review): this cell's execution count (54) is out of sequence with its
# neighbours, and the saved output already shows the flattened feature columns,
# so it appears to have been run after the later cells - re-run in order to confirm.
train.head(1)

Unnamed: 0,user,target,orders,site-meta,visits,last-visits-in-categories,exchange-sessions
0,user_1,female,"[{'site-id': 1, 'orders': [{'created-at': 1634...",[{'site-id': 2}],,,


In [5]:
# The column produced by reset_index() is called 'index'; rename it to 'user'.
train.rename(columns={'index':'user'}, inplace=True)
# Keep everything except the nested 'features' column as the per-user table.
users = train.drop(columns='features')
users.head(1)

Unnamed: 0,user,target
0,user_1,female


In [6]:
# Flatten the nested 'features' records into their own columns and place them
# next to the remaining columns of `train`.
train = pd.concat(
    [train.drop(columns='features'), pd.json_normalize(train['features'])],
    axis='columns',
)
train.shape

(127755, 7)

In [7]:
# Show one randomly chosen row to inspect the flattened feature columns.
train.sample()

Unnamed: 0,user,target,orders,site-meta,visits,last-visits-in-categories,exchange-sessions
67267,user_67268,female,"[{'site-id': 34, 'orders': [{'created-at': 156...","[{'site-id': 371}, {'site-id': 146}, {'site-id...","[{'site-id': 3, 'first-seen': 1693067437, 'las...","[{'category': 'other', 'last-visit-at': 169765...","[{'landed-at': 1694763414, 'sites': [21, 7, 2,..."


In [24]:
# Two working copies of `train`, each with all but one of the feature columns
# removed: `orders` keeps 'orders', `site_meta` keeps 'site-meta'.
orders = train.drop(
    ['visits', 'site-meta', 'exchange-sessions', 'last-visits-in-categories'],
    axis='columns',
)
site_meta = train.drop(
    ['visits', 'orders', 'exchange-sessions', 'last-visits-in-categories'],
    axis='columns',
)

In [25]:
# Replace each nested cell by the flat list of ids found in its string form.
# Float cells (presumably NaN for missing data - verify) are passed through untouched.
# The site-id pattern matches the single-quoted key as it appears in str() of the cell,
# and the captured ids are digit strings, not integers.
orders['orders'] = orders['orders'].apply(
    lambda cell: cell if isinstance(cell, float) else re.findall(r'item_\d+', str(cell))
)
site_meta['site-meta'] = site_meta['site-meta'].apply(
    lambda cell: cell if isinstance(cell, float) else re.findall(r"'site-id':\s*(\d+)", str(cell))
)

In [26]:
# One row per (user, id): every list element gets its own row, the original
# row label is repeated for each of them.
orders = orders.explode(column='orders')
site_meta = site_meta.explode(column='site-meta')

In [27]:
# Display the exploded frame: one row per (user, item id); the output shows
# users without any extracted item keeping a single row with an empty value.
orders

Unnamed: 0,user,target,orders
0,user_1,female,
1,user_2,female,
2,user_3,male,item_1
3,user_4,male,item_8
3,user_4,male,item_9
...,...,...,...
127754,user_127755,female,item_162398
127754,user_127755,female,item_1896437
127754,user_127755,female,item_3739266
127754,user_127755,female,item_57560


In [28]:
# Display the exploded frame: one row per (user, site id).
site_meta

Unnamed: 0,user,target,site-meta
0,user_1,female,2
1,user_2,female,4
1,user_2,female,5
1,user_2,female,6
1,user_2,female,7
...,...,...,...
127754,user_127755,female,194
127754,user_127755,female,212
127754,user_127755,female,65
127754,user_127755,female,213


In [29]:
def id_to_score(df, column_name):
    """Score each id by its share of 'female' rows, then average the scores per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with one row per (user, id) pair. It must contain the
        columns 'user', 'target' and `column_name`.
    column_name : str
        Name of the column that holds the ids to score.

    Returns
    -------
    ids_by_user : pandas.DataFrame
        One row per user with the columns 'user', 'sex_score' (mean score over
        the user's rows; NaN when none of the rows carries a scored id) and
        'target' (the last target value seen for the user).
    score : pandas.DataFrame
        One row per distinct non-missing id with the columns `column_name` and
        'sex_score' (rows with target == 'female' divided by all rows for that
        id; 0 when the id never occurs with target == 'female').

    Notes
    -----
    The scores are derived from `df['target']` itself, so they encode the label
    of every row passed in.
    """
    ids = df[column_name]
    # A plain NumPy mask avoids index alignment; after explode() the frame's
    # index contains repeated labels.
    is_female = (df['target'] == 'female').to_numpy()

    # value_counts() skips missing ids, so they never receive a score.
    total_counts = ids.value_counts()
    female_counts = ids[is_female].value_counts()

    # Ids that never occur with a 'female' row are absent from female_counts;
    # after reindexing they are NaN and count as zero.
    female_share = female_counts.reindex(total_counts.index).fillna(0) / total_counts

    # Build the table explicitly instead of via concat/reset_index/drop: the
    # names value_counts() gives to its result and index differ between pandas
    # versions, and the previous column juggling broke on the newer naming.
    score = pd.DataFrame({
        column_name: total_counts.index.to_numpy(),
        'sex_score': female_share.to_numpy(),
    })

    # Left join keeps every input row, including rows whose id is missing.
    ids_scored = df.merge(score, on=column_name, how='left')
    ids_by_user = (
        ids_scored.groupby('user')
        .agg({'sex_score': 'mean', 'target': 'last'})
        .reset_index()
    )

    return ids_by_user, score

In [30]:
# Per-user mean item score plus the item -> score lookup table built from the training rows.
items_by_user, items_score = id_to_score(df=orders, column_name='orders')

In [33]:
# Spot-check one random user row of the per-user item scores.
items_by_user.sample()

Unnamed: 0,user,sex_score,target
97728,user_72974,0.870125,female


In [32]:
# Spot-check one random row of the item -> score lookup table.
items_score.sample()

Unnamed: 0,orders,sex_score
904132,item_1955911,1.0


In [34]:
# Same scoring for site ids: per-user mean score plus the site-id -> score lookup table.
meta_by_user, meta_score = id_to_score(df=site_meta, column_name='site-meta')

In [35]:
# Put both per-user scores side by side, keyed on 'user'. 'target' is taken
# from the site-meta side only, so it is removed from the item side first.
assambled_meta_item = pd.merge(
    meta_by_user,
    items_by_user.drop(columns='target'),
    on='user',
    how='left',
)

In [36]:
# Check that no user was lost along the way: all four frames must have the same number of rows.
len(users) == len(items_by_user) == len(meta_by_user) == len(assambled_meta_item)

True

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature matrix: the two per-user score columns. Users without a score get the
# neutral value 0.5.
X = assambled_meta_item.drop(columns=['user', 'target']).fillna(0.5)
y = assambled_meta_item['target']

# NOTE(review): the score features were computed by id_to_score() from all
# training rows and their targets before this split, so the hold-out rows have
# already contributed their labels to the features. The accuracy printed here
# is therefore likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute all hold-out metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.89
Confusion Matrix:
[[10973  1857]
 [  970 11751]]
Classification Report:
              precision    recall  f1-score   support

      female       0.92      0.86      0.89     12830
        male       0.86      0.92      0.89     12721

    accuracy                           0.89     25551
   macro avg       0.89      0.89      0.89     25551
weighted avg       0.89      0.89      0.89     25551



In [38]:
# Load the validation JSON the same way as the training file: the index becomes
# the 'user' column and the nested 'features' records are flattened into columns.
val = (
    pd.read_json('val.json', orient='index')
    .reset_index()
    .rename(columns={'index': 'user'})
)
val = pd.concat(
    [val.drop(columns='features'), pd.json_normalize(val['features'])],
    axis='columns',
)


In [55]:
# Repeat the id extraction on the validation frame. `orders` and `site_meta`
# are rebound here and from now on refer to validation data.

# Item ids: keep 'orders', pull the ids out of each cell, one row per (user, item).
orders = val.drop(
    ['visits', 'site-meta', 'exchange-sessions', 'last-visits-in-categories'],
    axis='columns',
)
orders['orders'] = orders['orders'].apply(
    lambda cell: cell if isinstance(cell, float) else re.findall(r'item_\d+', str(cell))
)
orders = orders.explode(column='orders')

# Site ids: keep 'site-meta', pull the ids out of each cell, one row per (user, site).
site_meta = val.drop(
    ['visits', 'orders', 'exchange-sessions', 'last-visits-in-categories'],
    axis='columns',
)
site_meta['site-meta'] = site_meta['site-meta'].apply(
    lambda cell: cell if isinstance(cell, float) else re.findall(r"'site-id':\s*(\d+)", str(cell))
)
site_meta = site_meta.explode(column='site-meta')

In [60]:
# Look up the scores learned on the training data for every validation id and
# average them per user. Ids absent from the lookup tables get no score and are
# ignored by the mean.
items_by_user = (
    orders.merge(items_score, on='orders', how='left')
    .drop(columns='orders')
    .groupby('user')
    .agg({'sex_score': 'mean', 'target': 'last'})
    .reset_index()
)
meta_by_user = (
    site_meta.merge(meta_score, on='site-meta', how='left')
    .drop(columns='site-meta')
    .groupby('user')
    .agg({'sex_score': 'mean', 'target': 'last'})
    .reset_index()
)

In [61]:
# Validation feature table: site score and item score side by side per user,
# in the same column order as the training table.
assambled_meta_item = pd.merge(
    meta_by_user,
    items_by_user.drop(columns='target'),
    on='user',
    how='left',
)

In [63]:

# Score the already fitted model on the validation users; missing scores are
# filled with the same neutral 0.5 used for training.
y_test = assambled_meta_item['target']
y_pred = model.predict(
    assambled_meta_item.drop(columns=['user', 'target']).fillna(0.5)
)

# Compute all validation metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.74
Confusion Matrix:
[[10305  3421]
 [ 3727  9994]]
Classification Report:
              precision    recall  f1-score   support

      female       0.73      0.75      0.74     13726
        male       0.74      0.73      0.74     13721

    accuracy                           0.74     27447
   macro avg       0.74      0.74      0.74     27447
weighted avg       0.74      0.74      0.74     27447



In [20]:
# # A neural network could improve the score, but this needs further experiments

# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

# assambled_meta_item['target_bi'] = assambled_meta_item['target'].map({'female': 1, 'male': 0})
# X = assambled_meta_item.drop(columns=['user', 'target', 'target_bi']).fillna(0.5)
# y = assambled_meta_item['target_bi']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build the neural network model
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])


# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# # Evaluate the model on the test set
# y_pred_prob = model.predict(X_test)
# y_pred = (y_pred_prob > 0.5).astype(int)

# # Compute the accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Точность модели: {accuracy}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Точность модели: 0.8894368126492114
