In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm


def read_json(data_path, dataset_type):
    with open(data_path+f"{dataset_type}.json", "rb") as f:
        json_object = json.load(f)

    return json_object


def parse_visits(json_object):
    # parse json
    df = pd.DataFrame()
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        dfl1 = pd.DataFrame()
        if "visits" in json_object[key]["features"]:
            for i in range(len(json_object[key]["features"]["visits"])):
                n = len(pd.json_normalize(json_object[key]["features"]["visits"][i]["visits"]))
                dfl2 = pd.concat([
                        pd.concat(
                            [pd.json_normalize(json_object[key]["features"]["visits"][i])] * n, 
                            ignore_index=True
                        )[["site-id", "first-seen", "last-seen"]],
                        pd.json_normalize(json_object[key]["features"]["visits"][i]["visits"])
                    ], axis=1)
                dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
        dfl1["user"] = [key] * len(dfl1)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        df = pd.concat([df, dfl1]).reset_index(drop=True)

    df = df.explode("visited-items", ignore_index=True)
    df = df.explode("visited-general-categories", ignore_index=True)
    df = df.explode("visited-universal-brands", ignore_index=True)
    
    return df 

In [2]:
# Функция собирает все ["site-meta"]['site-id']

def parse_site_meta_ids(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "site-meta" in json_object[key]["features"]:
            dfl2 = pd.json_normalize(json_object[key]["features"]["site-meta"])['site-id']
            dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df 

In [3]:
# Функция замены site_id на унифицированный sex_score

def replace_site_id(df, column_name=0):
    # Переводим пол в бинарный признак через One-Hot Encoding
    df_encoded = pd.get_dummies(df, columns=['target'], prefix='target')
    # Группируем по сайту, считаем sex.score (уровень феминности) для каждого сайта
    df_grouped = df_encoded.groupby(column_name).agg({'user':'count', 'target_female':'sum'}).reset_index()
    df_grouped['target_female'] = df_grouped['target_female'] / df_grouped['user']
    # Переводим sex.scores в словарь
    df_grouped.set_index(column_name, drop=True, inplace=True)
    sex_score_dic = df_grouped['target_female'].to_dict()
    # Заменяем site_id на sex.score
    df[column_name] = df[column_name].replace(sex_score_dic)
    df = df.groupby('user').agg({column_name: 'mean', 'target': 'last'}).reset_index()
    return df

In [4]:
data_path = "flocktory/data/"
dataset_type = "val"
data = read_json(data_path, dataset_type)

In [5]:
visits = parse_visits(data)

In [10]:
visits.to_csv('visits.csv', index=False)

In [6]:
visits.sample(3)

Unnamed: 0,site-id,first-seen,last-seen,visited-at,session-duration,pages-count,user,target,visited-items,visited-general-categories,visited-universal-brands
16504150,20.0,1686404000.0,1698765000.0,1697120000.0,1.0,1.0,user_140696,female,item_78685,91392,15859
6355705,20.0,1685796000.0,1698695000.0,1698572000.0,74.0,11.0,user_132276,female,item_3130602,7812195,1756
35915980,20.0,1637851000.0,1699265000.0,1699103000.0,258.0,5.0,user_149563,male,item_6009276,2724669,1936252


In [7]:
visits.isna().mean()

site-id                       0.000000
first-seen                    0.000000
last-seen                     0.000000
visited-at                    0.000000
session-duration              0.000000
pages-count                   0.000000
user                          0.000000
target                        0.000000
visited-items                 0.009908
visited-general-categories    0.012323
visited-universal-brands      0.015231
dtype: float64

In [8]:
visits['visited-universal-brands'].nunique()

18461

In [9]:
visited_universal_brands = replace_site_id(visits[['visited-universal-brands', 'user', 'target']], column_name='visited-universal-brands')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].replace(sex_score_dic)


In [11]:
visited_universal_brands.to_csv('visits.csv', index=False)

In [12]:
visited_universal_brands.isna().mean()

user                        0.00000
visited-universal-brands    0.48937
target                      0.00000
dtype: float64

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visited_universal_brands[['visited-universal-brands']].fillna(0.5)
y = visited_universal_brands['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.63
Confusion Matrix:
[[1133 1536]
 [ 477 2263]]
Classification Report:
              precision    recall  f1-score   support

      female       0.70      0.42      0.53      2669
        male       0.60      0.83      0.69      2740

    accuracy                           0.63      5409
   macro avg       0.65      0.63      0.61      5409
weighted avg       0.65      0.63      0.61      5409



In [14]:
meta_ids = parse_site_meta_ids(data)

In [15]:
replaced_meta_df = replace_site_id(meta_ids)

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_meta_df[[0]]
y = replaced_meta_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.80
Confusion Matrix:
[[2216  566]
 [ 557 2151]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.80      0.80      2782
        male       0.79      0.79      0.79      2708

    accuracy                           0.80      5490
   macro avg       0.80      0.80      0.80      5490
weighted avg       0.80      0.80      0.80      5490



In [22]:
replaced_meta_df.rename(columns={0:'meta_site_id'}, inplace=True)
merged_df_ = replaced_meta_df.merge(visited_universal_brands[['user', 'visited-universal-brands']], on='user', how='left')

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = merged_df_.drop(columns=['user', 'target']).fillna(0.5)
y = merged_df_['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.82
Confusion Matrix:
[[2265  517]
 [ 495 2213]]
Classification Report:
              precision    recall  f1-score   support

      female       0.82      0.81      0.82      2782
        male       0.81      0.82      0.81      2708

    accuracy                           0.82      5490
   macro avg       0.82      0.82      0.82      5490
weighted avg       0.82      0.82      0.82      5490

