In [2]:
import json
import os

import pandas as pd
from tqdm import tqdm


def read_json(data_path, dataset_type):
    """Load ``<data_path><dataset_type>.json`` and return the parsed content.

    Parameters
    ----------
    data_path : str
        Path prefix. It is concatenated with the file name as-is, so it has
        to end with a path separator already.
    dataset_type : str
        File stem, e.g. ``"train"`` or ``"val"``.

    Returns
    -------
    The deserialized JSON document.
    """
    json_file = data_path + f"{dataset_type}.json"
    # Opened in binary mode; json.load decodes the bytes itself.
    with open(json_file, "rb") as handle:
        return json.load(handle)


def parse_visits(json_object):
    """Flatten the per-user ``features["visits"]`` logs into one long DataFrame.

    For every user and every site entry, each record of the nested
    ``"visits"`` list becomes a row carrying the entry's ``site-id``,
    ``first-seen`` and ``last-seen`` values, the (normalized) visit fields,
    the user key and, when present, the user's ``target``. List-valued
    ``visited-*`` columns are then exploded to one element per row.

    Parameters
    ----------
    json_object : dict
        Mapping ``user key -> {"features": {...}, "target": ...}``.

    Returns
    -------
    pandas.DataFrame
        The flattened visits. Empty when the input is empty; users without
        visits contribute no rows.
    """
    site_columns = ["site-id", "first-seen", "last-seen"]
    list_columns = (
        "visited-items",
        "visited-general-categories",
        "visited-universal-brands",
    )

    # Collect the pieces and concatenate once per level instead of growing a
    # DataFrame inside the loops (which copies all previous rows every time).
    user_frames = []
    for user, payload in json_object.items():
        features = payload["features"]
        site_frames = []
        if "visits" in features:
            for site_entry in features["visits"]:
                visit_rows = pd.json_normalize(site_entry["visits"])
                n_visits = len(visit_rows)
                if n_visits == 0:
                    # No visit records: there is nothing to attach the site
                    # attributes to (and concat of zero frames would raise).
                    continue
                site_row = pd.json_normalize(site_entry)[site_columns]
                # Repeat the single site row so it lines up with every visit.
                site_repeated = pd.concat([site_row] * n_visits, ignore_index=True)
                site_frames.append(pd.concat([site_repeated, visit_rows], axis=1))

        if site_frames:
            user_df = pd.concat(site_frames, ignore_index=True)
        else:
            user_df = pd.DataFrame()
        user_df["user"] = [user] * len(user_df)
        if "target" in payload:
            user_df["target"] = payload["target"]
        user_frames.append(user_df)

    if not user_frames:
        return pd.DataFrame()

    df = pd.concat(user_frames, ignore_index=True)

    # Explode only the list columns that actually exist, so inputs without
    # any visits do not fail with a KeyError.
    for column in list_columns:
        if column in df.columns:
            df = df.explode(column, ignore_index=True)

    return df

def iterate_json(json_object, chunk_size):
    """Yield consecutive sub-dictionaries of at most ``chunk_size`` items.

    The items keep the insertion order of ``json_object``; the last chunk may
    be shorter than ``chunk_size``.
    """
    pairs = list(json_object.items())
    for start in range(0, len(pairs), chunk_size):
        stop = start + chunk_size
        yield dict(pairs[start:stop])

#timing
def get_visits_dataset(data_path, chunk_size, dataset_type="train"):
    """Parse a JSON dump chunk by chunk and store each chunk as gzip parquet.

    Parameters
    ----------
    data_path : str
        Path prefix (ending with a separator) that holds
        ``<dataset_type>.json``; the parquet files are written to
        ``<data_path><dataset_type>/``.
    chunk_size : int
        Number of users parsed and written per parquet file.
    dataset_type : str, default "train"
        Name of the JSON file (without extension) and of the output folder.

    Returns
    -------
    None
        The result is the set of files written to disk.
    """
    # read as json
    json_object = read_json(data_path, dataset_type)

    # Create the output folder up front so a missing directory does not
    # surface only after the first (slow) chunk has been parsed.
    output_dir = data_path + f"{dataset_type}/"
    os.makedirs(output_dir, exist_ok=True)

    for idx, chunk in enumerate(iterate_json(json_object, chunk_size)):
        # The first five characters of each key are dropped (the "user_"
        # prefix seen in the data) and the rest is read as an integer; the
        # smallest and largest id of the chunk go into the file name.
        user_numbers = sorted(int(user_key[5:]) for user_key in chunk)
        chunk_users = f"{user_numbers[0]}_{user_numbers[-1]}"

        df = parse_visits(chunk)

        # save as parquet by chunks
        df.to_parquet(
            output_dir + f"{idx}_{chunk_users}_{dataset_type}.parquet.gzip",
            compression="gzip",
            index=False,
        )

In [3]:
# Collects every ["site-meta"]["site-id"] value per user.

def parse_site_meta_ids(json_object):
    """Build a long table of the site ids listed in each user's ``site-meta``.

    Parameters
    ----------
    json_object : dict
        Mapping ``user key -> {"features": {...}, "target": ...}``.

    Returns
    -------
    pandas.DataFrame
        One row per ``site-meta`` record with the site id under the integer
        column label ``0`` (the label ``replace_site_id`` reads), the user key
        in ``"user"`` and, when the user has one, the label in ``"target"``.
        Empty when ``json_object`` is empty.
    """
    # Collect one frame per user and concatenate once at the end; growing the
    # result with concat inside the loop re-copies all previous rows each time.
    frames = []
    for user, payload in json_object.items():
        features = payload["features"]
        user_df = pd.DataFrame()
        if "site-meta" in features:
            site_meta = pd.json_normalize(features["site-meta"])
            if "site-id" in site_meta.columns:
                # Name the column 0 explicitly rather than relying on how
                # concat labels a Series that is appended to a DataFrame.
                user_df = pd.DataFrame({0: site_meta["site-id"].to_numpy()})
        # Attach the user key
        user_df["user"] = [user] * len(user_df)
        # Attach the label (target) when the record has one
        if "target" in payload:
            user_df["target"] = payload["target"]
        frames.append(user_df)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [132]:
# Collects every ["exchange-sessions"]["accepted-site-id"] value per user.

def parse_accepted_site_ids(json_object):
    """Build a long table of the accepted site ids from ``exchange-sessions``.

    Users without ``exchange-sessions``, or whose sessions never carry an
    ``accepted-site-id`` field, are skipped entirely. Sessions where the
    field is missing are dropped.

    Parameters
    ----------
    json_object : dict
        Mapping ``user key -> {"features": {...}, "target": ...}``.

    Returns
    -------
    pandas.DataFrame
        One row per accepted site id, stored under the integer column label
        ``0`` (the label ``replace_site_id`` reads), plus ``"user"`` and, when
        present, ``"target"``. Empty when no user qualifies.
    """
    # Collect one frame per user and concatenate once at the end; growing the
    # result with concat inside the loop re-copies all previous rows each time.
    frames = []
    for user, payload in json_object.items():
        features = payload["features"]
        if "exchange-sessions" not in features:
            continue
        # Normalize once and reuse it for both the column check and the values
        sessions = pd.json_normalize(features["exchange-sessions"])
        if "accepted-site-id" not in sessions.columns:
            continue

        accepted = sessions["accepted-site-id"].dropna()
        # Name the column 0 explicitly rather than relying on how concat
        # labels a Series that is appended to a DataFrame.
        user_df = pd.DataFrame({0: accepted.to_numpy()})
        # Attach the user key
        user_df["user"] = [user] * len(user_df)
        # Attach the label (target) when the record has one
        if "target" in payload:
            user_df["target"] = payload["target"]
        frames.append(user_df)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [16]:
# Replaces site ids with a unified per-site "sex score".

def replace_site_id(df):
    """Replace site ids by the female share of each site and average per user.

    The score of a site is the fraction of its rows whose ``target`` equals
    ``"female"``. Every site id in column ``0`` is replaced by that score and
    the rows are then aggregated per user.

    NOTE(review): the score is computed from ``target`` over all rows passed
    in, so using the result as a model feature leaks label information unless
    the scores are fitted on training rows only.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with the site id in column ``0`` plus ``"user"`` and
        ``"target"`` columns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``"user"``, ``0`` (mean score of the user's sites) and
        ``"target"`` (last non-null label of the user).
    """
    # Female share per site. Comparing against "female" directly (instead of
    # one-hot encoding) also works when no row is labelled female.
    per_row = pd.DataFrame({
        "site": df[0].to_numpy(),
        "is_female": df["target"].eq("female").to_numpy(),
    })
    sex_score = per_row.groupby("site")["is_female"].mean()

    # Work on a copy so the caller's frame keeps its site ids and the function
    # gives the same result when the cell is run again.
    scored = df.copy()
    scored[0] = scored[0].map(sex_score)

    return scored.groupby('user').agg({0: 'mean', 'target': 'last'}).reset_index()

In [140]:
# Folder prefix of the JSON dumps and the split analysed below
data_path = "flocktory/data/"
dataset_type = "val"

# Parsed JSON content of the chosen split
data = read_json(data_path=data_path, dataset_type=dataset_type)

In [141]:
# Long table of the "site-meta" site ids with the user key (and target, if any)
meta_ids = parse_site_meta_ids(json_object=data)

In [142]:
# Per-user mean site score; column 0 gets a readable name in the same step
replaced_meta_df = replace_site_id(meta_ids).rename(columns={0: 'meta_site_id'})

In [155]:
# Peek at one row; the fixed seed keeps the displayed row stable across re-runs
replaced_meta_df.sample(random_state=42)

Unnamed: 0,user,meta_site_id,target
1381,user_129137,0.505464,male


In [156]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Single feature: the user's mean site score from "site-meta".
# NOTE(review): replace_site_id derives this score from `target` over the whole
# frame before the split below, so test labels influence the feature and the
# reported metrics are likely optimistic.
X = replaced_meta_df[['meta_site_id']]
y = replaced_meta_df['target']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so build and train in one expression
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.80
Confusion Matrix:
[[2216  566]
 [ 557 2151]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.80      0.80      2782
        male       0.79      0.79      0.79      2708

    accuracy                           0.80      5490
   macro avg       0.80      0.80      0.80      5490
weighted avg       0.80      0.80      0.80      5490



In [144]:
# No visible cell defines `val`; the parsed "val" split is held in `data`
accepted_site_ids = parse_accepted_site_ids(data)

In [145]:
# Per-user mean site score; column 0 gets a readable name in the same step
replaced_accepted_df = replace_site_id(accepted_site_ids).rename(columns={0: 'accepted_site_id'})

In [152]:
# Peek at one row; the fixed seed keeps the displayed row stable across re-runs
replaced_accepted_df.sample(random_state=42)

Unnamed: 0,user,accepted_site_id,target
13251,user_153737,0.698826,male


In [153]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Single feature: the user's mean score over accepted exchange-session sites.
# NOTE(review): replace_site_id derives this score from `target` over the whole
# frame before the split below, so test labels influence the feature and the
# reported metrics are likely optimistic.
X = replaced_accepted_df[['accepted_site_id']]
y = replaced_accepted_df['target']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so build and train in one expression
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.66
Confusion Matrix:
[[1120  420]
 [ 529  712]]
Classification Report:
              precision    recall  f1-score   support

      female       0.68      0.73      0.70      1540
        male       0.63      0.57      0.60      1241

    accuracy                           0.66      2781
   macro avg       0.65      0.65      0.65      2781
weighted avg       0.66      0.66      0.66      2781



In [160]:
# Left join: keep every user of the site-meta table and attach the
# accepted-site score where the user has one
merged_df = pd.merge(
    replaced_meta_df,
    replaced_accepted_df[['user', 'accepted_site_id']],
    on='user',
    how='left',
)

In [165]:
# Keep only rows without any missing value (e.g. users lacking an accepted-site score)
merged_df = merged_df.dropna(axis=0, how='any')

In [167]:
# Peek at one row; the fixed seed keeps the displayed row stable across re-runs
merged_df.sample(random_state=42)

Unnamed: 0,user,meta_site_id,target,accepted_site_id
16345,user_144101,0.661473,female,0.696545


In [168]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Two features: the user's mean site scores from "site-meta" and from
# accepted exchange-session sites.
# NOTE(review): replace_site_id derives both scores from `target` over the
# whole frame before the split below, so test labels influence the features
# and the reported metrics are likely optimistic.
X = merged_df[['meta_site_id', 'accepted_site_id']]
y = merged_df['target']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so build and train in one expression
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.82
Confusion Matrix:
[[1280  260]
 [ 250  991]]
Classification Report:
              precision    recall  f1-score   support

      female       0.84      0.83      0.83      1540
        male       0.79      0.80      0.80      1241

    accuracy                           0.82      2781
   macro avg       0.81      0.81      0.81      2781
weighted avg       0.82      0.82      0.82      2781

