In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm


def read_json(data_path, dataset_type):
    with open(data_path+f"{dataset_type}.json", "rb") as f:
        json_object = json.load(f)

    return json_object


def parse_visits(json_object):
    # parse json
    df = pd.DataFrame()
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        dfl1 = pd.DataFrame()
        if "visits" in json_object[key]["features"]:
            for i in range(len(json_object[key]["features"]["visits"])):
                n = len(pd.json_normalize(json_object[key]["features"]["visits"][i]["visits"]))
                dfl2 = pd.concat([
                        pd.concat(
                            [pd.json_normalize(json_object[key]["features"]["visits"][i])] * n, 
                            ignore_index=True
                        )[["site-id", "first-seen", "last-seen"]],
                        pd.json_normalize(json_object[key]["features"]["visits"][i]["visits"])
                    ], axis=1)
                dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
        dfl1["user"] = [key] * len(dfl1)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        df = pd.concat([df, dfl1]).reset_index(drop=True)

    df = df.explode("visited-items", ignore_index=True)
    df = df.explode("visited-general-categories", ignore_index=True)
    df = df.explode("visited-universal-brands", ignore_index=True)
    
    return df 

def iterate_json(json_object, chunk_size):
    items = list(json_object.items())
    for i in range(0, len(items), chunk_size):
        yield {k:v for k,v in items[i:i+chunk_size]}

#timing
def get_visits_dataset(data_path, chunk_size, dataset_type="train"):
    # read as json
    json_object = read_json(data_path, dataset_type)
    # json_object = dict(list(json_object.items())[:50])

    for idx, chunk in enumerate(iterate_json(json_object, chunk_size)):
        # get chunk idx
        chunk_users = sorted([int(x[5:]) for x in list(chunk.keys())])
        chunk_users = f"{chunk_users[0]}_{chunk_users[-1]}"

        df = parse_visits(chunk)

        # save as parquet by chunks
        df.to_parquet(
            data_path+f"{dataset_type}/{idx}_{chunk_users}_{dataset_type}.parquet.gzip",
            compression="gzip",
            index=False
        )

In [2]:
# Функция собирает все ["site-meta"]['site-id']

def parse_site_meta_ids(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "site-meta" in json_object[key]["features"]:
            dfl2 = pd.json_normalize(json_object[key]["features"]["site-meta"])['site-id']
            dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df 

In [3]:
# Функция собирает все ["last-visits-in-categories"]['category']

def parse_category(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "last-visits-in-categories" in json_object[key]["features"]:
            dfl2 = pd.json_normalize(json_object[key]["features"]["last-visits-in-categories"])['category']
            dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df 

In [4]:
# Функция собирает все ["exchange-sessions"]['accepted-site-id']

def parse_accepted_site_ids(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "exchange-sessions" in json_object[key]["features"]:
            if "accepted-site-id" in pd.DataFrame(json_object[key]["features"]["exchange-sessions"]).columns:
                dfl2 = pd.json_normalize(json_object[key]["features"]["exchange-sessions"])['accepted-site-id'].dropna()
                dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
            else:
                continue
        else:
            continue
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df

In [5]:
# Функция собирает все ["exchange-sessions"]['clicks']

def parse_clicks(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "exchange-sessions" in json_object[key]["features"]:
            for i in range(len(json_object[key]["features"]["exchange-sessions"])):
                if 'clicks' in json_object[key]["features"]["exchange-sessions"][i].keys():
                    dfl2 = pd.json_normalize(json_object[key]["features"]["exchange-sessions"][i]["clicks"])['site-id']
                    dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
                else:
                    continue
        else:
            continue
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df

In [6]:
# Функция собирает все ["exchange-sessions"]['accepted-site-id'] & ['clicks']

def parse_accepted_n_clicks(json_object):

    # Создаем пустой df
    df = pd.DataFrame()
    # Проходим по ключам словаря (users)
    for j in range(len(list(json_object.keys()))):
        key = list(json_object.keys())[j]
        # Создаем пустой df для итерации по юзерам
        dfl1 = pd.DataFrame()
        if "exchange-sessions" in json_object[key]["features"]:
            for i in range(len(json_object[key]["features"]["exchange-sessions"])):
                if 'clicks' in json_object[key]["features"]["exchange-sessions"][i].keys():
                    dfl2 = pd.json_normalize(json_object[key]["features"]["exchange-sessions"][i]["clicks"])['site-id']
                    dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
                else:
                    if 'accepted-site-id' in json_object[key]["features"]["exchange-sessions"][i].keys():
                        dfl2 = pd.json_normalize(json_object[key]["features"]["exchange-sessions"])['accepted-site-id']
                        dfl1 = pd.concat([dfl1, dfl2]).reset_index(drop=True)
                    else:
                        continue
        else:
            continue
        # Добавляем ключ (user)
        dfl1["user"] = [key] * len(dfl1)
        # Добавляем пол (target)
        if "target" in json_object[key].keys():
            dfl1["target"] = json_object[key]["target"]
        # Пришиваем полученный df к итоговому
        df = pd.concat([df, dfl1]).reset_index(drop=True)
    
    return df

In [7]:
# Функция замены site_id на унифицированный sex_score

def replace_site_id(df, column_name=0):
    # Переводим пол в бинарный признак через One-Hot Encoding
    df_encoded = pd.get_dummies(df, columns=['target'], prefix='target')
    # Группируем по сайту, считаем sex.score (уровень феминности) для каждого сайта
    df_grouped = df_encoded.groupby(column_name).agg({'user':'count', 'target_female':'sum'}).reset_index()
    df_grouped['target_female'] = df_grouped['target_female'] / df_grouped['user']
    # Переводим sex.scores в словарь
    df_grouped.set_index(column_name, drop=True, inplace=True)
    sex_score_dic = df_grouped['target_female'].to_dict()
    # Заменяем site_id на sex.score
    df[column_name] = df[column_name].replace(sex_score_dic)
    df = df.groupby('user').agg({column_name: 'mean', 'target': 'last'}).reset_index()
    return df

In [8]:
data_path = "flocktory/data/"
dataset_type = "val"
data = read_json(data_path, dataset_type)

In [9]:
visits = parse_visits(data)

In [10]:
visits.sample(3)

Unnamed: 0,site-id,first-seen,last-seen,visited-at,session-duration,pages-count,user,target,visited-items,visited-general-categories,visited-universal-brands
23448124,20.0,1676459000.0,1698917000.0,1697812000.0,4343.0,122.0,user_141558,male,item_1426,13314855,16894
17075383,20.0,1696833000.0,1698915000.0,1698831000.0,200.0,6.0,user_140812,female,item_562766,13022251,13724
13408370,20.0,1659804000.0,1698655000.0,1698309000.0,0.0,1.0,user_138409,female,item_459884,12699910,1942675


In [11]:
visits.isna().mean()

site-id                       0.000000
first-seen                    0.000000
last-seen                     0.000000
visited-at                    0.000000
session-duration              0.000000
pages-count                   0.000000
user                          0.000000
target                        0.000000
visited-items                 0.009908
visited-general-categories    0.012323
visited-universal-brands      0.015231
dtype: float64

In [12]:
visits['site-id'].nunique()

454

In [13]:
visits['visited-items'].nunique()

242478

In [69]:
# visits_site_ids = replace_site_id(visits[['visited-items', 'user', 'target']], column_name='visited-items') - бесконечно долго

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visits_site_ids[['visited-items']]
y = visits_site_ids['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

In [14]:
visits['visited-general-categories'].nunique()

2981

In [15]:
visited_general_categories = replace_site_id(visits[['visited-general-categories', 'user', 'target']], column_name='visited-general-categories')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].replace(sex_score_dic)


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visited_general_categories[['visited-general-categories']].fillna(0.5)
y = visited_general_categories['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.57
Confusion Matrix:
[[1165 1504]
 [ 805 1935]]
Classification Report:
              precision    recall  f1-score   support

      female       0.59      0.44      0.50      2669
        male       0.56      0.71      0.63      2740

    accuracy                           0.57      5409
   macro avg       0.58      0.57      0.56      5409
weighted avg       0.58      0.57      0.57      5409



In [509]:
visits['visited-universal-brands'].nunique()

18461

In [37]:
visited_universal_brands = replace_site_id(visits[['visited-universal-brands', 'user', 'target']], column_name='visited-universal-brands')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].replace(sex_score_dic)


In [70]:
visited_universal_brands.isna().mean()

user                        0.00000
visited-universal-brands    0.48937
target                      0.00000
dtype: float64

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visited_universal_brands[['visited-universal-brands']].fillna(0.5)
y = visited_universal_brands['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.63
Confusion Matrix:
[[1133 1536]
 [ 477 2263]]
Classification Report:
              precision    recall  f1-score   support

      female       0.70      0.42      0.53      2669
        male       0.60      0.83      0.69      2740

    accuracy                           0.63      5409
   macro avg       0.65      0.63      0.61      5409
weighted avg       0.65      0.63      0.61      5409



In [511]:
visits['session-duration'].describe()

count    4.008714e+07
mean     1.181306e+03
std      1.634129e+03
min      0.000000e+00
25%      6.100000e+01
50%      4.410000e+02
75%      1.778000e+03
max      1.564900e+04
Name: session-duration, dtype: float64

In [33]:
visits_session_duration = visits.groupby('user').agg({'session-duration': 'mean', 'target': 'last'})

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visits_session_duration[['session-duration']]
y = visits_session_duration['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.52
Confusion Matrix:
[[ 975 1694]
 [ 885 1855]]
Classification Report:
              precision    recall  f1-score   support

      female       0.52      0.37      0.43      2669
        male       0.52      0.68      0.59      2740

    accuracy                           0.52      5409
   macro avg       0.52      0.52      0.51      5409
weighted avg       0.52      0.52      0.51      5409



In [512]:
visits['pages-count'].describe()

count    4.008714e+07
mean     2.862516e+01
std      4.645698e+01
min      1.000000e+00
25%      2.000000e+00
50%      8.000000e+00
75%      3.500000e+01
max      9.220000e+02
Name: pages-count, dtype: float64

In [35]:
visits_pages_count = visits.groupby('user').agg({'pages-count': 'mean', 'target': 'last'})

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = visits_pages_count[['pages-count']]
y = visits_pages_count['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.53
Confusion Matrix:
[[ 907 1762]
 [ 807 1933]]
Classification Report:
              precision    recall  f1-score   support

      female       0.53      0.34      0.41      2669
        male       0.52      0.71      0.60      2740

    accuracy                           0.53      5409
   macro avg       0.53      0.52      0.51      5409
weighted avg       0.53      0.53      0.51      5409



In [47]:
meta_ids = parse_site_meta_ids(data)

In [48]:
replaced_meta_df = replace_site_id(meta_ids)

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_meta_df[[0]]
y = replaced_meta_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.80
Confusion Matrix:
[[2216  566]
 [ 557 2151]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.80      0.80      2782
        male       0.79      0.79      0.79      2708

    accuracy                           0.80      5490
   macro avg       0.80      0.80      0.80      5490
weighted avg       0.80      0.80      0.80      5490



In [216]:
accepted_site_ids = parse_accepted_site_ids(val)

In [501]:
replaced_accepted_df = replace_site_id(accepted_site_ids)

In [502]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_accepted_df[[0]]
y = replaced_accepted_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.66
Confusion Matrix:
[[1120  420]
 [ 529  712]]
Classification Report:
              precision    recall  f1-score   support

      female       0.68      0.73      0.70      1540
        male       0.63      0.57      0.60      1241

    accuracy                           0.66      2781
   macro avg       0.65      0.65      0.65      2781
weighted avg       0.66      0.66      0.66      2781



In [290]:
clicks = parse_clicks(val)

In [503]:
replaced_clicks_df = replace_site_id(clicks)
replaced_clicks_df.sample()

Unnamed: 0,user,0,target
5810,user_139253,0.802611,female


In [504]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_clicks_df[[0]]
y = replaced_clicks_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.67
Confusion Matrix:
[[1151  428]
 [ 524  778]]
Classification Report:
              precision    recall  f1-score   support

      female       0.69      0.73      0.71      1579
        male       0.65      0.60      0.62      1302

    accuracy                           0.67      2881
   macro avg       0.67      0.66      0.66      2881
weighted avg       0.67      0.67      0.67      2881



In [313]:
accepted_n_clicks = parse_accepted_n_clicks(val)
accepted_n_clicks.sample()

Unnamed: 0,0,user,target
17165,194.0,user_127876,male


In [505]:
replaced_accepted_n_clicks_df = replace_site_id(accepted_n_clicks)
replaced_accepted_n_clicks_df.sample()

Unnamed: 0,user,0,target
11488,user_148642,0.772177,female


In [506]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_accepted_n_clicks_df[[0]]
y = replaced_accepted_n_clicks_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.62
Confusion Matrix:
[[1170  393]
 [ 703  656]]
Classification Report:
              precision    recall  f1-score   support

      female       0.62      0.75      0.68      1563
        male       0.63      0.48      0.54      1359

    accuracy                           0.62      2922
   macro avg       0.63      0.62      0.61      2922
weighted avg       0.62      0.62      0.62      2922



In [322]:
category_df = parse_category(val)
category_df.head(1)

Unnamed: 0,0,user,target
0,other,user_127756,female


In [439]:
replaced_category_df = replace_site_id(category_df)
replaced_category_df.sample()

Unnamed: 0,user,0,target
12055,user_139954,0.502892,female


In [440]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = replaced_category_df[[0]]
y = replaced_category_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.61
Confusion Matrix:
[[1597 1109]
 [1000 1709]]
Classification Report:
              precision    recall  f1-score   support

      female       0.61      0.59      0.60      2706
        male       0.61      0.63      0.62      2709

    accuracy                           0.61      5415
   macro avg       0.61      0.61      0.61      5415
weighted avg       0.61      0.61      0.61      5415



In [441]:
replaced_meta_df.rename(columns={0:'meta_site_id'}, inplace=True)
replaced_clicks_df.rename(columns={0:'clicks_site_id'}, inplace=True)
replaced_accepted_df.rename(columns={0:'accepted_site_id'}, inplace=True)
replaced_category_df.rename(columns={0:'category'}, inplace=True)

In [466]:
merged_df = replaced_meta_df.merge(replaced_clicks_df[['user', 'clicks_site_id']], on='user', how='left')

In [467]:
merged_df = merged_df.merge(replaced_accepted_df[['user', 'accepted_site_id']], on='user', how='left')

In [468]:
merged_df = merged_df.merge(replaced_category_df[['user', 'category']], on='user', how='left')

In [469]:
merged_df.sample()

Unnamed: 0,user,meta_site_id,target,clicks_site_id,accepted_site_id,category
17887,user_145643,0.527583,male,,,0.533796


In [470]:
merged_df_dropna = merged_df.dropna()

In [471]:
merged_df_fillna = merged_df.fillna(0.5)

In [472]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = merged_df_dropna.drop(columns=['user', 'target'])
y = merged_df_dropna['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.83
Confusion Matrix:
[[1254  225]
 [ 228 1009]]
Classification Report:
              precision    recall  f1-score   support

      female       0.85      0.85      0.85      1479
        male       0.82      0.82      0.82      1237

    accuracy                           0.83      2716
   macro avg       0.83      0.83      0.83      2716
weighted avg       0.83      0.83      0.83      2716



In [473]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = merged_df_fillna.drop(columns=['user', 'target'])
y = merged_df_fillna['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.80
Confusion Matrix:
[[2227  555]
 [ 547 2161]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.80      0.80      2782
        male       0.80      0.80      0.80      2708

    accuracy                           0.80      5490
   macro avg       0.80      0.80      0.80      5490
weighted avg       0.80      0.80      0.80      5490



In [73]:
replaced_meta_df.rename(columns={0:'meta_site_id'}, inplace=True)
merged_df_ = replaced_meta_df.merge(visited_universal_brands[['user', 'visited-universal-brands']], on='user', how='left')
# merged_df_ = replaced_meta_df.merge(visited_general_categories[['user', 'visited-general-categories']], on='user', how='left') - неудачный, метрика не растет
# visits_pages_count и visits_session_duration аналогично

In [74]:
merged_df_.isna().mean()

user            0.000000
meta_site_id    0.000000
target          0.000000
pages-count     0.014646
dtype: float64

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = merged_df_.drop(columns=['user', 'target']).fillna(0.5)
y = merged_df_['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Display classification report
class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

Accuracy: 0.80
Confusion Matrix:
[[2219  563]
 [ 556 2152]]
Classification Report:
              precision    recall  f1-score   support

      female       0.80      0.80      0.80      2782
        male       0.79      0.79      0.79      2708

    accuracy                           0.80      5490
   macro avg       0.80      0.80      0.80      5490
weighted avg       0.80      0.80      0.80      5490



In [409]:
merged_df_fillna['target'] = merged_df_fillna['target'].map({'female': 1, 'male': 0})

In [410]:
X = merged_df_fillna.drop(columns=['user', 'target'])
y = merged_df_fillna['target']

In [412]:
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# import pandas as pd
# from sklearn.impute import SimpleImputer


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Создание модели нейронной сети
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])


# # Компиляция модели
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Обучение модели
# model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# # Оценка модели на тестовом наборе
# y_pred_prob = model.predict(X_test)
# y_pred = (y_pred_prob > 0.5).astype(int)

# # Оценка точности
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Точность модели: {accuracy}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Точность модели: 0.7979963570127504
