In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime 
%matplotlib inline

import pickle

In [2]:
with open('../data/pkl/ga_hits-002.pkl', 'rb') as file:
    data = pickle.load(file)

In [3]:
df = data.copy()

In [4]:
shape = df.shape
print("Number of rows:", "{:,}".format(shape[0]).replace(',',' '))
print("Number of columns:", "{:,}".format(shape[1]).replace(',',' '))

Number of rows: 15 726 470
Number of columns: 11


In [5]:
df.head()

Unnamed: 0,session_id,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,event_value
0,5639623078712724064.1640254056.1640254056,2021-12-23,597864.0,30,event,,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show,,
1,7750352294969115059.1640271109.1640271109,2021-12-23,597331.0,41,event,,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show,,
2,885342191847998240.1640235807.1640235807,2021-12-23,796252.0,49,event,,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show,,
3,142526202120934167.1640211014.1640211014,2021-12-23,934292.0,46,event,,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show,,
4,3450086108837475701.1640265078.1640265078,2021-12-23,768741.0,79,event,,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15726470 entries, 0 to 15726469
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   session_id      object 
 1   hit_date        object 
 2   hit_time        float64
 3   hit_number      int64  
 4   hit_type        object 
 5   hit_referer     object 
 6   hit_page_path   object 
 7   event_category  object 
 8   event_action    object 
 9   event_label     object 
 10  event_value     object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.3+ GB


### Преобразование типов переменных

In [32]:
# Функция на будующее, переводит число секунд во время
def float_to_time(seconds):
    if not seconds is None:
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)

        if hours >= 24:
            hours = hours % 24

        time_obj = datetime.time(int(hours), int(minutes), int(seconds))
        return time_obj

In [34]:
df.hit_time.value_counts(dropna=False)

NaN          9160322
0.0            41137
1.0            18522
2.0            10101
3.0             6060
              ...   
544067.0           1
698726.0           1
609006.0           1
1200872.0          1
742820.0           1
Name: hit_time, Length: 925888, dtype: int64

In [36]:
df['hit_date'] = pd.to_datetime(data['hit_date'], format='%Y-%m-%d', errors='coerce')


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15726470 entries, 0 to 15726469
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   session_id      object        
 1   hit_date        datetime64[ns]
 2   hit_time        float64       
 3   hit_number      int64         
 4   hit_type        object        
 5   hit_referer     object        
 6   hit_page_path   object        
 7   event_category  object        
 8   event_action    object        
 9   event_label     object        
 10  event_value     object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 1.3+ GB


### Обработка пустых атрибутов.

In [38]:
print(df.isna().sum())

session_id               0
hit_date                 0
hit_time           9160322
hit_number               0
hit_type                 0
hit_referer        6274804
hit_page_path            0
event_category           0
event_action             0
event_label        3760184
event_value       15726470
dtype: int64


In [39]:
total = df.shape[0]
for col in df.columns:
    missing = df[col].isnull().sum()
    percent = (missing / total) * 100
    print(f"Column: {col}, Missing Values: {missing}, Percentage of Missing Values: {percent:.2f}%")

Column: session_id, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: hit_date, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: hit_time, Missing Values: 9160322, Percentage of Missing Values: 58.25%
Column: hit_number, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: hit_type, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: hit_referer, Missing Values: 6274804, Percentage of Missing Values: 39.90%
Column: hit_page_path, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: event_category, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: event_action, Missing Values: 0, Percentage of Missing Values: 0.00%
Column: event_label, Missing Values: 3760184, Percentage of Missing Values: 23.91%
Column: event_value, Missing Values: 15726470, Percentage of Missing Values: 100.00%


In [41]:
df.hit_referer.value_counts(dropna=False)

HbolMJUevblAbkHClEQa    8879187
NaN                     6274804
FwdMTcXzWAwhtsnMAbhS      91940
sAGDBCdeVPmQPwkVEcIX      53565
GQmOLxANuSeCxCanBpMl      35470
                         ...   
VaOgrsicGpgIbzLNSDbc          1
rSeuBRLEcgyWsXrSYQqh          1
UNckxKWlzUxsKgvMBNdE          1
YZNVwxZKSbzYIwrGrgdY          1
tvGabHlPFDvTmIFiCKTL          1
Name: hit_referer, Length: 37874, dtype: int64

In [43]:
df.event_label.value_counts(dropna=False)

KclpemfoHstknWHFiLit    6505447
NaN                     3760184
hAHqGICPFQiPwtzubOzs    1284914
OHfyUfDKdPgBdvelDlfG     626554
pzAKUYdRKNEUXxxBFUPX     358915
                         ...   
iFTveknOEiHmdgIHHqBZ          1
WKfoFxrMANSjHQMIhCZr          1
RAJXTkPWctNbMZubUIng          1
jRptuMMybqHerOxplPYo          1
aNlmojNIfsJaXKZHCDiU          1
Name: event_label, Length: 39826, dtype: int64

In [44]:
drop_columns = ['hit_time', 'hit_referer','event_label', 'event_value']
df = df.drop(columns=drop_columns)

In [45]:
df.head(10)

Unnamed: 0,session_id,hit_date,hit_number,hit_type,hit_page_path,event_category,event_action
0,5639623078712724064.1640254056.1640254056,2021-12-23,30,event,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show
1,7750352294969115059.1640271109.1640271109,2021-12-23,41,event,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show
2,885342191847998240.1640235807.1640235807,2021-12-23,49,event,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show
3,142526202120934167.1640211014.1640211014,2021-12-23,46,event,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show
4,3450086108837475701.1640265078.1640265078,2021-12-23,79,event,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show
5,6466333295973247896.1640246168.1640246168,2021-12-23,41,event,sberauto.com/cars?utm_source_initial=youtube&u...,quiz,quiz_show
6,281215510786615563.1640245516.1640245516,2021-12-23,102,event,sberauto.com/cars/toyota?isnew=false&rental_ca...,quiz,quiz_show
7,4024492994895054107.1640269084.1640269084,2021-12-23,85,event,sberauto.com/cars/all/mercedes-benz/glc/f8f330...,quiz,quiz_show
8,555009234841130092.1640256620.1640256620,2021-12-23,101,event,sberauto.com/cars/all/kia/sorento/c38179cb?utm...,quiz,quiz_show
9,2692901778487480807.1640206845.1640206845,2021-12-23,1,event,sberauto.com/cars/all/nissan/x-trail/0744675f?...,card_web,view_card


In [46]:
df.hit_type.value_counts(dropna=False)

event    15726470
Name: hit_type, dtype: int64

In [47]:
# переменная hit_type имеет столько же уникальных значений
# сколько наблюдений, поэтому она бесполезна для моделирования
df = df.drop(columns=['hit_type'])

In [57]:
# бесполезное поле - порядковый номер события в рамках сессии
df = df.drop(columns=['hit_number'])

In [58]:
df.head()

Unnamed: 0,session_id,hit_date,hit_page_path,event_category,event_action
0,5639623078712724064.1640254056.1640254056,2021-12-23,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show
1,7750352294969115059.1640271109.1640271109,2021-12-23,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show
2,885342191847998240.1640235807.1640235807,2021-12-23,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show
3,142526202120934167.1640211014.1640211014,2021-12-23,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show
4,3450086108837475701.1640265078.1640265078,2021-12-23,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show


In [59]:
print(df.isna().sum())

session_id        0
hit_date          0
hit_page_path     0
event_category    0
event_action      0
dtype: int64


### Дубликаты строк

In [60]:
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 8196271


In [63]:
duplicated_rows = df[duplicates]
duplicated_rows

Unnamed: 0,session_id,hit_date,hit_page_path,event_category,event_action
23,3117547552978670057.1640217454.1640217454,2021-12-23,sberauto.com/cars/all/skoda/rapid/bf24b977?ren...,card_web,view_card
32,3282323002617677961.1640266499.1640266499,2021-12-23,sberauto.com/cars/all/lada-vaz/vesta/2fc745ed?...,card_web,view_card
39,562366956183957396.1640246520.1640246520,2021-12-23,sberauto.com/cars/all/mini/hatch/b82590a3?rent...,card_web,view_card
42,628849304731384746.1640233987.1640233987,2021-12-23,sberauto.com/cars/all/skoda/rapid/bf24b977?ren...,card_web,view_card
61,2530706414478031710.1640233887.1640233887,2021-12-23,sberauto.com/cars/all/nissan/qashqai/bfc21661?...,card_web,view_card
...,...,...,...,...,...
15726416,2445898965208837590.1640279515.1640279515,2021-12-23,sberauto.com/cars/all/kia/sorento/c38179cb?utm...,card_web,photos
15726420,1279681186746345207.1640244983.1640244983,2021-12-23,sberauto.com/cars/all/skoda/rapid/bf24b977?utm...,card_web,photos
15726421,1279681186746345207.1640244983.1640244983,2021-12-23,sberauto.com/cars/all/renault/logan/8c3c73f2?u...,card_web,photos
15726429,2233243198661178069.1640255986.1640255986,2021-12-23,sberauto.com/cars?utm_source_initial=outlook&u...,quiz,quiz_show


In [64]:
df[df['session_id']=='3717261073532546004.1632332758.1632332758']

Unnamed: 0,session_id,hit_date,hit_page_path,event_category,event_action
9627600,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
9628117,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
9629078,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_new_card
10078037,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
10078224,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
11428978,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
12330520,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_card
12330994,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_new_card
12781462,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_new_card
12781682,3717261073532546004.1632332758.1632332758,2021-09-22,sberauto.com/cars/6afb1543?rental_page=rental_car,card_web,view_new_card


In [65]:
df = df.drop_duplicates()
shape = df.shape
print("Number of rows:", "{:,}".format(shape[0]).replace(',',' '))
print("Number of columns:", "{:,}".format(shape[1]).replace(',',' '))


Number of rows: 7 530 199
Number of columns: 5


In [66]:
df.head()

Unnamed: 0,session_id,hit_date,hit_page_path,event_category,event_action
0,5639623078712724064.1640254056.1640254056,2021-12-23,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show
1,7750352294969115059.1640271109.1640271109,2021-12-23,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show
2,885342191847998240.1640235807.1640235807,2021-12-23,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show
3,142526202120934167.1640211014.1640211014,2021-12-23,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show
4,3450086108837475701.1640265078.1640265078,2021-12-23,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show


### Добавление новых атрибутов

In [67]:
# новый атрибут - Целевое действие - 1, нецелевое действие - 0
target_action = ['sub_car_claim_click', 'sub_car_claim_submit_click',
                 'sub_open_dialog_click', 'sub_custom_question_submit_click',
                 'sub_call_number_click', 'sub_callback_submit_click', 'sub_submit_success',
                 'sub_car_request_submit_click']

df['target_action'] = df.apply(lambda x: 1 if x['event_action'] in target_action else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target_action'] = df.apply(lambda x: 1 if x['event_action'] in target_action else 0, axis=1)


In [68]:
df.head()

Unnamed: 0,session_id,hit_date,hit_page_path,event_category,event_action,target_action
0,5639623078712724064.1640254056.1640254056,2021-12-23,sberauto.com/cars?utm_source_initial=google&ut...,quiz,quiz_show,0
1,7750352294969115059.1640271109.1640271109,2021-12-23,sberauto.com/cars/fiat?city=1&city=18&rental_c...,quiz,quiz_show,0
2,885342191847998240.1640235807.1640235807,2021-12-23,sberauto.com/cars/all/volkswagen/polo/e994838f...,quiz,quiz_show,0
3,142526202120934167.1640211014.1640211014,2021-12-23,sberauto.com/cars?utm_source_initial=yandex&ut...,quiz,quiz_show,0
4,3450086108837475701.1640265078.1640265078,2021-12-23,sberauto.com/cars/all/mercedes-benz/cla-klasse...,quiz,quiz_show,0


In [93]:
[df['hit_page_path'].str.split('/', expand=True)][0][3]

0                    None
1                    None
2              volkswagen
3                    None
4           mercedes-benz
                ...      
15726465           toyota
15726466    mercedes-benz
15726467           toyota
15726468              bmw
15726469              bmw
Name: 3, Length: 7530199, dtype: object

In [94]:
[df['hit_page_path'].str.split('/', expand=True)][0][4]


0                 None
1                 None
2                 polo
3                 None
4           cla-klasse
               ...    
15726465      fortuner
15726466    gla-klasse
15726467       alphard
15726468            x3
15726469       7-serii
Name: 4, Length: 7530199, dtype: object

In [95]:
df['auto_marka'] = [df['hit_page_path'].str.split('/', expand=True)][0][3]

In [97]:
df['auto_model'] = [df['hit_page_path'].str.split('/', expand=True)][0][4]

In [101]:
# удаляем поле hit_page_path, оно нам больше не нужно.
df = df.drop(columns=['hit_page_path'])

In [103]:
# пометим пустые значения в полях как OTHER - другие страницы
df.auto_marka = df.auto_marka.fillna('other')
df.auto_model = df.auto_model.fillna('other')

In [105]:
df['event_category'] = df.event_category.apply(lambda x: x.lower())
df['event_action'] = df.event_action.apply(lambda x: x.lower())
df['auto_marka'] = df.auto_marka.apply(lambda x: x.lower())
df['auto_model'] = df.auto_model.apply(lambda x: x.lower())

In [106]:
df.head(10)

Unnamed: 0,session_id,hit_date,event_category,event_action,target_action,auto_marka,auto_model
0,5639623078712724064.1640254056.1640254056,2021-12-23,quiz,quiz_show,0,other,other
1,7750352294969115059.1640271109.1640271109,2021-12-23,quiz,quiz_show,0,other,other
2,885342191847998240.1640235807.1640235807,2021-12-23,quiz,quiz_show,0,volkswagen,polo
3,142526202120934167.1640211014.1640211014,2021-12-23,quiz,quiz_show,0,other,other
4,3450086108837475701.1640265078.1640265078,2021-12-23,quiz,quiz_show,0,mercedes-benz,cla-klasse
5,6466333295973247896.1640246168.1640246168,2021-12-23,quiz,quiz_show,0,other,other
6,281215510786615563.1640245516.1640245516,2021-12-23,quiz,quiz_show,0,other,other
7,4024492994895054107.1640269084.1640269084,2021-12-23,quiz,quiz_show,0,mercedes-benz,glc
8,555009234841130092.1640256620.1640256620,2021-12-23,quiz,quiz_show,0,kia,sorento
9,2692901778487480807.1640206845.1640206845,2021-12-23,card_web,view_card,0,nissan,x-trail


In [109]:
shape = df.shape
print("Number of rows:", "{:,}".format(shape[0]).replace(',',' '))
print("Number of columns:", "{:,}".format(shape[1]).replace(',',' '))

Number of rows: 7 530 199
Number of columns: 7


In [108]:
df.to_pickle("../data/result_ga_hits.pkl")

In [110]:
with open('../data/result_ga_hits.pkl', 'rb') as file:
    df_hits = pickle.load(file)

In [111]:
shape = df_hits.shape
print("Number of rows:", "{:,}".format(shape[0]).replace(',',' '))
print("Number of columns:", "{:,}".format(shape[1]).replace(',',' '))

Number of rows: 7 530 199
Number of columns: 7


In [112]:
df_hits.head()

Unnamed: 0,session_id,hit_date,event_category,event_action,target_action,auto_marka,auto_model
0,5639623078712724064.1640254056.1640254056,2021-12-23,quiz,quiz_show,0,other,other
1,7750352294969115059.1640271109.1640271109,2021-12-23,quiz,quiz_show,0,other,other
2,885342191847998240.1640235807.1640235807,2021-12-23,quiz,quiz_show,0,volkswagen,polo
3,142526202120934167.1640211014.1640211014,2021-12-23,quiz,quiz_show,0,other,other
4,3450086108837475701.1640265078.1640265078,2021-12-23,quiz,quiz_show,0,mercedes-benz,cla-klasse
