In [12]:
import pandas as pd
import math
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import numpy
from datetime import datetime,date
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
#############################ADD VIZUALISATION######################################################################

In [8]:
df_sessions = pd.read_csv('data/ga_sessions.csv', low_memory=False)
df_hits = pd.read_csv('data/ga_hits.csv')

In [9]:
##################################################DATA CLEANINING################################################
#Cleaning (deleting cols)
def delete_cols(df):
    for col in df.columns:
        if float(df[col].isnull().sum()/df.shape[0]) >= 0.2:
            del df[col]

In [10]:
#First
delete_cols(df_sessions)
delete_cols(df_hits)

In [11]:
#Cleaning (filling data)
def fill_data(df):
    obj_col = []
    num_col = []
    for col in df.columns:
        if df[col].dtype == 'object':
            obj_col.append(col)
        else:
            num_col.append(col)
    
    for null_col in num_col:
        df[null_col].fillna(int(df[null_col].mean()),inplace=True)
        
    for null_col in obj_col:
        df[null_col].fillna(df[null_col].value_counts().idxmax(),inplace=True)
        
        
    '''anom_columns = []
    def calculate_outliers(data):      #change anom values with accepteble ones
        q25=data.quantile(0.25)
        q75=data.quantile(0.75)
        iqr=q75-q25
        boundaries=(q25-1.5*iqr,q75+1.5*iqr)
        return boundaries
    for n in num_col:
        boundaries=calculate_outliers(df[n])
        if df[n].max()>boundaries[1] or df[n].min()<boundaries[0]:
            anom_columns.append(n)
    print(anom_columns)''' # results --> ['visit_number'] ['hit_number'] so it's not that data that can be anom
    

In [12]:
#Second
fill_data(df_sessions)
fill_data(df_hits)

In [13]:
#Deleting features that are not necessary (trash)
#Third
del df_sessions['client_id']
del df_sessions['utm_source']
del df_sessions['utm_campaign']
del df_sessions['utm_adcontent']
del df_hits['hit_page_path']
df_sessions = df_sessions[df_sessions.device_screen_resolution!='(not set)']
for col in df_sessions.columns:
    df_sessions = df_sessions[df_sessions[col] != "(not set)"]
for col in df_hits.columns:
    df_hits = df_hits[df_hits[col] != "(not set)"]

In [14]:
#Look on the key features HAVE NO IDEA RN 

In [15]:
#########################################FEATURE ENGINEERING##################################################

In [16]:
#ONEHOTENCODER FEATURES
def OHE(df):
    obj_col = []
    for col in df.columns:
        if df[col].dtype == 'object':
            obj_col.append(col)
    for category_feature in obj_col:
        #if len(df[category_feature].unique()) < 20:
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(df[[category_feature]])
        ohe_feature=ohe.transform(df[[category_feature]])
        df[ohe.get_feature_names_out()]=ohe_feature
        del df[category_feature]

In [17]:
#utm_medium device_category device_browser
OHE(df_sessions)
OHE(df_hits)

In [18]:
def date_conv(df):
    for col in df.columns:
        if col.endswith('date'):
            df[col]=df[col].apply(lambda x: (date.today()-datetime.strptime(x, '%Y-%m-%d').date()).days)
        elif col.endswith('time'):
            df[col]=df[col].apply(lambda x: int(x[:2])*60+int(x[3:5]))         

In [19]:
date_conv(df_sessions)
date_conv(df_hits)

In [20]:
def resolution(df):
    df['device_screen_resolution']=df['device_screen_resolution'].apply(lambda x: int(x.split('x')[0])*int(x.split('x')[0]))

In [21]:
resolution(df_sessions)

In [22]:
#Standartization
def STD(df):
    num_col = []
    for col in df.columns:
        if df[col].dtype != 'object':
            if df[col][0].dtype!='float64':
                num_col.append(col)
    for std_feature in num_col:
        std_scaler=StandardScaler()
        std_scaler.fit(df[[std_feature]])
        scaled=std_scaler.transform(df[[std_feature]])
        df[f"std_scaled_{std_feature}"]=scaled
        del df[std_feature]

In [23]:
STD(df_sessions)
STD(df_hits)

In [24]:
#Target column engineering
target_list = ['sub_car_claim_click', 'sub_car_claim_submit_click',
'sub_open_dialog_click', 'sub_custom_question_submit_click',
'sub_call_number_click', 'sub_callback_submit_click', 'sub_submit_success',
'sub_car_request_submit_click']
df_hits['event_action'] = df_hits['event_action'].apply(lambda x: 1 if x in target_list else 0)

In [25]:
#######################################SAVING FINAL DATASETS#######################################################

In [26]:
df_hits.merge(df_sessions, on=['session_id']).drop(columns=['session_id']).to_csv('data/final_df.csv',index=False)

In [27]:
##########################################MODELING#################################################################

In [3]:
final_df = pd.read_csv('data/final_df.csv')

In [7]:
X = final_df.drop(['event_action','event_category','hit_type_event','std_scaled_hit_date','std_scaled_hit_number'],axis=1)
Y = final_df['event_action']

In [16]:
final_df

Unnamed: 0,event_category,event_action,hit_type_event,std_scaled_hit_date,std_scaled_hit_number,utm_medium,device_brand,device_browser,geo_country,geo_city,device_category_desktop,device_category_mobile,device_category_tablet,std_scaled_visit_date,std_scaled_visit_time,std_scaled_visit_number,std_scaled_device_screen_resolution
0,quiz,0,1.0,-1.313431,0.222766,cpc,Huawei,Chrome,Russia,Saint Petersburg,0.0,1.0,0.0,-1.311787,-0.120834,-0.062174,-0.441124
1,card_web,0,1.0,-1.313431,-0.400564,cpc,Huawei,Chrome,Russia,Saint Petersburg,0.0,1.0,0.0,-1.311787,-0.120834,-0.062174,-0.441124
2,card_web,0,1.0,-1.313431,-0.192787,cpc,Huawei,Chrome,Russia,Saint Petersburg,0.0,1.0,0.0,-1.311787,-0.120834,-0.062174,-0.441124
3,card_web,0,1.0,-1.313431,-0.123528,cpc,Huawei,Chrome,Russia,Saint Petersburg,0.0,1.0,0.0,-1.311787,-0.120834,-0.062174,-0.441124
4,card_web,0,1.0,-1.313431,-0.539082,cpc,Huawei,Chrome,Russia,Saint Petersburg,0.0,1.0,0.0,-1.311787,-0.120834,-0.062174,-0.441124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14955598,quiz,0,1.0,-1.313431,-0.781489,(none),Apple,Safari,Russia,Moscow,0.0,1.0,0.0,-1.311787,0.476420,-0.062174,-0.400484
14955599,quiz,0,1.0,-1.313431,-0.781489,blogger_channel,Realme,Chrome,Russia,Ulyanovsk,0.0,1.0,0.0,-1.311787,0.719140,-0.062174,-0.441124
14955600,quiz,0,1.0,-1.313431,-0.781489,cpa,Apple,Safari (in-app),Russia,Samara,0.0,1.0,0.0,-1.311787,0.340061,-0.062174,-0.432762
14955601,quiz,0,1.0,-1.313431,-0.712230,cpc,Apple,Edge,Russia,Orenburg,1.0,0.0,0.0,-1.311787,-0.535367,0.020717,0.875773


In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [14]:
lgr=LogisticRegression(solver="liblinear")
rfc=RandomForestClassifier(max_depth=200)
mlp=MLPClassifier()

In [15]:
print(cross_validate(lgr,x_train,y_train,cv=5))
print(cross_validate(rfc,x_train,y_train,cv=5))
print(cross_validate(mlp,x_train,y_train,cv=5))

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1508, in fit
    X, y = self._validate_data(
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 964, in check_X_y
    X 

{'fit_time': array([8.7056601 , 5.6071279 , 4.80013561, 4.70097733, 3.93349981]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([nan, nan, nan, nan, nan])}


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 964, in check_X_y
    X = check

{'fit_time': array([3.98162985, 3.86661363, 3.89680123, 3.63330102, 3.85273218]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([nan, nan, nan, nan, nan])}
{'fit_time': array([4.1075747 , 3.85155272, 3.77211833, 3.95949197, 3.78001904]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([nan, nan, nan, nan, nan])}


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 752, in fit
    return self._fit(X, y, incremental=False)
  File "/home/nikita/anaconda3/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 393, in _fit
    X, y = self._validate_input(X, y, incremental, reset=first_pass)
  File "/home/nikita/anaconda3/lib/python