## Load and Downcast

In [1]:
import os
from data_loading import load_data, downcast
import time

# Development switch:
#   True  -> work on a downsampled subset and set aside validation data
#   False -> use all data for the final model
developing = True

# Upper bound on the number of rows kept while developing
dev_sample_size = 100000

# start the timer (later cells report elapsed time relative to this point)
start = time.time()

workspace_path = os.getcwd()
data_path = os.path.join(
    workspace_path, 
    '2a161f8e_1679936280892_sc', 
    'sharechat_recsys2023_data', 
    'train')

data = load_data(data_path, verbose=True) 
# Takes about 1 minute to load

# Downcast the data to reduce memory usage.
# The return value is not used, so this relies on downcast() modifying `data`
# in place -- TODO confirm against data_loading.downcast
downcast(data,verbose=True)

# Reduce the size of the data for faster iteration during development.
# The request is capped at the number of available rows because
# DataFrame.sample (without replacement) raises a ValueError when asked for
# more rows than the frame contains.
if developing:
    data = data.sample(min(dev_sample_size, len(data)), random_state=6)

# sort according to timestamp in column 'f_1', ascending order
data = data.sort_values(by=['f_1'])

# 'is_installed' is the target
y = data['is_installed']

# Drop 'is_installed' and 'is_clicked' from the features
X = data.drop(['is_installed', 'is_clicked'], axis=1)

# Takes about 1:40 minutes to run
print(f"Time taken: {(time.time() - start)/60:.1f} minutes")

Loading data from file: 000000000023.csv
Loading data from file: 000000000022.csv
Loading data from file: 000000000008.csv
Loading data from file: 000000000020.csv
Loading data from file: 000000000021.csv
Loading data from file: 000000000009.csv
Loading data from file: 000000000025.csv
Loading data from file: 000000000019.csv
Loading data from file: 000000000018.csv
Loading data from file: 000000000024.csv
Loading data from file: 000000000026.csv
Loading data from file: 000000000027.csv
Loading data from file: 000000000016.csv
Loading data from file: 000000000002.csv
Loading data from file: 000000000003.csv
Loading data from file: 000000000017.csv
Loading data from file: 000000000029.csv
Loading data from file: 000000000001.csv
Loading data from file: 000000000015.csv
Loading data from file: 000000000014.csv
Loading data from file: 000000000000.csv
Loading data from file: 000000000028.csv
Loading data from file: 000000000004.csv
Loading data from file: 000000000010.csv
Loading data fro

## Training and Validation split

In [2]:
if not developing:
    # Final-model mode: train on every available row, no hold-out set
    X_train, y_train = X, y
else:
    # Development mode: chronological train/validation split.
    # 'f_1' holds the day of data collection; the last 3 days become the
    # validation set so that no future information leaks into training.
    max_day = X['f_1'].max()
    validation_day_start = max_day - 3

    # Row masks, each evaluated once and shared by features and target
    in_validation = X['f_1'] > validation_day_start
    in_train = X['f_1'] <= validation_day_start

    X_val, y_val = X[in_validation], y[in_validation]
    X_train, y_train = X[in_train], y[in_train]

    print(f"Train data samples: {len(X_train)}")
    print(f"Validation data samples: {len(X_val)}")

Train data samples: 88960
Validation data samples: 11040


## Preprocessing and Training Pipeline

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
from flaml.default import LGBMClassifier # FLAML's default LGBMClassifier, comes with hyperparameter tuning
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from category_encoders import CatBoostEncoder
from sklearn.feature_selection import RFECV


# PARAMETERS
cardinality_cutoff = 100 # HYPERPARAMETER. high -> CatBoostEncoder, low -> OneHotEncoder.
fill_value = -114 # Arbitrary value that doesn't exist in the categorical features


# Define preprocessing categories (columns are named 'f_<n>')
binary_features = [f"f_{i}" for i in range(33, 42)]      # f_33 .. f_41
numeric_features = [f"f_{i}" for i in range(42, 80)]     # f_42 .. f_79
categorical_features = [f"f_{i}" for i in range(1, 33)]  # f_1 .. f_32

# Split categorical features by cardinality, measured on the training data
n_unique_values = X_train[categorical_features].nunique()
is_low_cardinality = n_unique_values < cardinality_cutoff
# low cardinality features will be one-hot encoded
low_cardinality_cat_features = n_unique_values[is_low_cardinality].index.tolist()
# high cardinality features will be encoded with CatBoostEncoder, a target encoding method.
# Both lists are selected from the same Series so they keep the column order of
# `categorical_features`; building this list from a set difference would give an
# order that can change between kernel sessions and so change the column layout
# fed to the model.
high_cardinality_cat_features = n_unique_values[~is_low_cardinality].index.tolist()

print(f"Low cardinality categorical features: {low_cardinality_cat_features}")
print(f"High cardinality categorical features: {high_cardinality_cat_features}")



# For high cardinality features: fill missing values with the sentinel, then target-encode.
# NOTE(review): CatBoostEncoder is created without `cols`; per the category_encoders
# docs that means only string columns are encoded. The imputer hands it a plain
# array here, so please verify these columns really are being encoded.
high_cardinality_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value= fill_value)),
        ('catboost_enc', CatBoostEncoder(handle_missing='value')),
    ],
    verbose=True)

# For low cardinality features: fill missing values with the sentinel, then one-hot encode.
# Categories not seen during fit are ignored at transform time (handle_unknown='ignore').
low_cardinality_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value= fill_value)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))],
    verbose=True)

# For numeric features: median imputation followed by robust scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())], # important, because our data mainly consists of outliers
    verbose=True)

# For binary features: fill missing values with the most frequent value
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))],
    verbose=True)

# Combine preprocessing steps; each transformer only receives its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('high_card', high_cardinality_transformer, high_cardinality_cat_features),
        ('low_card', low_cardinality_transformer, low_cardinality_cat_features),
        ('bin', binary_transformer, binary_features)
    ],
    verbose=True)


# Instantiate FLAML's LGBMClassifier (binary objective, fixed random seed)
lgbm = LGBMClassifier(metric='f1', n_jobs=-1, random_state=6, verbose=2, objective='binary')


# Bundle resampling, preprocessing, variance filter and model in one pipeline.
# imblearn's Pipeline is used so that the undersampler can be a pipeline step.
pipeline = imbPipeline(steps=[
    ('undersampler', RandomUnderSampler(random_state=6)),
    ('preprocessor', preprocessor),
    ('variance_threshold', VarianceThreshold(threshold=(0.01))),
    ('model', lgbm),
], verbose=True)

# Display pipeline
pipeline

Low cardinality categorical features: ['f_1', 'f_3', 'f_5', 'f_7', 'f_8', 'f_9', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_16', 'f_17', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_27', 'f_28', 'f_29', 'f_30', 'f_31', 'f_32']
High cardinality categorical features: ['f_6', 'f_4', 'f_2', 'f_15', 'f_18']


In [4]:
# Train the complete pipeline on the training split
pipeline.fit(X_train, y_train)

# A validation split is only created in development mode
if developing:
    # Hard class predictions for the held-out days
    y_val_pred = pipeline.predict(X_val)

    # Per-class precision / recall / F1 summary
    validation_report = classification_report(y_val, y_val_pred)
    print(validation_report)

# Elapsed time since the timer was started in the first cell
print(f"Time taken: {(time.time() - start)/60:.1f} minutes")

[Pipeline] ...... (step 1 of 4) Processing undersampler, total=   0.3s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.2s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[ColumnTransformer] ........... (1 of 4) Processing num, total=   0.2s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ...... (step 2 of 2) Processing catboost_enc, total=   0.0s
[ColumnTransformer] ..... (2 of 4) Processing high_card, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.1s
[ColumnTransformer] ...... (3 of 4) Processing low_card, total=   0.1s
[Pipeline] ........... (step 1 of 1) Processing imputer, total=   0.0s
[ColumnTransformer] ........... (4 of 4) Processing bin, total=   0.0s
[Pipeline] ...... (step 2 of 4) Processing preprocessor, total=   0.5s
[Pipeline]  (step 3 of 4) Processing variance_threshold, total=   0.0s
[Light

## Predict on the test data

In [5]:
import os
from data_loading import load_data, downcast
import pandas as pd

# Test predictions are only produced for the final model (not in development mode)
if not developing:
    # Locate and load the test data
    workspace_path = os.getcwd()
    test_data_path = os.path.join(
        workspace_path, 
        '2a161f8e_1679936280892_sc', 
        'sharechat_recsys2023_data', 
        'test')
    test_data = load_data(test_data_path, verbose=True) 

    # Downcast the data to reduce memory usage
    downcast(test_data)

    # Predicted probabilities; keep the second column of predict_proba's output
    class_probabilities = pipeline.predict_proba(test_data)
    predictions = class_probabilities[:,1]
    print(predictions)

    # Assemble the submission table: row identifier from 'f_0' plus the prediction
    submission_columns = {
        "RowId": test_data['f_0'],
        "is_installed": predictions
    }
    submission_df = pd.DataFrame(submission_columns)

    # Write the table as a tab-separated file without the index
    submission_df.to_csv("submission.csv", index=False, sep="\t")
    print("Submission saved!")

    # Elapsed time since the timer was started in the first cell
    print(f"Time taken: {(time.time() - start)/60:.1f} minutes")
