In [10]:
import pandas as pd
import numpy as np

def preprocess_data(registration_file, lives_file):
    """Build one feature row per registration, enriched with the user's latest previous life.

    Parameters
    ----------
    registration_file : str, path object or file-like
        CSV with the registration data. It must contain ``user_id``,
        ``registration_time_utc`` and every column listed in
        ``reg_columns_to_drop`` below.
    lives_file : str, path object or file-like
        CSV with one row per previous life of a user. It must contain
        ``user_id``, ``registration_date``,
        ``is_rewarded_video_watcher_lifetime``, ``is_payer_lifetime``,
        ``days_active_lifetime`` and every column listed in
        ``lives_columns_to_drop`` below.

    Returns
    -------
    pandas.DataFrame
        The registration rows (minus the dropped columns) left-joined on
        ``user_id`` with the most recent previous life of that user, plus:

        * ``num_previous_lives`` - number of rows the user has in the lives file;
        * ``registration_date`` - registration date of the most recent previous life;
        * ``days_passed_since_last_registration`` - whole days between
          ``registration_time_utc`` and that ``registration_date``.

        Users without any row in the lives file keep missing values in all
        columns coming from the lives file.

    Raises
    ------
    KeyError
        If one of the columns that is dropped or selected is absent from the
        corresponding CSV.
    """
    # Registration columns that are not used as model features.
    reg_columns_to_drop = [
        'registration_platform_specific', 'registration_country', 'registration_store',
        'registration_channel_detailed', 'registration_device_type',
        'registration_device_manufacturer', 'avg_age_top_11_players',
        'registration_season_day', 'tokens_bought', 'avg_stars_top_11_players',
        'number_of_devices_used', 'transaction_count_iap',
        'morale_spent', 'total_match_won_count', 'tokens_spent', 'rests_stash',
    ]
    registration_df = pd.read_csv(registration_file).drop(columns=reg_columns_to_drop)

    # Load the previous-lives data and drop the columns that are not used.
    lives_columns_to_drop = ['registration_channel_detailed', 'registration_country',
                             'registration_store', 'registration_platform_specific']
    lives_df = pd.read_csv(lives_file).drop(columns=lives_columns_to_drop)

    # Encode boolean flags as 0/1 integers.
    for col in lives_df.columns:
        if lives_df[col].dtype == 'bool':
            lives_df[col] = lives_df[col].astype(int)
    lives_df['registration_date'] = pd.to_datetime(lives_df['registration_date'], dayfirst=False)

    # Order every user's lives chronologically. Rows with a missing date are
    # placed first so that an undated life is never treated as the latest one.
    lives_df = lives_df.sort_values(by=['user_id', 'registration_date'], na_position='first')
    lives_df['num_previous_lives'] = lives_df.groupby('user_id').cumcount() + 1

    # Take the last *row* of every user. GroupBy.last() is deliberately not used:
    # it returns the last non-null value of each column independently and can
    # therefore combine values that belong to different lives.
    life_features = ['is_rewarded_video_watcher_lifetime', 'is_payer_lifetime',
                     'days_active_lifetime', 'num_previous_lives', 'registration_date']
    latest_lives = lives_df.groupby('user_id').tail(1)[['user_id'] + life_features]

    # Left join keeps every registration, including users with no previous life.
    expanded_registration_df = registration_df.merge(latest_lives, on='user_id', how='left')

    # Make sure both timestamps are datetimes before subtracting them.
    expanded_registration_df['registration_time_utc'] = pd.to_datetime(
        expanded_registration_df['registration_time_utc']
    )
    expanded_registration_df['registration_date'] = pd.to_datetime(
        expanded_registration_df['registration_date']
    )

    # Whole days elapsed between the previous registration and the current one.
    expanded_registration_df['days_passed_since_last_registration'] = (
        expanded_registration_df['registration_time_utc'] - expanded_registration_df['registration_date']
    ).dt.days

    return expanded_registration_df


In [12]:
registration_file = "data/registration_data_training.csv"
lives_file = "data/previous_lives_training_data.csv"

# Run the preprocessing function on the training CSVs
training_df = preprocess_data(registration_file, lives_file)

# Show the first rows of the result
training_df.head()


Unnamed: 0,user_id,registration_time_utc,session_count,playtime,total_match_played_count,total_match_watched_count,transaction_count_rewarded_video,tokens_stash,rests_spent,treatments_spent,money_stash,avg_stars_top_3_players,training_count,days_active_first_28_days_after_registration,is_rewarded_video_watcher_lifetime,is_payer_lifetime,days_active_lifetime,num_previous_lives,registration_date,days_passed_since_last_registration
0,5,2024-05-25 01:26:48,1,141062,2,0,0,114,0,0,1066076772,5.451511,0,0,1,0,18,2,2023-05-07,384
1,7,2024-06-14 20:09:31,1,521754,0,0,0,157,0,0,1224446429,5.303822,1,0,0,0,1,3,2023-05-28,383
2,13,2024-05-31 09:06:51,2,1703183,0,0,0,232,0,0,1221792142,5.746889,3,0,0,0,1,2,2023-08-17,288
3,23,2024-05-31 04:00:33,4,3181700,3,1,0,8,39,12,928112709,6.692578,7,1,1,0,2,1,2023-08-30,275
4,25,2024-06-15 19:56:05,5,5866921,1,1,0,70,80,0,326553498,5.775289,6,3,0,0,1,2,2023-08-15,305


In [13]:
# Identifier and raw date/time columns are removed before the frame is used for training.
columns_to_drop = ['user_id', 'registration_time_utc', 'registration_date']
# errors='ignore' keeps this cell idempotent: it overwrites training_df, so a
# second run would otherwise raise KeyError because the columns are already gone.
training_df = training_df.drop(columns=columns_to_drop, errors='ignore')

In [15]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Name of the label column in training_df.
target = 'days_active_first_28_days_after_registration'

# Features are every remaining column of training_df except the label.
X = training_df.drop(columns=[target]) 
y = training_df[target]


# Gradient-boosted tree regressor with a fixed seed.
# NOTE(review): no eval_set is passed to fit() below, so eval_metric="mae"
# is presumably never reported during training - confirm.
xgb_model = XGBRegressor(
    tree_method="hist",
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    eval_metric="mae",
    reg_alpha=0.1,
    random_state=42
)


# Fit on the whole training frame; this cell makes no train/validation split.
xgb_model.fit(X, y)

In [18]:
registration_test_file = "data/registration_data_test.csv"
lives_test_file = "data/previous_lives_test_data.csv"

# Apply the same preprocessing that was used for the training data.
test_df = preprocess_data(registration_test_file, lives_test_file)

# Keep the ids so that predictions can be matched back to users.
user_ids = test_df['user_id']

# Select exactly the columns the model was trained on, in the same order.
# This fails with a clear KeyError if a training feature is missing from the
# test data, and ignores any extra column the test file may carry.
X_test = test_df[X.columns]

y_test_pred = xgb_model.predict(X_test)

# The target counts active days within a 28-day window, so keep predictions in [0, 28].
y_test_pred = np.clip(y_test_pred, 0, 28)

submission_df = pd.DataFrame({
    'user_id': user_ids,
    'predicted_days_active_first_28_days_after_registration': y_test_pred
})

submission_file = "days_active_first_28_days_after_registration_predictions.csv"
submission_df.to_csv(submission_file, index=False)

print(f"Predictions saved to {submission_file}")


Predictions saved to days_active_first_28_days_after_registration_predictions.csv
