# Rec Sys Project - Fitting Model on Kaggle

## Checking loaded features data on Kaggle

In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/250117-features-csv/250117features.csv


In [3]:
import os, psutil  
import numpy as np
import pandas as pd

def cpu_stats():
    pid = os.getpid()
    py = psutil.Process(pid)
    memory_use = py.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(memory_use, 2))

In [4]:
cpu_stats()

'memory GB:0.11'

In [9]:
#df = pd.read_csv('/kaggle/input/250117-features-csv/250117features.csv')

## Loading features data from PostgreSQL

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import gc
import psutil

In [2]:
def batch_load_sql(query: str) -> pd.DataFrame:
    CHUNKSIZE = 200000
    engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )
    conn = engine.connect().execution_options(stream_results=True)
    chunks = []
    for chunk_dataframe in pd.read_sql(query, conn, chunksize=CHUNKSIZE):
        chunks.append(chunk_dataframe)
    conn.close()
    return pd.concat(chunks, ignore_index=True)

In [3]:
def load_features() -> pd.DataFrame:
    QUERY = 'SELECT * FROM "nktn_lx_step8_features"'
    features_df = batch_load_sql(QUERY)
    return features_df

In [4]:
df = load_features()

In [5]:
# Define a dictionary to map columns to their optimized data types
dtype_mapping = {
    'user_id': 'int32',        # int32 is sufficient for user IDs
    'post_id': 'int16',        # int32 is sufficient for post IDs
    'target': 'int8',          # target is binary or small range, so int8 is enough
    'month': 'int8',           # month ranges from 1 to 12
    'day': 'int8',             # day ranges from 1 to 31
    'day_of_week': 'int8',     # day_of_week ranges from 0 to 6
    'hour_of_day': 'int8',     # hour_of_day ranges from 0 to 23
    'gender': 'object',        # gender is binary or small range
    'country': 'object',       # country is categorical
    'city': 'object',          # city is categorical
    'exp_group': 'object',     # exp_group is likely a small range
    'os': 'object',            # os is categorical
    'source': 'object',        # source is categorical
    'age_category': 'int8',    # age_category is likely a small range
    'topic': 'object',         # topic is categorical
    'PCA_1': 'float16',        # float16 is sufficient for PCA components
    'PCA_2': 'float16',        # float16 is sufficient for PCA components
    'PCA_3': 'float16',        # float16 is sufficient for PCA components
    'PCA_4': 'float16',        # float16 is sufficient for PCA components
    'PCA_5': 'float16'         # float16 is sufficient for PCA components
}

# Convert columns to optimized data types
df = df.astype(dtype_mapping)

# Check memory usage after optimization
print(df.info(memory_usage='deep'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888776 entries, 0 to 4888775
Data columns (total 20 columns):
 #   Column        Dtype  
---  ------        -----  
 0   user_id       int32  
 1   post_id       int16  
 2   target        int8   
 3   month         int8   
 4   day           int8   
 5   day_of_week   int8   
 6   hour_of_day   int8   
 7   gender        object 
 8   country       object 
 9   city          object 
 10  exp_group     object 
 11  os            object 
 12  source        object 
 13  age_category  int8   
 14  topic         object 
 15  PCA_1         float16
 16  PCA_2         float16
 17  PCA_3         float16
 18  PCA_4         float16
 19  PCA_5         float16
dtypes: float16(5), int16(1), int32(1), int8(6), object(7)
memory usage: 1.9 GB
None


In [6]:
df

Unnamed: 0,user_id,post_id,target,month,day,day_of_week,hour_of_day,gender,country,city,exp_group,os,source,age_category,topic,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5
0,200,6264,1,10,2,5,14,1,Russia,Degtyarsk,3,Android,ads,45,movie,-0.224609,0.157959,0.023315,0.306396,0.005329
1,200,4200,1,12,29,2,14,1,Russia,Degtyarsk,3,Android,ads,45,covid,0.854004,0.125610,0.226196,0.000183,0.050446
2,200,3567,1,12,29,2,14,1,Russia,Degtyarsk,3,Android,ads,45,covid,0.244629,-0.007030,-0.248291,0.002506,0.010040
3,200,3539,1,12,29,2,15,1,Russia,Degtyarsk,3,Android,ads,45,covid,0.259521,0.005848,0.072815,0.002659,-0.030136
4,200,994,1,12,29,2,15,1,Russia,Degtyarsk,3,Android,ads,45,politics,-0.112061,-0.312744,0.031342,-0.073486,0.387695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4888771,168552,2829,0,10,14,3,11,1,Russia,Ivanteyevka,4,Android,organic,18,covid,0.078918,-0.070984,-0.107605,-0.007092,-0.038239
4888772,168552,3205,0,12,20,0,18,1,Russia,Ivanteyevka,4,Android,organic,18,covid,0.342285,0.052765,0.101257,-0.011940,-0.025452
4888773,168552,4428,0,10,14,3,11,1,Russia,Ivanteyevka,4,Android,organic,18,movie,-0.087585,0.000636,-0.005936,-0.000200,-0.067505
4888774,168552,1229,0,12,7,1,18,1,Russia,Ivanteyevka,4,Android,organic,18,politics,-0.100342,-0.254639,0.014259,-0.039886,0.028564


In [7]:
df.user_id.nunique()

163205

In [8]:
df.post_id.nunique()

6831

## Preparing data to fit a model

In [9]:
X = df.drop(['target', 'user_id', 'post_id'], axis=1)
y = df.target

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42, 
                                                    test_size=0.2)

In [11]:
cat_cols = X.select_dtypes(include='object').columns.to_list()
cat_cols

['gender', 'country', 'city', 'exp_group', 'os', 'source', 'topic']

In [12]:
# Check memory usage
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")

del df

# Force garbage collection
gc.collect()

# Check memory usage again
print(f"Memory usage after gc: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")

Memory usage: 2992.43 MB
Memory usage after gc: 2992.43 MB


## Fitting a CatBoostClassifier Model

In [13]:
from catboost import CatBoostClassifier


catboost = CatBoostClassifier(
    eval_metric='AUC',
    random_seed=42,
    auto_class_weights="Balanced"
)

catboost.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_test, y_test),  # Use your test set as a validation set for now
    early_stopping_rounds=50,
    verbose=10
)

#catboost = CatBoostClassifier()
#catboost.fit(X_train, y_train, cat_features=cat_cols, verbose=100)

Learning rate set to 0.244458
0:	test: 0.5981027	best: 0.5981027 (0)	total: 3.06s	remaining: 51m
10:	test: 0.6533218	best: 0.6533218 (10)	total: 31.2s	remaining: 46m 47s
20:	test: 0.6737594	best: 0.6737594 (20)	total: 57.7s	remaining: 44m 48s
30:	test: 0.6831308	best: 0.6831308 (30)	total: 1m 22s	remaining: 42m 55s
40:	test: 0.6880381	best: 0.6880381 (40)	total: 1m 46s	remaining: 41m 35s
50:	test: 0.6915146	best: 0.6915146 (50)	total: 2m 12s	remaining: 41m 6s
60:	test: 0.6946767	best: 0.6946767 (60)	total: 2m 36s	remaining: 40m 4s
70:	test: 0.6985352	best: 0.6985352 (70)	total: 3m 1s	remaining: 39m 28s
80:	test: 0.7007365	best: 0.7007365 (80)	total: 3m 25s	remaining: 38m 56s
90:	test: 0.7033788	best: 0.7033788 (90)	total: 3m 51s	remaining: 38m 32s
100:	test: 0.7050320	best: 0.7050320 (100)	total: 4m 16s	remaining: 38m 4s
110:	test: 0.7065556	best: 0.7065556 (110)	total: 4m 41s	remaining: 37m 32s
120:	test: 0.7098295	best: 0.7098295 (120)	total: 5m 7s	remaining: 37m 11s
130:	test: 0.710

<catboost.core.CatBoostClassifier at 0x7b8772b08f70>

In [15]:
# Predict class labels
y_pred = catboost.predict(X_test)

# Predict probabilities
y_pred_proba = catboost.predict_proba(X_test)
y_pred_proba_positive = y_pred_proba[:, 1]  # Probability of class 1

## Evaluating the model performance

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba_positive)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.6874158788082099
Precision: 0.6665088627751705
Recall: 0.7159776742934956
F1 Score: 0.6903582098927407
ROC-AUC Score: 0.7596084530283467

Confusion Matrix:
 [[331417 170475]
 [135156 340708]]


In [31]:
feature_importance = catboost.get_feature_importance()
feature_names = catboost.feature_names_

feature_name = []
feature_imp = []

for n, i in zip(feature_names, feature_importance):
    #print(n, round(i, 2), sep=': ')
    feature_name.append(n)
    feature_imp.append(i)

fi_df = pd.DataFrame({'feature_name': feature_name,
                      'feature_importance': feature_imp})

In [32]:
print('FEATURE IMPORTANCE report')
fi_df.sort_values('feature_importance', ascending=False)

FEATURE IMPORTANCE report


Unnamed: 0,feature_name,feature_importance
1,day,13.878031
10,age_category,13.110757
0,month,11.149579
6,city,9.022607
3,hour_of_day,7.790263
16,PCA_5,7.027344
7,exp_group,6.91509
12,PCA_1,6.695463
13,PCA_2,6.474888
15,PCA_4,6.231149


## Saving and loading the model

In [34]:
catboost.save_model('catboost_model.cbm')

In [35]:
loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_model.cbm')

<catboost.core.CatBoostClassifier at 0x7b872847b0d0>

In [36]:
# Predict class labels
y_pred = loaded_model.predict(X_test)

# Predict probabilities
y_pred_proba = loaded_model.predict_proba(X_test)
y_pred_proba_positive = y_pred_proba[:, 1]  # Probability of class 1

In [37]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba_positive)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.6874158788082099
Precision: 0.6665088627751705
Recall: 0.7159776742934956
F1 Score: 0.6903582098927407
ROC-AUC Score: 0.7596084530283467

Confusion Matrix:
 [[331417 170475]
 [135156 340708]]


In [38]:
feature_importance = catboost.get_feature_importance()
feature_names = catboost.feature_names_

feature_name = []
feature_imp = []

for n, i in zip(feature_names, feature_importance):
    #print(n, round(i, 2), sep=': ')
    feature_name.append(n)
    feature_imp.append(i)

fi_df = pd.DataFrame({'feature_name': feature_name,
                      'feature_importance': feature_imp})

In [39]:
print('FEATURE IMPORTANCE report')
fi_df.sort_values('feature_importance', ascending=False)

FEATURE IMPORTANCE report


Unnamed: 0,feature_name,feature_importance
1,day,13.878031
10,age_category,13.110757
0,month,11.149579
6,city,9.022607
3,hour_of_day,7.790263
16,PCA_5,7.027344
7,exp_group,6.91509
12,PCA_1,6.695463
13,PCA_2,6.474888
15,PCA_4,6.231149


## Misc

In [14]:
# from sklearn.model_selection import RandomizedSearchCV
# from catboost import CatBoostClassifier


# # Define parameter grid
# param_grid = {
#     'iterations': [500, 1000, 1500],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5],
#     'border_count': [32, 64, 128],
#     'random_strength': [0.5, 1, 2],
#     'bagging_temperature': [0, 0.5, 1],
#     'early_stopping_rounds': [50, 100]
# }

# # Initialize CatBoostClassifier
# catboost = CatBoostClassifier(verbose=100, random_state=42)

# # Perform RandomizedSearchCV
# random_search = RandomizedSearchCV(
#     estimator=catboost,
#     param_distributions=param_grid,
#     n_iter=20,
#     scoring='roc_auc',
#     cv=3,
#     n_jobs=-1,
#     random_state=42
# )

# # Fit the model
# random_search.fit(X_train, y_train, cat_features=cat_cols)

# # Best model
# best_model = random_search.best_estimator_

In [None]:
# # Evaluate on test data
# y_pred = best_model.predict(X_test)
# y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# # Metrics
# print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
# print(classification_report(y_test, y_pred))