# Import libraries

In [1]:
!pip install pandas==1.3.5

Collecting pandas==1.3.5
  Downloading pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.0
    Uninstalling pandas-2.2.0:
      Successfully uninstalled pandas-2.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
arviz 0.17.0 requires pandas>=1.4.0, but you have pandas 1.3.5 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.1.2 which is

In [2]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
import os
import random

In [3]:
SEED = 21

random.seed(SEED)
np.random.seed(SEED)

# Get dataset

In [4]:
X = pd.read_parquet("/kaggle/input/purple-hack/train_ai_comp_final_dp.parquet")
X.drop(columns = ["sample_ml_new", "id"], inplace = True)

In [5]:
X.sample(10)

Unnamed: 0,target,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
226755,0,1662,380,72,104442,191,36,0,0,21,...,61757,88187,139192,0,33306,41242,37685,61757,88187,139192
14878,0,1761,1759,120,100713,191,121,0,0,0,...,74632,115771,165119,41107,41706,43733,59681,74632,115771,165119
642602,0,1666,13,114,103814,107,59,0,0,0,...,0,0,0,0,0,0,0,0,0,0
444900,0,1761,1759,22,94004,191,37,125,290,298,...,836,47914,70080,0,569,0,737,836,47914,70080
20748,0,1761,1759,141,8429,3,3,125,290,298,...,0,0,0,0,0,0,0,0,0,0
9281,0,1635,1759,104,104733,191,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
580721,0,1761,1759,92,105312,191,23,0,69,40,...,0,0,0,0,0,0,0,0,0,0
1076,0,1632,1189,83,85184,191,7,125,290,298,...,0,0,0,0,0,0,0,0,0,0
572773,0,1657,892,100,105587,11,4,0,0,0,...,0,0,28854,0,0,0,0,0,0,28854
206852,0,907,45,14,78061,191,37,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Drop high corr columns

In [6]:
X = X.drop(['feature756'], axis=1)

In [7]:
X['target'].value_counts()

0    501078
1     18537
Name: target, dtype: int64

In [8]:
n = 18537

In [9]:
X_0 = X[X['target'] == 0].sample(n)
X_1 = X[X['target'] == 1].sample(n)

In [10]:
X = pd.concat([X_0, X_1])
y = X['target']
X = X.drop(['target'], axis=1)

In [11]:
def reduce_memory_usage(df):   
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                    
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [12]:
X = reduce_memory_usage(X)

Memory usage of dataframe is 304.34893798828125 MB
Memory usage of dataframe after reduction 71.38481712341309 MB
Reduced by 76.54507434944237 % 


In [13]:
def remove_collinear_features(x, threshold=0.8):

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                #print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)
    print('Removed Columns {}'.format(drops))
    return x

In [14]:
# X = remove_collinear_features(X)
collinear_features = ['feature772', 'feature774', 'feature81', 'feature136', 'feature632', 'feature945', 'feature515', 'feature191', 'feature509', 'feature424', 'feature141', 'feature754', 'feature847', 'feature843', 'feature736', 'feature419', 'feature388', 'feature798', 'feature828', 'feature887', 'feature1055', 'feature466', 'feature882', 'feature548', 'feature1075', 'feature248', 'feature768', 'feature393', 'feature778', 'feature57', 'feature591', 'feature34', 'feature914', 'feature284', 'feature776', 'feature684', 'feature456', 'feature641', 'feature748', 'feature559', 'feature858', 'feature873', 'feature871', 'feature439', 'feature335', 'feature11', 'feature476', 'feature764', 'feature766', 'feature518', 'feature1052', 'feature869', 'feature446', 'feature1001', 'feature360', 'feature724', 'feature729', 'feature408', 'feature854', 'feature850', 'feature872', 'feature579', 'feature557', 'feature111', 'feature246', 'feature458', 'feature569', 'feature730', 'feature260', 'feature953', 'feature493', 'feature990', 'feature603', 'feature512', 'feature1066', 'feature416', 'feature348', 'feature483', 'feature925', 'feature597', 'feature931', 'feature198', 'feature230', 'feature220', 'feature537', 'feature209', 'feature951', 'feature987', 'feature824', 'feature556', 'feature56', 'feature863', 'feature465', 'feature654', 'feature923', 'feature547', 'feature949', 'feature673', 'feature899', 'feature598', 'feature482', 'feature595', 'feature377', 'feature901', 'feature206', 'feature396', 'feature262', 'feature261', 'feature410', 'feature602', 'feature514', 'feature129', 'feature874', 'feature779', 'feature1039', 'feature490', 'feature185', 'feature580', 'feature877', 'feature239', 'feature725', 'feature133', 'feature633', 'feature40', 'feature251', 'feature477', 'feature131', 'feature541', 'feature50', 'feature297', 'feature436', 'feature640', 'feature789', 'feature1046', 'feature517', 'feature181', 'feature520', 'feature245', 'feature944', 'feature777', 'feature219', 'feature995', 'feature221', 'feature650', 'feature746', 'feature782', 'feature826', 'feature492', 'feature235', 'feature734', 'feature690', 'feature13', 'feature392', 'feature657', 'feature1040', 'feature90', 'feature823', 'feature695', 'feature240', 'feature681', 'feature344', 'feature411', 'feature8', 'feature126', 'feature479', 'feature573', 'feature228', 'feature375', 'feature836', 'feature368', 'feature218', 'feature207', 'feature745', 'feature238', 'feature382', 'feature621', 'feature444', 'feature833', 'feature728', 'feature501', 'feature463', 'feature234', 'feature457', 'feature827', 'feature593', 'feature1061', 'feature940', 'feature865', 'feature992', 'feature425', 'feature242', 'feature629', 'feature741', 'feature315', 'feature494', 'feature503', 'feature132', 'feature159', 'feature864', 'feature524', 'feature283', 'feature272', 'feature370', 'feature254', 'feature743', 'feature891', 'feature227', 'feature630', 'feature53', 'feature480', 'feature1068', 'feature933', 'feature222', 'feature608', 'feature135', 'feature77', 'feature549', 'feature639', 'feature59', 'feature171', 'feature420', 'feature417', 'feature911', 'feature210', 'feature763', 'feature905', 'feature365', 'feature447', 'feature533', 'feature742', 'feature656', 'feature822', 'feature841', 'feature991', 'feature467', 'feature381', 'feature428', 'feature17', 'feature943', 'feature664', 'feature383', 'feature62', 'feature430', 'feature821', 'feature250', 'feature767', 'feature412', 'feature147', 'feature213', 'feature1060', 'feature812', 'feature32', 'feature758', 'feature1054', 'feature374', 'feature182', 'feature362', 'feature627', 'feature867', 'feature997', 'feature14', 'feature600', 'feature890', 'feature946', 'feature394', 'feature373', 'feature856', 'feature804', 'feature739', 'feature896', 'feature848', 'feature453', 'feature1058', 'feature226', 'feature585', 'feature576', 'feature266', 'feature378', 'feature442', 'feature907', 'feature508', 'feature403', 'feature727', 'feature738', 'feature525', 'feature441', 'feature399', 'feature947', 'feature271', 'feature421', 'feature560', 'feature1072', 'feature912', 'feature803', 'feature249', 'feature589', 'feature929', 'feature913', 'feature994', 'feature916', 'feature838', 'feature866', 'feature722', 'feature432', 'feature562', 'feature244', 'feature319', 'feature422', 'feature626', 'feature851', 'feature676', 'feature1074', 'feature327', 'feature775', 'feature223', 'feature134', 'feature429', 'feature473', 'feature834', 'feature189', 'feature1069', 'feature555', 'feature846', 'feature879', 'feature952', 'feature596', 'feature646', 'feature526', 'feature604', 'feature904', 'feature998', 'feature142', 'feature409', 'feature437', 'feature714', 'feature9', 'feature586', 'feature786', 'feature1059', 'feature835', 'feature870', 'feature703', 'feature404', 'feature1000', 'feature915', 'feature256', 'feature426', 'feature380', 'feature282', 'feature647', 'feature372', 'feature584', 'feature371', 'feature611', 'feature42', 'feature498', 'feature607', 'feature655', 'feature232', 'feature643', 'feature1062', 'feature521', 'feature257', 'feature810', 'feature255', 'feature902', 'feature415', 'feature119', 'feature691', 'feature723', 'feature1004', 'feature487', 'feature1053', 'feature908', 'feature317', 'feature433', 'feature634', 'feature921', 'feature51', 'feature996', 'feature737', 'feature1048', 'feature693', 'feature635', 'feature780', 'feature601', 'feature993', 'feature769', 'feature88', 'feature909', 'feature450', 'feature590', 'feature1067', 'feature615', 'feature860', 'feature423', 'feature267', 'feature389', 'feature47', 'feature623', 'feature1038', 'feature413', 'feature613', 'feature614', 'feature390', 'feature752', 'feature236', 'feature839', 'feature731', 'feature527', 'feature265', 'feature397', 'feature543', 'feature757', 'feature468', 'feature895', 'feature853', 'feature948', 'feature268', 'feature63', 'feature489', 'feature609', 'feature510', 'feature791', 'feature402', 'feature30', 'feature451', 'feature500', 'feature326', 'feature454', 'feature868', 'feature903', 'feature247', 'feature10', 'feature455', 'feature519', 'feature552', 'feature934', 'feature208', 'feature201', 'feature89', 'feature459', 'feature825', 'feature418', 'feature954', 'feature837', 'feature551', 'feature783', 'feature638', 'feature484', 'feature49', 'feature781', 'feature926', 'feature264', 'feature263', 'feature499', 'feature529', 'feature1051', 'feature1076', 'feature859', 'feature696', 'feature29', 'feature376', 'feature857', 'feature906', 'feature594', 'feature312', 'feature942', 'feature999', 'feature845', 'feature844', 'feature379', 'feature881', 'feature190', 'feature592', 'feature814', 'feature16', 'feature290', 'feature414', 'feature464', 'feature770', 'feature1041', 'feature211', 'feature831', 'feature747', 'feature528', 'feature564', 'feature241', 'feature438', 'feature618', 'feature628', 'feature502', 'feature784', 'feature832', 'feature624', 'feature1073', 'feature258', 'feature443', 'feature462', 'feature252', 'feature472', 'feature398', 'feature582', 'feature1071', 'feature733', 'feature460', 'feature577', 'feature474', 'feature1045', 'feature1003', 'feature440', 'feature449', 'feature39', 'feature323', 'feature740', 'feature237', 'feature1047', 'feature21', 'feature243', 'feature900', 'feature54', 'feature364', 'feature612', 'feature523', 'feature178', 'feature351', 'feature581', 'feature130', 'feature15', 'feature536', 'feature679', 'feature270', 'feature605', 'feature666', 'feature692', 'feature259', 'feature950', 'feature55', 'feature313', 'feature1070', 'feature636', 'feature1002', 'feature516', 'feature522', 'feature361', 'feature314', 'feature253', 'feature180', 'feature645', 'feature883', 'feature583', 'feature558', 'feature610', 'feature702', 'feature760', 'feature20', 'feature888', 'feature193', 'feature212', 'feature486', 'feature188', 'feature269', 'feature718', 'feature363', 'feature606', 'feature505', 'feature485', 'feature771', 'feature930', 'feature471', 'feature1044', 'feature875', 'feature927', 'feature233', 'feature773', 'feature852', 'feature461', 'feature347', 'feature849', 'feature445', 'feature637', 'feature306', 'feature797', 'feature550', 'feature648', 'feature840', 'feature481']
X = X.drop(collinear_features, axis=1)

# Remove nans

In [15]:
for col in X:
    mode = X[col].mean()
    X[col] = X[col].fillna(mode)

# Using ridge

In [16]:
from sklearn.linear_model import Ridge

In [17]:
ridge = Ridge().fit(X, y)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [18]:
to_drop = []
for i, column_name in enumerate(ridge.feature_names_in_):
    if abs(np.round(ridge.coef_[i], 2)) < 0.001:
        to_drop.append(column_name)

In [19]:
len(to_drop), X.shape[1]

(394, 488)

In [20]:
test = pd.read_parquet("/kaggle/input/test-part-it-purple-sber/test_sber.parquet",)
test = test.drop(collinear_features+['sample_ml_new', 'feature756'], axis=1).set_index('id')

In [21]:
X = X.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)

# Training model

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

In [65]:
from sklearn.linear_model import LogisticRegression

In [71]:
model = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.2)
model.fit(X_train, y_train)



In [72]:
pred = model.predict_proba(X_test)

pred = pred[:, 1]
pred_binary = (pred >= 0.5)

In [75]:
print("F1_SCORE", f1_score(y_test, pred_binary))
print("PRECISION", precision_score(y_test, pred_binary))
print("RECALL", recall_score(y_test, pred_binary))
print("ROC_AUC", roc_auc_score(y_test, pred))

F1_SCORE 0.5910046237915091
PRECISION 0.614868804664723
RECALL 0.5689236579444295
ROC_AUC 0.6575284040892926


In [76]:
pred = model.predict_proba(test)

pred = pred[:, 1]
pred_binary = (pred >= 0.5)

In [78]:
subm = pd.read_csv('/kaggle/input/purple/sample_submission.csv', index_col='id')

In [79]:
subm['target_bin'] = pred_binary.astype(np.int8)
subm['target_prob'] = pred

In [80]:
subm.to_csv("submission_log_regr.csv")