In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
import string
from category_encoders import TargetEncoder,LeaveOneOutEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction import FeatureHasher

# Get Data

In [2]:
train=pd.read_csv('../input/cat-in-the-dat/train.csv',index_col='id')
test=pd.read_csv('../input/cat-in-the-dat/test.csv',index_col='id')
submission=pd.read_csv('../input/cat-in-the-dat/sample_submission.csv')

# Concat Data

In [3]:
ind=len(train)
y_train=train['target']
train.drop('target',axis=1,inplace=True)
all_data=pd.concat([train,test])

In [4]:
all_data.drop(['bin_0'],axis=1,inplace=True)

# Encoding categorical features

# **1-binary**

In [5]:
all_data['bin_3'] = all_data['bin_3'].map({'F':0, 'T':1})
all_data['bin_4'] = all_data['bin_4'].map({'N':0, 'Y':1})

In [6]:
ordf=['ord_'+str(i)  for i  in range(6)]
nomf=['nom_'+str(i)  for i  in range(10)]
dmf=['day','month']


# **2-ordinal**

In [7]:

ord1_dic={'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}
ord2_dic={'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}

all_data['ord_1']= all_data.ord_1.map(ord1_dic)
all_data['ord_2']= all_data.ord_2.map(ord2_dic)

In [8]:
enc = OrdinalEncoder(categories = 'auto')

all_data[ordf[3:]] = enc.fit_transform(all_data[ordf[3:]])


In [9]:
all_data[ordf] = StandardScaler().fit_transform(all_data[ordf])

# **3-Nominal**

In [10]:
ohe= OneHotEncoder(categories = 'auto', dtype = 'float64', drop = 'first')
data_dmf= ohe.fit_transform(all_data[dmf])

In [11]:
all_data.drop(dmf,axis=1,inplace=True)

In [12]:
all_data['nom_5'] = all_data['nom_5'].str[4:]
all_data['nom_6'] = all_data['nom_6'].str[3:]
all_data['nom_7'] = all_data['nom_7'].str[3:]
all_data['nom_8'] = all_data['nom_8'].str[3:]
all_data['nom_9'] = all_data['nom_9'].str[3:]

In [13]:
ohe= OneHotEncoder(categories = 'auto', dtype = 'float64', drop = 'first')
data_nomf= ohe.fit_transform(all_data[nomf])

In [14]:
all_data.drop(nomf,axis=1,inplace=True)

# **Sparse matrix**

In [15]:
from scipy.sparse import coo_matrix,hstack


df_work_sprs =hstack([data_nomf,coo_matrix(all_data).astype('float64'),
                                   data_dmf]).tocsr()
display(df_work_sprs)

<500000x16288 sparse matrix of type '<class 'numpy.float64'>'
	with 9235121 stored elements in Compressed Sparse Row format>

In [16]:
new_train=(df_work_sprs[:ind])
new_test=(df_work_sprs[ind:])

# **Modeling**

In [17]:
lg=LogisticRegression(C=0.1245, solver='saga',penalty='l2', max_iter=1000, verbose=0, n_jobs=-1,class_weight="balansed",random_state=42)
lg.fit(new_train,y_train)
#C_predict= lg.predict_proba(X_val)[:,1]

LogisticRegression(C=0.1245, class_weight='balansed', max_iter=1000, n_jobs=-1,
                   random_state=42, solver='saga')

In [18]:
y_pred=lg.predict_proba(new_test)[:,1]

# **Submission**

In [19]:
submission['target']=y_pred
submission.to_csv('subasma9.csv',index=False)