In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import scipy
from sklearn.linear_model import LogisticRegression

In [72]:
# Read the CSV and discard the `id` column.
df = pd.read_csv('../Data/cat.csv').drop(columns=['id'])

# Move the `target` column out of the frame so `df` holds only the inputs.
dtarget = df.pop('target')

df.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8


In [32]:
# One-hot encode every column, then go sparse frame -> COO -> CSR so the
# matrix supports the row indexing used for the train/validation split.
dt = (
    pd.get_dummies(df, columns=df.columns, sparse=True)
    .sparse.to_coo()
    .tocsr()
)

In [33]:
# Row-wise random split: a row goes to the training part when its uniform
# draw falls below `size`, so roughly 80% of rows are used for fitting.
size = 0.8
np.random.seed(123)  # fixed seed keeps the mask identical between runs
ids = np.random.rand(dt.shape[0]) < size

# True entries of the mask -> training rows, False entries -> validation rows.
x, y = dt[ids], dtarget[ids]
x_val, y_val = dt[~ids], dtarget[~ids]

In [34]:
# Logistic regression with balanced class weights, fitted on the training rows.
# `model.fit` stays as the last expression so the estimator repr is displayed.
model = LogisticRegression(
    C=0.1,
    class_weight='balanced',
    max_iter=10000,
    solver='lbfgs',
    n_jobs=-1,
)
model.fit(x, y)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
from sklearn import metrics

# ROC AUC on the training rows, scored with the second probability column.
pred = model.predict_proba(x)
print(metrics.roc_auc_score(y_true=y, y_score=pred[:, 1]))

# ROC AUC on the held-out rows; kept as the final expression so the
# notebook displays it as the cell result.
pred1 = model.predict_proba(x_val)
metrics.roc_auc_score(y_true=y_val, y_score=pred1[:, 1])

0.8287054629090679


0.8056458162869101

In [36]:
# Dimensions of the one-hot encoded matrix as (rows, columns).
dt.shape

(300000, 16461)

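## Reduce cardinality

Replace values that occur fewer than 10 times in `nom_6`–`nom_9` with a single `'XX'` bucket, then refit the same model to compare the AUC.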

In [73]:
def reduce_cardinality(dx, threshold = 10, placeholder = 'XX'):
    """Collapse infrequent values of a Series into a single placeholder.

    Parameters
    ----------
    dx : pandas.Series
        Column to process. It is left untouched; a new Series is returned,
        so callers no longer need to pass a defensive copy.
    threshold : int, default 10
        Values occurring strictly fewer than `threshold` times are replaced.
    placeholder : scalar, default 'XX'
        Value written in place of every rare value.

    Returns
    -------
    pandas.Series
        Series with the same index and name as `dx` in which every rare
        value has been replaced by `placeholder`. Missing values are not
        counted by `value_counts` and are therefore passed through as-is.
    """
    counts = dx.value_counts()
    rare = counts.index[counts < threshold]
    # `where` keeps the entries for which the condition is True and puts the
    # placeholder everywhere else, building a new Series instead of writing
    # into the caller's object.
    return dx.where(~dx.isin(rare), placeholder)

# Collapse rare values in nom_9, nom_8, nom_7 and nom_6 (default threshold).
# A copy of each column is passed so the frame is only changed by the assignment.
for col in ('nom_9', 'nom_8', 'nom_7', 'nom_6'):
    df[col] = reduce_cardinality(df[col].copy())

In [74]:
# Re-encode the frame after the rare-value collapsing and convert the sparse
# dummies to a CSR matrix (via COO) for row slicing.
dt = (
    pd.get_dummies(df, columns=df.columns, sparse=True)
    .sparse.to_coo()
    .tocsr()
)

In [75]:
# Same seeded random mask as before: a row is a training row when its
# uniform draw is below `size` (about 80% of rows).
size = 0.8
np.random.seed(123)  # same seed -> same mask for the same number of rows
ids = np.random.rand(dt.shape[0]) < size

# Mask selects the training rows; its negation selects the validation rows.
x, y = dt[ids], dtarget[ids]
x_val, y_val = dt[~ids], dtarget[~ids]

In [76]:
# Refit with the same hyper-parameters on the reduced-cardinality encoding.
# `model.fit` stays last so the estimator repr is displayed.
model = LogisticRegression(
    C=0.1,
    class_weight='balanced',
    max_iter=10000,
    solver='lbfgs',
    n_jobs=-1,
)
model.fit(x, y)

LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10000, multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [77]:
# ROC AUC on the training rows, scored with the second probability column.
# Relies on `metrics` having been imported by an earlier cell.
pred = model.predict_proba(x)
print(metrics.roc_auc_score(y_true=y, y_score=pred[:, 1]))

# ROC AUC on the held-out rows, displayed as the cell result.
pred1 = model.predict_proba(x_val)
metrics.roc_auc_score(y_true=y_val, y_score=pred1[:, 1])

0.8280583565980837


0.8055775816342357

## PyTorch logistic regression

In [78]:
import torch
import torch.nn as nn

In [81]:
# One-hot encode again, this time keeping the sparse-backed pandas frame
# instead of converting it to a SciPy matrix.
dt = pd.get_dummies(df, columns=df.columns, sparse=True)

# Seeded random mask: about 80% of rows are marked as training rows.
size = 0.8
np.random.seed(123)
ids = np.random.rand(dt.shape[0]) < size

# Boolean-mask row selection on the frame and on the target Series.
x, y = dt[ids], dtarget[ids]
x_val, y_val = dt[~ids], dtarget[~ids]

In [82]:
# Training tensors: features as float32, labels as a flat int64 vector.
Tx, Ty = (
    torch.from_numpy(x.values.astype(np.float32)),
    torch.from_numpy(y.values.astype(np.int64)).flatten(),
)

# Validation tensors, converted the same way.
Tx_val, Ty_val = (
    torch.from_numpy(x_val.values.astype(np.float32)),
    torch.from_numpy(y_val.values.astype(np.int64)).flatten(),
)

In [83]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer mapping `input_dim` features to `output_dim` scores.

    `forward` returns the raw output of the linear layer; no softmax or
    sigmoid is applied inside the module.

    NOTE: this class reuses the name of scikit-learn's `LogisticRegression`
    imported at the top of the notebook, so once this cell has run the name
    refers to this module instead.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to `x` and return its output unchanged."""
        return self.linear(x)

In [84]:
# CrossEntropyLoss takes the raw two-column scores and applies the softmax
# itself before computing the cross entropy.
criterion = torch.nn.CrossEntropyLoss()

# One output per class (2), one input per one-hot column.
model = LogisticRegression(input_dim=x.shape[1], output_dim=2)

# Plain SGD; `weight_decay` adds the L2 penalty on the parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,
    weight_decay=0.1,
)

In [85]:
epochs = 1000
for i in range(epochs):
    # Full-batch step: forward pass on all training rows, then backprop and update.
    pred = model(Tx)
    loss = criterion(pred, Ty)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Only report on every 100th iteration.
    if i % 100 != 0:
        continue

    with torch.no_grad():
        # Training AUC reuses `pred`, i.e. the scores computed before this
        # iteration's parameter update; column 1 of the softmax is the score.
        auc_train = metrics.roc_auc_score(
            y, torch.softmax(pred, dim=1).detach().numpy()[:, 1]
        )

        # Validation AUC uses a fresh forward pass with the updated parameters.
        pred_val = model(Tx_val)
        auc_val = metrics.roc_auc_score(
            y_val, torch.softmax(pred_val, dim=1).detach().numpy()[:, 1]
        )
        print(loss.item(), auc_train, auc_val)

0.692625880241394 0.5078295693234837 0.5875284738139583
0.572114884853363 0.720425540962504 0.7220823107227965
0.5672004818916321 0.7255754476160837 0.7273202609337392
0.5662215948104858 0.7271822320309087 0.7289426853362461
0.5659962892532349 0.7276765990825306 0.7294375298411035
0.565934956073761 0.7278289508610114 0.7295882057923447
0.5659157633781433 0.7278763927322205 0.7296347718019768
0.5659096837043762 0.7278913700144309 0.7296497200576033
0.5659077167510986 0.7278961976511148 0.729654237910008
0.5659071803092957 0.7278978009385585 0.7296558155519361
