# Matrix

In [2]:
import numpy as np
from scipy import sparse

In [10]:
# create our example feature matrix
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

In [9]:
# size of dense array
example.nbytes

72

## Sparse matrix

In [11]:
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)


In [12]:
# size of sparse array
sparse_example.data.nbytes

32

In [32]:
# full size of sparse array
sparse_example.data.nbytes + \
    sparse_example.indptr.nbytes + \
        sparse_example.indices.nbytes

11987708

## One-hot encodeing

In [17]:
from sklearn import preprocessing

In [23]:
# create aandom 1-d array with 1001 different categories (int)
example = np.random.randint(1000, size = 1000000)

# initialize OneHotEncoder from scikit-learn
# keep sparse = False to get dense array
ohe = preprocessing.OneHotEncoder(sparse=False)

# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

In [24]:
# size of dense array
ohe_example.nbytes

8000000000

In [26]:
# initialize OneHotEncoder from scikit-learn
# keep sparse = True to get dense array
ohe = preprocessing.OneHotEncoder(sparse=True)

# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

In [27]:
# size of sparse array
ohe_example.data.nbytes

8000000

In [29]:
# full size of sparse array
ohe_example.data.nbytes + \
ohe_example.indptr.nbytes + \
ohe_example.indices.nbytes

16000004

## Groupby count

In [39]:
import pandas as pd 

In [34]:
df = pd.read_csv('../input/cat_train.csv')

In [37]:
df.groupby(['ord_2'])['id'].transform('count')

0          67508.0
1         124239.0
2         142726.0
3          64840.0
4          97822.0
            ...   
599995    142726.0
599996     84790.0
599997    142726.0
599998    124239.0
599999     84790.0
Name: id, Length: 600000, dtype: float64

## Test

In [1]:
import pandas as pd
from sklearn import preprocessing

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
train = pd.read_csv('../input/cat_train.csv')
test = pd.read_csv('../input/cat_test.csv')

In [3]:
test.loc[:, 'target'] = -1

In [4]:
data = pd.concat([train, test]).reset_index(drop=True)

features = [x for x in train.columns if x not in ['id', 'target']]


In [5]:
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    tmp_col = data[feat].fillna('NONE').astype(str).values
    data.loc[:, feat] = lbl_enc.fit_transform(tmp_col)

In [6]:
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

## Rare

In [8]:
df = pd.read_csv('../input/cat_train.csv')
df.ord_4 = df.ord_4.fillna('NONE')

df.loc[
    df['ord_4'].value_counts()[df['ord_4']].values < 2000, # type: ignore
    'ord_4'
] = 'RARE'

df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

## Model

### Stratified KFold

In [10]:
import pandas as pd 
from sklearn import model_selection

In [11]:
df = pd.read_csv('../input/cat_train.csv')
df['kfold'] = -1

df = df.sample(frac = 1).reset_index(drop = True)

y = df.target.values
kf = model_selection.StratifiedKFold(n_splits = 5)
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

df.to_csv('../input/cat_train_folds.csv', index=False)

In [12]:
df = pd.read_csv('../input/cat_train_folds.csv')
df.kfold.value_counts()

4    120000
3    120000
2    120000
1    120000
0    120000
Name: kfold, dtype: int64

### Label encoding

In [14]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

  return f(*args, **kwds)


In [20]:
def run(fold):
    df = pd.read_csv('../input/cat_train_folds.csv')

    # all columns are features except id, target and kfold columns
    features = [
        f for f in df.columns if f not in ('id', 'target', 'kfold')
    ]

    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesn't matter because all are categories
    for col in features:
        df.loc[:,col] = df[col].astype(str).fillna('NONE')

    # get training data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using folds
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # initialize OneHotEncoder from sklearn
    ohe = preprocessing.OneHotEncoder()

    # fit ohe on training + validation features
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis = 0
    )
    ohe.fit(full_data[features])

    # transform training data
    x_train = ohe.transform(df_train[features])

    # transform validation data
    x_valid = ohe.transform(df_valid[features])

    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()

    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)

    # predict on validation data
    # we need the probability values as we are calculationg AUC
    # we will use the probavility of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    print(auc)

In [22]:
if __name__ =="__main__":
    # run funciton for fold =0
    # we can just replace this numver and run this for any fold
    for f in range(5):
        run(0)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7868564586686613


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7868564586686613


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7868564586686613


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7868564586686613
0.7868564586686613


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## US adult census data