In [1]:
import pandas as pd
import numpy as np

from fastai.tabular import * 

import gc

from tqdm.autonotebook import tqdm
import os
# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): none of the cells visible in this notebook call them.
tqdm.pandas()



In [2]:
# Query GPU status via the shell. The recorded output below shows the command was
# not found on the machine that produced this notebook.
!nvidia-smi

/bin/sh: nvidia-smi: command not found


In [3]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` is only displayed here; no later visible cell reads it.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cpu')

In [4]:
# Load the feature, label and test tables from the data directory.
# Original author's note on the labels: "Class will start with 0"
# (TODO confirm against the contents of train_y.csv).
train_dir = './train-data'
train_x, train_y, test_x = (
    pd.read_csv(os.path.join(train_dir, csv_name))
    for csv_name in ('train_x.csv', 'train_y.csv', 'test_x.csv')
)

In [5]:
# Columns handed to fastai as `cat_names`; each one gets its own embedding.
categorical_features = [
    'gender',
    'city',
    'prodName',
    'color',
    'carrier',
]

In [6]:
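# Manual mean-imputation, currently disabled. `FillMissing` is included in the
# `procs` list used when the DataBunch is built, so imputation is delegated to fastai.
# train_x.fillna(train_x.mean(), inplace=True)
# test_x.fillna(train_x.mean(), inplace=True)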

In [7]:
# Derive an embedding width for every categorical column.
#   categories_dict: list of (dict_size, embedding_size) tuples, in column order
#   embed_size:      {column name: embedding_size}, passed to the learner as `emb_szs`
# dict_size is the largest code in the column plus one, which assumes the columns
# hold non-negative integer codes; the embedding width is 2 * dict_size, capped at 200.
categories_dict = []
embed_size = {}
for cate in categorical_features:
    # Series.max() skips NaN, whereas the builtin max() over .unique() gives an
    # order-dependent answer when NaN is present. int() keeps the sizes as plain
    # Python ints even if the column was read as float.
    dict_size = int(train_x[cate].max()) + 1
    emb_dim = min(dict_size * 2, 200)
    categories_dict.append((dict_size, emb_dim))
    embed_size[cate] = emb_dim

In [8]:
# Display the per-column embedding widths.
embed_size

{'gender': 4, 'city': 200, 'prodName': 200, 'color': 200, 'carrier': 8}

In [9]:
# Attach the target column to the feature frame so it can be named as the
# dependent variable when the DataBunch is built.
# Column assignment aligns on the index, so a row-count mismatch between the two
# files would silently produce NaN or dropped labels; fail loudly instead.
assert len(train_x) == len(train_y), 'train_x and train_y must have the same number of rows'
train_x['age_group'] = train_y['age_group']

In [10]:
# Build one stratified train/validation split, the fastai DataBunch and the learner.
gc.collect()
import sklearn
import sklearn.model_selection

splits = 10
# Fixed seed: with shuffle=True and no random_state the fold assignment (and so
# the validation set and the reported accuracy) changes on every run.
kfold = sklearn.model_selection.StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
# Stratify on the label column itself rather than on the whole train_y frame.
split = kfold.split(train_x, train_y['age_group'])
# Only the first fold is used; `test_index` is the held-out validation subset
# (the real test set is passed separately as `test_df`).
train_index, test_index = next(split)
procs = [FillMissing, Categorify, Normalize]
# NOTE(review): `cont_names` is not given, so fastai treats every remaining column
# as continuous. If train_x carries an id-like column (test_x has `uid`), confirm
# it should really be a model input.
data = TabularDataBunch.from_df(train_dir, train_x, 'age_group', valid_idx=test_index, test_df=test_x,
                                procs=procs, cat_names=categorical_features)
learn = tabular_learner(data, layers=[220, 110], emb_szs=embed_size, metrics=accuracy)

In [11]:
# Train with the one-cycle schedule: 12 epochs, peak learning rate 4e-5.
# NOTE(review): the recorded output below lists only epochs 0-9, so it does not
# correspond to a complete run of this call — re-run to refresh it.
learn.fit_one_cycle(12, 4e-5)

epoch,train_loss,valid_loss,accuracy,time
0,1.243069,2.460918,0.484289,05:22
1,1.160252,2.748532,0.509398,04:59
2,1.120514,2.67109,0.517572,05:23
3,1.103036,2.624993,0.525771,05:02
4,1.103024,2.513845,0.52602,05:26
5,1.086924,2.698526,0.528065,05:04
6,1.083186,3.187808,0.532587,05:15
7,1.088012,2.786288,0.532905,05:02
8,1.058056,3.249843,0.533836,02:41
9,1.080067,3.421112,0.532289,02:58


In [12]:
# Checkpoint the trained model; the name mirrors the hidden-layer sizes
# passed to tabular_learner (layers=[220, 110]).
learn.save('220-110')

In [13]:
# Predicted class for every test row.
# torch.max(x, 1) returns (values, indices): element [0] is the winning
# *probability*, not the class, so take the argmax over the class dimension.
test_probs = learn.get_preds(ds_type=DatasetType.Test)[0]
class_idx = test_probs.argmax(dim=1)
# Map output-layer positions back to the original `age_group` label values.
# NOTE(review): assumes the labels are numeric — confirm against train_y.csv.
# `results` stays a torch tensor because later cells call `results.numpy()`.
results = torch.as_tensor(np.asarray(learn.data.classes))[class_idx]

In [14]:
# Sanity check: list the distinct values present in `results`.
np.unique(results.numpy())

array([1, 2, 3, 4, 5, 6])

In [15]:
# Assemble the submission table: the test `uid` (renamed to `id`) next to the
# value predicted for that row.
out = (
    test_x[['uid']]
    .rename(columns={'uid': 'id'})
    .assign(label=results.numpy())
)

In [16]:
# Write the id/label table to disk without the DataFrame index column.
out.to_csv('submission.csv', index=False)

In [19]:
# Run test-set inference and keep the full return value; the next cell reads
# `prob[0]` as a (rows, classes) score matrix.
# NOTE(review): this repeats the inference already performed to compute `results`.
prob = learn.get_preds(ds_type=DatasetType.Test)

In [27]:
# Per-class score table: one `prob_<k>` column per model output, keyed by test id.
# Convert to NumPy once so the columns hold plain floats rather than torch tensor
# slices, consistent with how `results.numpy()` is used for the label column.
class_probs = prob[0].numpy()
out = pd.DataFrame(test_x['uid'])
out.columns = ['id']
# Column names are 1-based (prob_1 ... prob_N). The class count comes from the
# score matrix instead of being hard-coded, which yields the same six columns
# for a six-class model.
for i in range(class_probs.shape[1]):
    out['prob_%i' % (i + 1)] = class_probs[:, i]

In [29]:
# Write the id / prob_* score table to disk without the DataFrame index column.
out.to_csv('class_score.csv', index=False)