In [2]:
import tensorflow as tf
import sklearn
import pandas as pd
import numpy as np

In [1]:
# Raw feature columns read from the CSV files, listed by column-name prefix.
# The order matters: it determines the order of the derived feature columns.
X_cols = (
    ["B_2", "B_7", "B_18", "B_23", "B_32"]
    + ["D_48", "D_55", "D_61", "D_121"]
    + ["P_2", "S_11"]
)

In [3]:
# Read the training CSV in fixed-size chunks, keeping only the customer ID
# and the selected raw feature columns.
chunksize = 1000000

train_df_iter = pd.read_csv("train_data.csv", chunksize=chunksize, usecols=["customer_ID"] + X_cols)

# Collect the chunks and concatenate once at the end. Growing a DataFrame
# with pd.concat inside the loop re-copies every previously loaded row on
# each iteration.
train_chunks = []
rows_loaded = 0
for chunk in train_df_iter:
    train_chunks.append(chunk)
    rows_loaded += len(chunk)
    # Progress: cumulative (rows, columns) loaded so far.
    print((rows_loaded, chunk.shape[1]))

# Fall back to an empty frame if the reader yielded no chunks at all.
train_df = pd.concat(train_chunks) if train_chunks else pd.DataFrame()

# Drop the list so the individual chunks do not stay alive next to the
# concatenated copy.
del train_chunks

(1000000, 12)
(2000000, 12)
(3000000, 12)
(4000000, 12)
(5000000, 12)
(5531451, 12)


In [4]:
# Labels file; it is joined to the features on "customer_ID" and supplies
# the "target" column.
train_labels_df = pd.read_csv(filepath_or_buffer="train_labels.csv")

In [5]:
# Collapse the per-row data to one row per customer in two ways: the mean of
# each raw feature and its "last" aggregate within the customer's rows.
train_df_mean, train_df_last = (
    train_df.groupby("customer_ID")[X_cols].agg(how).reset_index()
    for how in ("mean", "last")
)

# Put both aggregates side by side; the suffixes tell the two versions of
# each raw feature apart.
train_df = train_df_mean.merge(
    train_df_last,
    on="customer_ID",
    how="inner",
    suffixes=("_mean", "_last"),
)

# Attach the labels; a left join keeps every aggregated customer row.
train_df = train_df.merge(train_labels_df, how="left", on="customer_ID")

In [6]:
# Preview the first five rows of the merged per-customer frame.
train_df.head(n=5)

Unnamed: 0,customer_ID,B_2_mean,B_7_mean,B_18_mean,B_23_mean,B_32_mean,D_48_mean,D_55_mean,D_61_mean,D_121_mean,...,B_18_last,B_23_last,B_32_last,D_48_last,D_55_last,D_61_last,D_121_last,P_2_last,S_11_last,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,1.005086,0.036624,0.842565,0.026177,0.005084,0.240978,0.224432,0.225847,0.711829,...,1.007897,0.040367,0.007645,0.192376,0.187285,0.227637,0.719791,0.934745,0.402246,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.991083,0.028049,1.004884,0.013286,0.00596,0.048203,0.048069,0.053319,0.535892,...,1.003602,0.014705,0.008645,0.014696,0.036112,0.048978,0.551341,0.880519,0.363754,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.815677,0.034433,0.933173,0.023436,0.00522,0.092284,0.077362,0.109526,0.431903,...,1.00408,0.020228,0.001252,0.08037,0.098963,0.137834,0.444615,0.880875,0.280417,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.955264,0.06213,0.910999,0.046339,0.005252,0.076686,0.061726,0.066872,0.621386,...,1.007289,0.00506,0.007541,0.013057,0.0214,0.026844,0.629147,0.621776,0.368774,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.814543,0.11529,0.610194,0.100755,0.005332,0.253697,0.203298,0.356445,0.55094,...,0.531486,0.145214,0.007263,0.325121,0.254067,0.600739,0.565815,0.8719,0.326776,0


In [7]:
# Number of missing values in each column.
train_df.isna().sum(axis=0)

customer_ID        0
B_2_mean          31
B_7_mean           0
B_18_mean          0
B_23_mean          0
B_32_mean          0
D_48_mean      28816
D_55_mean       2478
D_61_mean      21083
D_121_mean      4739
P_2_mean        2434
S_11_mean          0
B_2_last          31
B_7_last           0
B_18_last          0
B_23_last          0
B_32_last          0
D_48_last      28816
D_55_last       2478
D_61_last      21083
D_121_last      4739
P_2_last        2434
S_11_last          0
target             0
dtype: int64

In [8]:
# Mean-impute every derived feature column ("<raw>_mean" and "<raw>_last")
# in one step instead of one hand-written line per column. fillna with a
# Series fills each column with the value stored under that column's label,
# so columns without missing values are left unchanged.
_feature_cols = [f"{col}{suffix}" for suffix in ("_mean", "_last") for col in X_cols]
train_df[_feature_cols] = train_df[_feature_cols].fillna(train_df[_feature_cols].mean())

In [9]:
# Model input columns: everything except the ID and the label. Dropping by
# name (rather than slicing off the first and last column) keeps the
# selection correct if the column order ever changes, and raises a KeyError
# if either column is missing.
_X_cols = train_df.columns.drop(["customer_ID", "target"])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the per-customer rows for evaluation. The split is
# stratified on the label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[_X_cols],
    train_df["target"],
    test_size=0.2,
    random_state=42,
    stratify=train_df["target"],
)

In [13]:
# Seed TensorFlow's global random generator before building the network so
# that randomly initialised weights (and other TensorFlow-level randomness)
# start from a fixed state, matching the seeded train/test split.
# NOTE(review): bit-exact repeatability can still depend on hardware and op
# settings - confirm if that matters.
tf.random.set_seed(42)

# Fully connected binary classifier: progressively narrower ReLU layers with
# two dropout layers, ending in a single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(164, activation = "relu"),
    tf.keras.layers.Dense(128, activation = "relu"),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(64, activation = "relu"),
    tf.keras.layers.Dense(32, activation = "relu"),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(8, activation = "relu"),
    tf.keras.layers.Dense(1, activation = "sigmoid")
])

In [15]:
# Binary cross-entropy loss with the Adam optimizer, tracking accuracy.
# `metrics` is passed as a list, which is the form the Keras API documents,
# rather than as a bare string.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [16]:
# Train for three passes over the training split.
model.fit(x=X_train, y=y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168934dbd00>

In [17]:
# Model outputs for the held-out split.
predictions = model.predict(x=X_test)



In [21]:
# Loss and accuracy on the held-out split.
model.evaluate(x=X_test, y=y_test)



[0.27959054708480835, 0.8761535286903381]

In [22]:
# Open the test CSV as a chunked reader over the same columns that were used
# for training. Nothing is loaded until the reader is iterated.
chunksize = 1_000_000

test_df_iter = pd.read_csv(
    "test_data.csv",
    usecols=["customer_ID", *X_cols],
    chunksize=chunksize,
)

In [23]:
# Accumulators across chunks: customer IDs and the matching model outputs.
_index = []
_vals = []

# Values used to fill missing test features: the per-column means of the
# training features. The training features were mean-imputed, so the test
# features are filled the same way instead of with 0.
_fill_values = train_df[_X_cols].mean()

# NOTE: test_df_iter is consumed by this loop; re-create the reader before
# running this cell a second time.
# NOTE(review): each chunk is aggregated on its own, so a customer whose rows
# straddle a chunk boundary appears once per chunk here - confirm that the
# later per-customer averaging of predictions is acceptable for those cases.
for chunk in test_df_iter:
    # Same per-customer aggregation as for the training data.
    _chunk_mean = chunk.groupby("customer_ID")[X_cols].mean().reset_index()
    _chunk_last = chunk.groupby("customer_ID")[X_cols].last().reset_index()
    _chunk = pd.merge(
        left=_chunk_mean,
        right=_chunk_last,
        how="inner",
        on="customer_ID",
        suffixes=("_mean", "_last"),
    )

    # Use a chunk-local name so the held-out X_test from the train/test
    # split is not overwritten. The trailing fillna(0) is a fallback for any
    # column whose training mean is itself missing.
    X_chunk = _chunk[_X_cols].fillna(_fill_values).fillna(0)
    chunk_pred = model.predict(X_chunk)
    _index.extend(_chunk["customer_ID"])
    _vals.extend(chunk_pred)

    # Progress: number of (customer, chunk) rows scored so far.
    print(len(_index))

81358
162618
244059
325450
406815
488260
569555
650904
732217
813543
895040
924631


In [37]:
# One row per customer: pair each ID with its prediction, then average the
# predictions of any customer that appears more than once.
res_df = (
    pd.DataFrame(dict(customer_ID=_index, prediction=np.squeeze(_vals)))
    .groupby("customer_ID")["prediction"]
    .mean()
    .reset_index()
)

In [38]:
# Preview the first five rows of the submission frame.
res_df.head(n=5)

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.165087
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.012149
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.091402
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.367754
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.801557


In [39]:
# Write the submission file without the row index column.
res_df.to_csv(path_or_buf="tensorflow_predictions_submission2.csv", index=False)