In [46]:
import pandas as pd
import numpy as np

In [47]:
import glob
import os
import random
csvs = glob.glob('Player A-*.csv')
print(f"Loading {len(csvs)} files")
columns = [f'cor_{i}' for i in range(25)] + ['status', 'attack']
train_csvs = random.sample(csvs, int(len(csvs)*.8))
test_csvs = random.sample(csvs, int(len(csvs)*.2))
print(f"train - {len(train_csvs)}, test - {len(test_csvs)}")
train_data = pd.DataFrame()
for csv in train_csvs:
    df = pd.read_csv(csv, index_col=None, header=None, names=columns)
    train_data = pd.concat([train_data, df])
test_data = pd.DataFrame()
for csv in test_csvs:
    df = pd.read_csv(csv, index_col=None, header=None, names=columns)
    test_data = pd.concat([test_data, df])
train_data.shape, test_data.shape

Loading 33 files
train - 26, test - 6


((459, 27), (104, 27))

In [48]:
def clean_status(df):
    return df.status.apply(str.strip)\
                        .apply(str.lower)\
                        .apply(lambda v: v.replace(' ', ''))\
                        .apply(lambda v: v.replace('n/a', 'na'))
train_data.status = clean_status(train_data)
test_data.status = clean_status(test_data)
train_data.status.head(), test_data.status.head()

(0      na
 1    miss
 2    miss
 3     hit
 4    miss
 Name: status, dtype: object, 0      na
 1     hit
 2    miss
 3    miss
 4    miss
 Name: status, dtype: object)

In [49]:
def droprows_na(df):
    before = len(df)
    df = df.dropna(how='any')
    print(f'Dropped {before-len(df)} rows')
    return df

train_data = droprows_na(train_data)
test_data = droprows_na(test_data)

Dropped 25 rows
Dropped 5 rows


In [50]:
def split_attack(df):
    df.attack = df.attack.apply(str.lower)
    df['attack_col'] = df.attack.apply(lambda v: v[0])
    df['attack_row'] = df.attack.apply(lambda v: v[1])
    df = df.drop(columns=['attack'])
    df.head()
    return df
    
train_data = split_attack(train_data)
test_data = split_attack(test_data)

# Data preparation for col classifier

In [51]:
X_train = train_data.loc[:, train_data.columns.str.match('^cor_|^status')]
y_train = train_data.loc[:, train_data.columns.str.startswith('attack_col')]
X_train.head(2), y_train.head(2)

(   cor_0  cor_1  cor_2  cor_3  cor_4  cor_5  cor_6  cor_7  cor_8  cor_9  ...  \
 0      0      0      0      0      0      0      0      0      0      0  ...   
 1      0      0      0      0      0      0      0      0      0      0  ...   
 
    cor_16  cor_17  cor_18  cor_19  cor_20  cor_21  cor_22  cor_23  cor_24  \
 0       0       0       0       0       0       0       0       0       0   
 1       0       0       0       0       0       0       0       0       0   
 
    status  
 0      na  
 1    miss  
 
 [2 rows x 26 columns],   attack_col
 0          e
 1          a)

In [52]:
X_test = test_data.loc[:, test_data.columns.str.match('^cor_|^status')]
y_test = test_data.loc[:, test_data.columns.str.startswith('attack_col')]
X_test.head(2), y_test.head(2)

(   cor_0  cor_1  cor_2  cor_3  cor_4  cor_5  cor_6  cor_7  cor_8  cor_9  ...  \
 0      0      0      0      0      0      0      0      0      0      0  ...   
 1      0      0      0      0      0      1      0      0      0      0  ...   
 
    cor_16  cor_17  cor_18  cor_19  cor_20  cor_21  cor_22  cor_23  cor_24  \
 0       0       0       0       0       0       0       0       0       0   
 1       0       0       0       0       0       0       0       0       0   
 
    status  
 0      na  
 1     hit  
 
 [2 rows x 26 columns],   attack_col
 0          a
 1          b)

In [53]:
X_train = pd.get_dummies(X_train, columns=['status'])
X_test  = pd.get_dummies(X_test, columns=['status'])

In [54]:
X_train.columns

Index(['cor_0', 'cor_1', 'cor_2', 'cor_3', 'cor_4', 'cor_5', 'cor_6', 'cor_7',
       'cor_8', 'cor_9', 'cor_10', 'cor_11', 'cor_12', 'cor_13', 'cor_14',
       'cor_15', 'cor_16', 'cor_17', 'cor_18', 'cor_19', 'cor_20', 'cor_21',
       'cor_22', 'cor_23', 'cor_24', 'status_hit', 'status_miss', 'status_na',
       'status_shipsunk'],
      dtype='object')

In [55]:
y_train.head()

Unnamed: 0,attack_col
0,e
1,a
2,b
3,a
4,c


In [56]:
X_train.to_csv('dataset_X_train.csv', index_label='index')
y_train.to_csv('dataset_y_train.csv', index_label='index')
X_train.shape, y_train.shape

((434, 29), (434, 1))

In [57]:
X_test.head()

Unnamed: 0,cor_0,cor_1,cor_2,cor_3,cor_4,cor_5,cor_6,cor_7,cor_8,cor_9,...,cor_19,cor_20,cor_21,cor_22,cor_23,cor_24,status_hit,status_miss,status_na,status_shipsunk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,-1,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,-1,0,0,0,0,1,0,0
4,-1,0,0,0,0,1,0,0,0,0,...,0,0,-1,0,0,0,0,1,0,0


In [58]:
y_test.head()

Unnamed: 0,attack_col
0,a
1,b
2,c
3,a
4,b


In [59]:
X_test.to_csv('dataset_X_test.csv', index_label='index')
y_test.to_csv('dataset_y_test.csv', index_label='index')
X_test.shape, y_test.shape

((99, 29), (99, 1))

# Data preparation for row classifier

In [60]:
train_data.shape, test_data.shape

((434, 28), (99, 28))

In [61]:
train_data.columns, test_data.columns

(Index(['cor_0', 'cor_1', 'cor_2', 'cor_3', 'cor_4', 'cor_5', 'cor_6', 'cor_7',
        'cor_8', 'cor_9', 'cor_10', 'cor_11', 'cor_12', 'cor_13', 'cor_14',
        'cor_15', 'cor_16', 'cor_17', 'cor_18', 'cor_19', 'cor_20', 'cor_21',
        'cor_22', 'cor_23', 'cor_24', 'status', 'attack_col', 'attack_row'],
       dtype='object'),
 Index(['cor_0', 'cor_1', 'cor_2', 'cor_3', 'cor_4', 'cor_5', 'cor_6', 'cor_7',
        'cor_8', 'cor_9', 'cor_10', 'cor_11', 'cor_12', 'cor_13', 'cor_14',
        'cor_15', 'cor_16', 'cor_17', 'cor_18', 'cor_19', 'cor_20', 'cor_21',
        'cor_22', 'cor_23', 'cor_24', 'status', 'attack_col', 'attack_row'],
       dtype='object'))

In [62]:
X_train_row = train_data.loc[:,train_data.columns[train_data.columns.str.match('^cor_|^status|^attack_col')]]
y_train_row = train_data.loc[:, ['attack_row']]
X_test_row = test_data.loc[:,test_data.columns[test_data.columns.str.match('^cor_|^status|^attack_col')]]
y_test_row = test_data.loc[:, ['attack_row']]
X_train_row.shape, X_test_row.shape, y_train_row.shape, y_test_row.shape

((434, 27), (99, 27), (434, 1), (99, 1))

In [64]:
X_train_row = pd.get_dummies(X_train_row, columns=['status', 'attack_col'])
X_test_row = pd.get_dummies(X_test_row, columns=['status', 'attack_col'])

In [65]:
X_train_row.to_csv('dataset_X_train_row.csv', index_label='index')
y_train_row.to_csv('dataset_y_train_row.csv', index_label='index')
X_test_row.to_csv('dataset_X_test_row.csv', index_label='index')
y_test_row.to_csv('dataset_y_test_row.csv', index_label='index')
X_train_row.shape, y_train_row.shape, X_test_row.shape, y_test_row.shape

((434, 34), (434, 1), (99, 34), (99, 1))

In [70]:
X_train_row.head()

Unnamed: 0,cor_0,cor_1,cor_2,cor_3,cor_4,cor_5,cor_6,cor_7,cor_8,cor_9,...,cor_24,status_hit,status_miss,status_na,status_shipsunk,attack_col_a,attack_col_b,attack_col_c,attack_col_d,attack_col_e
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,0,0,0,0,0,-1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0,0,0,0,0,-1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0,0,0,0,0,-1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [71]:
y_train_row.head()

Unnamed: 0,attack_row
0,3
1,2
2,5
3,5
4,5
