In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Name of the label column; it is excluded from the categorical encoding steps below.
target: str = "winner"

In [3]:
# Read the raw dataset from "data.csv" (path is relative to the working directory).
data = pd.read_csv("data.csv")

In [4]:
# Dimensions of the raw frame; n_rows is the denominator for the NaN proportions later.
n_rows = data.shape[0]
n_cols = data.shape[1]

In [5]:
## replace spaces in column names with "_"

In [6]:
# Normalise column names: every space becomes an underscore.
# regex=False pins a literal replacement, so the result does not depend on the
# pandas-version default for the `regex` argument.
data.columns = data.columns.str.replace(" ", "_", regex=False)

## 1) drop columns with a single value 

In [7]:
# Columns holding exactly one distinct non-null value carry no signal.
unique_counts = data.nunique()
un_cols = [name for name, n_unique in unique_counts.items() if n_unique == 1]

In [8]:
# Remove the constant columns from the frame in place.
data.drop(columns=un_cols, inplace=True)

## 2) drop columns where more than 80% of the values are NaN

In [9]:
# Maximum tolerated fraction of NaNs per column; columns strictly above it are dropped below.
nan_thresh: float = 0.8

In [10]:
# Fraction of missing entries per column, relative to the row count captured at load time.
nan_prop_data = data.isna().sum().div(n_rows)

In [11]:
# Columns whose NaN fraction is strictly greater than the threshold.
nan_cols_drop = [name for name, share in nan_prop_data.items() if share > nan_thresh]

In [12]:
# Remove the mostly-empty columns from the frame in place.
data.drop(columns=nan_cols_drop, inplace=True)

## 3) drop unique indicator columns

In [13]:
# Flag identifier-like columns: a column qualifies when any of its "_"-separated,
# lower-cased name parts is one of the tokens in ind_check.
ind_check = ["name", "id", "names", "ids"]
ind_tokens = set(ind_check)
# One pass per column, so a column matching several tokens is listed only once
# and the original column order is preserved.
ind_cols = [col for col in data if ind_tokens & set(col.lower().split("_"))]
          

In [14]:
# Display the columns flagged as identifiers before they are dropped.
ind_cols

['B_ID', 'B_Name', 'Event_ID', 'Fight_ID', 'R_ID', 'R_Name']

In [15]:
# Remove the identifier columns from the frame in place.
data.drop(columns=ind_cols, inplace=True)

## 4) handling date
- feature engineering from date column

In [16]:
# Every column whose name contains "date" (case-insensitive) is treated as a date column.
date_cols = [name for name in data.columns if "date" in name.lower()]

In [17]:
# Only the first detected date column is parsed; any further date columns are not used here.
first_date_col = date_cols[0]
date_series = pd.to_datetime(data[first_date_col])

In [18]:
# Expand the parsed date into separate calendar features (day, month, year).
for part in ("day", "month", "year"):
    data[part] = getattr(date_series.dt, part)

In [19]:
# Remove all detected date columns from the frame in place.
data.drop(columns=date_cols, inplace=True)

## 5) cat_cols
- Keep the top 70% of the values
- Assign the remaining 30% to a new value ‘other’

Intended for columns that have more than 10 unique values (note: the loop below currently applies the rule to every object column).

In [20]:
# Object-dtype columns are treated as the categorical features.
obj_cols = list(data.select_dtypes(include="O").columns)

In [21]:
# The label column must not be re-bucketed or one-hot encoded, so take it out of the list.
obj_cols = [name for name in obj_cols if name != target]

In [22]:
# Collapse the long tail of every categorical column into a single "<col>_other" level:
# the most frequent 70% of distinct values are kept, the remainder is relabelled.
# NOTE(review): the notes above this cell mention restricting this to columns with more
# than 10 unique values, but no such filter is applied here -- confirm the intent.
for i in obj_cols:
    # Number of distinct values to keep (np.round rounds exact halves to the even integer).
    t = int(np.round(data[i].nunique() * 0.7))
    # value_counts() is ordered by descending frequency and skips NaN, so everything from
    # position t onward is the rare tail; .iloc makes the slice explicitly positional.
    b_30 = data[i].value_counts().iloc[t:].index.tolist()
    other = f"{i}_other"
    # Vectorised relabelling instead of a per-row membership test against a list.
    data[i] = data[i].mask(data[i].isin(b_30), other)

## 6) making dummies and dropping the last one

In [23]:
# One-hot encode each categorical column, omitting its last indicator column, and
# prefix the new columns with the source column name (spaces become underscores).
for col in obj_cols:
    dummies = pd.get_dummies(data[col]).iloc[:, :-1]
    dummies.columns = [f"{col}_{level.replace(' ', '_')}" for level in dummies.columns]
    data = data.join(dummies)

# The original categorical columns are now redundant.
data.drop(columns=obj_cols, inplace=True)

## 7) Bin the data for columns ‘R_Weight’ and ‘B_Weight’

In [24]:
# Discretise the two weight columns into 20 equal-width intervals each.
# Missing weights are imputed with the column median first: pd.cut leaves NaN inputs
# as NaN, and because the raw columns are dropped right after binning, the later
# median-fill step for "weight" columns could no longer repair them.
to_be_binned = ["R_Weight", "B_Weight"]
for col in to_be_binned:
    weights = data[col].fillna(data[col].median())
    data[f"Binned_{col}"] = pd.cut(weights, bins=20)

data.drop(to_be_binned, axis=1, inplace=True)

## 8) Fill missing for numeric columns

In [25]:
# Everything that is not object dtype is treated as numeric here
# (this also picks up the categorical columns produced by pd.cut).
num_cols = list(data.select_dtypes(exclude="O").columns)

In [26]:
# Non-object columns that still contain at least one missing value.
nan_counts = data[num_cols].isna().sum()
nan_num_cols = nan_counts[nan_counts > 0].index.tolist()

- Filling `height`, `weight` and `age` with corresponding median, to avoid outlier cases messing with values.

In [27]:
# Columns to impute with the median: any NaN-bearing column whose lower-cased name
# contains "height", "weight" or "age". This is a substring match, so a name such as
# "average" would also match "age". The ordered filter keeps the column order
# deterministic (a set's iteration order can change between kernel sessions) and still
# lists each column only once.
median_fill_cols = [
    x for x in nan_num_cols
    if any(j in x.lower() for j in ("height", "weight", "age"))
]

In [28]:
def fill_median(col):
    """Return a copy of `col` with its missing values replaced by the column median."""
    return col.fillna(col.median())

In [29]:
# Impute each selected column with its own median and write the result back.
median_filled = data[median_fill_cols].apply(fill_median)
data[median_fill_cols] = median_filled

In [30]:
# Take the median-filled columns off the to-do list. Building a filtered list (instead
# of calling list.remove inside a throwaway comprehension) keeps the cell safe to re-run
# and does not overwrite IPython's `_` variable.
nan_num_cols = [x for x in nan_num_cols if x not in median_fill_cols]

- Fill columns containing `time` with corresponding averages.

In [31]:
def fill_mean(col):
    """Return a copy of `col` with its missing values replaced by the column mean."""
    return col.fillna(col.mean())

In [32]:
# Remaining NaN-bearing columns whose name contains "Time" (case-sensitive match).
time_cols = [name for name in nan_num_cols if "Time" in name]

In [33]:
# Impute each time column with its own mean and write the result back.
mean_filled = data[time_cols].apply(fill_mean)
data[time_cols] = mean_filled

In [34]:
# Take the mean-filled time columns off the to-do list. Building a filtered list (instead
# of calling list.remove inside a throwaway comprehension) keeps the cell safe to re-run
# and does not overwrite IPython's `_` variable.
nan_num_cols = [x for x in nan_num_cols if x not in time_cols]

- Fill columns containing `Strikes` or `Grappling` with 0, as they seem to represent counts; missing values were probably not recorded <br>and can be considered as 0.

In [35]:
# Columns to fill with 0: any remaining NaN-bearing column whose name contains
# "Strikes" or "Grappling" (case-sensitive substring match). The ordered filter keeps
# the column order deterministic (a set's iteration order can change between kernel
# sessions) and still lists each column only once.
count_fill_cols = [
    x for x in nan_num_cols
    if any(j in x for j in ("Strikes", "Grappling"))
]

In [36]:
# Replace the missing entries of the selected columns with 0 and write the result back.
zero_filled = data[count_fill_cols].fillna(value=0)
data[count_fill_cols] = zero_filled

### Filling methods that could have been implemented, but there is not enough data description available to implement them correctly:
 - Backfill/Frontfill: `fillna(method="bfill")`, `fillna(method="ffill")`
 - Leaving a column and adding a binary column next to it with 1 representing NaN values and 0 not NaN value.
 - Dropping the entire row that contains the missing value from the data.
 - etc.

## 9) Check if there are numeric columns with NaNs.

In [37]:
# Prints True when no non-object column contains a missing value any more.
has_missing = data.select_dtypes(exclude="O").isna().any().any()
print(not has_missing)

True


## 10) Splitting into train and test and saving

In [38]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.25, "random_state": 42}
train, test = train_test_split(data, **split_options)

In [39]:
# Write both splits to CSV files in the working directory, without the index column.
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)