### Spaceship Titanic with fastai

Competition [Link](https://www.kaggle.com/competitions/spaceship-titanic/overview)

In [1]:
#|default_exp app

In [2]:
#The following cell is used every time the fastai library is used.
#It tells the notebook to reload any changes made to the libraries in use.
#It also ensures that any graphs that are plotted are shown in this notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from fastai.tabular.all import *
from fastbook import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import seaborn as sns

from dtreeviz.trees import *
import dtreeviz

from IPython.display import Image, display_svg, SVG

In [4]:
#| export
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
creds = ''

In [5]:
#| export
# Location this notebook uses for the Kaggle API credentials file.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only create the file when `creds` actually holds something: an empty
# kaggle.json is not valid JSON and would only replace a clear
# "credentials missing" situation with a confusing parse failure later.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)  # owner read/write only

In [6]:
#| export
path = Path('spaceship-titanic')

In [7]:
#| export
# Outside Kaggle, download the competition archive once and unpack it into `path`.
if not iskaggle and not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [8]:
#| export
if iskaggle:
    path = Path('../input/spaceship-titanic')
    ! pip install -q dataset

Import the CSVs as pandas DataFrames

In [9]:
#| export
df = pd.read_csv(path/'train.csv', low_memory=False)
test_df = pd.read_csv(path/'test.csv', low_memory=False)
sample_df = pd.read_csv(path/'sample_submission.csv', low_memory=False)

In [10]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


#### Infer Cryosleep NaN by Amenities Use

In [11]:
df['CryoSleep'].isnull().count

<bound method Series.count of 0       False
1       False
2       False
3       False
4       False
        ...  
8688    False
8689    False
8690    False
8691    False
8692    False
Name: CryoSleep, Length: 8693, dtype: bool>

In [12]:
df[np.where(df['CryoSleep'] == True, True, False)].head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
18,0016_01,Mars,True,F/5/P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0,Alus Upead,True
21,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Almary Brantuarez,False


In [13]:
df[np.where(df['CryoSleep'] == True, True, False)].shape

(3037, 14)

In [14]:
cryo_amenities_df = ['CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [15]:
# Rows whose CryoSleep flag is missing, restricted to CryoSleep + the amenity-spend columns.
null_cryo_df = df.loc[df['CryoSleep'].isnull(), cryo_amenities_df]
# The test selection must use a mask computed from test_df itself. A mask built
# from the training frame is aligned by index label, so it would silently pick
# test rows that merely share a row number with a null training row.
test_null_cryo_df = test_df.loc[test_df['CryoSleep'].isnull(), cryo_amenities_df]

In [16]:
df['CryoSleep'].isnull().sum()

217

In [17]:
null_cryo_df

Unnamed: 0,CryoSleep,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
92,,0.0,0.0,0.0,0.0,0.0
98,,0.0,0.0,570.0,2.0,131.0
104,,0.0,331.0,0.0,0.0,1687.0
111,,0.0,0.0,0.0,0.0,
152,,0.0,985.0,0.0,5.0,0.0
...,...,...,...,...,...,...
8620,,0.0,0.0,0.0,0.0,0.0
8651,,0.0,0.0,0.0,0.0,0.0
8664,,0.0,0.0,0.0,0.0,0.0
8675,,1030.0,1015.0,0.0,11.0,


here

In [18]:
amenities_mask =(null_cryo_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] > 0).any(axis=1)
test_amenities_mask =(test_null_cryo_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] > 0).any(axis=1)
amenities_mask

92      False
98       True
104      True
111     False
152      True
        ...  
8620    False
8651    False
8664    False
8675     True
8687     True
Length: 217, dtype: bool

In [19]:
df.loc[null_cryo_df.index, cryo_amenities_df].head(), df.loc[null_cryo_df.index, cryo_amenities_df].shape

(    CryoSleep  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck
 92        NaN          0.0        0.0           0.0  0.0     0.0
 98        NaN          0.0        0.0         570.0  2.0   131.0
 104       NaN          0.0      331.0           0.0  0.0  1687.0
 111       NaN          0.0        0.0           0.0  0.0     NaN
 152       NaN          0.0      985.0           0.0  5.0     0.0,
 (217, 6))

In [20]:
df.loc[null_cryo_df.index, 'CryoSleep']

92      NaN
98      NaN
104     NaN
111     NaN
152     NaN
       ... 
8620    NaN
8651    NaN
8664    NaN
8675    NaN
8687    NaN
Name: CryoSleep, Length: 217, dtype: object

In [21]:
# A passenger with a missing CryoSleep flag is marked as asleep exactly when
# none of their amenity columns shows a positive spend.
df.loc[null_cryo_df.index, 'CryoSleep'] = ~amenities_mask.to_numpy()
test_df.loc[test_null_cryo_df.index, 'CryoSleep'] = ~test_amenities_mask.to_numpy()

In [22]:
df['CryoSleep'].isnull().sum()

0

#### Replace NaN Amenities Values with 0

In [23]:
df['RoomService'].isnull().sum()

181

In [24]:
test_df.shape

(4277, 13)

In [25]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [26]:
df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].isnull().sum()

RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
dtype: int64

In [27]:
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [28]:
df[amenities] = df[amenities].fillna(0)
test_df[amenities] = test_df[amenities].fillna(0)

#### Names to Surnames

Using the full name as a feature on such a small dataset could cause overfitting, so I'm going to try grouping passengers into families by last name instead. I'll also drop `Name` as a category afterward.

In [29]:
df['surname'] = df['Name'].str.split(' ').str[1]

In [30]:
test_df['surname'] = test_df['Name'].str.split(' ').str[1]

In [31]:
df = df.drop(['Name'], axis=1)

In [32]:
test_df = test_df.drop(['Name'], axis=1)

#### Split PassengerId and Room

Looking at this, there is some extra data we can extract into new columns: *PassengerId* appears to consist of a group number and the passenger's number within that group, separated by an underscore `_`.

In [33]:
df['group_num'] = df['PassengerId'].str.split('_').str[0]
df['num_w_in_group'] = df['PassengerId'].str.split('_').str[1]

In [34]:
test_df['group_num'] =test_df['PassengerId'].str.split('_').str[0]
test_df['num_w_in_group'] =test_df['PassengerId'].str.split('_').str[1]

The same is true for the Cabin, there are 3 different values to analyze here, so let's split them up as well

In [35]:
df['deck'] = df['Cabin'].str.split('/').str[0]
df['room_num'] = df['Cabin'].str.split('/').str[1]
df['side'] = df['Cabin'].str.split('/').str[2]

In [36]:
test_df['deck'] = test_df['Cabin'].str.split('/').str[0]
test_df['room_num'] = test_df['Cabin'].str.split('/').str[1]
test_df['side'] = test_df['Cabin'].str.split('/').str[2]

Take the max number of the last two digits in PassengerId given that the first four numbers are the same

In [37]:
df['num_w_in_group'] = df['num_w_in_group'].astype(int)

In [38]:
test_df['num_w_in_group'] = test_df['num_w_in_group'].astype(int)

In [39]:
max_num_in_group = df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [40]:
test_max_num_in_group = test_df.groupby('group_num')['num_w_in_group'].max().astype(int)

In [41]:
len(max_num_in_group)

6217

In [42]:
max_num_in_group.head()

group_num
0001    1
0002    1
0003    2
0004    1
0005    1
Name: num_w_in_group, dtype: int64

In [43]:
df['num_in_group'] = df['group_num'].map(max_num_in_group)

In [44]:
test_df['num_in_group'] = test_df['group_num'].map(test_max_num_in_group)

In [45]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,surname,group_num,num_w_in_group,deck,room_num,side,num_in_group
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,Ofracculy,1,1,B,0,P,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,Vines,2,1,F,0,S,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,Susent,3,1,A,0,S,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,Susent,3,2,A,0,S,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,Santantines,4,1,F,1,S,1


Finding if everyone boarding is a family group or not by last name:
- Find if the count of family group == num_in_group

In [46]:
boarded_together = df.groupby('surname')['num_in_group'].nunique() == 1

In [47]:
test_boarded_together = test_df.groupby('surname')['num_in_group'].nunique() == 1

In [48]:
df['fam_board_together'] = df['surname'].map(boarded_together)

In [49]:
test_df['fam_board_together'] = test_df['surname'].map(test_boarded_together)

In [50]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,surname,group_num,num_w_in_group,deck,room_num,side,num_in_group,fam_board_together
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,Ofracculy,1,1,B,0,P,1,True
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,Vines,2,1,F,0,S,1,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,Susent,3,1,A,0,S,2,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,Susent,3,2,A,0,S,2,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,Santantines,4,1,F,1,S,1,False


In [51]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,surname,group_num,num_w_in_group,deck,room_num,side,num_in_group,fam_board_together
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Carsoning,13,1,G,3,S,1,False
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Peckers,18,1,F,4,S,1,True
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Unhearfus,19,1,C,0,S,1,True
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Caltilter,21,1,C,1,S,1,True
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Harperez,23,1,F,5,S,1,True


#### Replace NaN Home and Destination with Group Values

In [52]:
df.loc[np.where(df['HomePlanet'].isnull())]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,surname,group_num,num_w_in_group,deck,room_num,side,num_in_group,fam_board_together
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,0.0,0.0,0.0,True,Keen,0064,2,E,3,S,2,True
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,False,Coning,0119,1,A,0,P,2,False
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,0.0,0.0,0.0,True,Inicont,0210,1,D,6,P,1,False
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,False,Sté,0242,1,F,46,S,1,True
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,True,Amsive,0251,1,C,11,S,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,False,Mone,9084,1,E,582,P,1,False
8613,9194_01,,False,E/603/S,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,False,,9194,1,E,603,S,2,
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,True,Perle,9248,1,F,1792,S,1,False
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,False,Apple,9257,1,F,1892,P,1,True


In [53]:
df.groupby('group_num')['HomePlanet'].value_counts()

group_num  HomePlanet
0001       Europa        1
0002       Earth         1
0003       Europa        2
0004       Earth         1
0005       Earth         1
                        ..
9275       Europa        3
9276       Europa        1
9278       Earth         1
9279       Earth         1
9280       Europa        2
Name: HomePlanet, Length: 6107, dtype: int64

In [54]:
mode_planet = df.groupby('group_num')['HomePlanet'].apply(lambda x: x.mode(dropna=True))

In [55]:
mode_planet

group_num   
0001       0    Europa
0002       0     Earth
0003       0    Europa
0004       0     Earth
0005       0     Earth
                 ...  
9275       0    Europa
9276       0    Europa
9278       0     Earth
9279       0     Earth
9280       0    Europa
Name: HomePlanet, Length: 6107, dtype: object

In [56]:
np.where(df['HomePlanet'].isnull())

(array([  59,  113,  186,  225,  234,  274,  286,  291,  347,  365,  405,  407,  438,  471,  481,  501,  505,  524,  568,  637,  737,  807,  848,  920,  962,  993,  999, 1004, 1024, 1267, 1307, 1392,
        1399, 1550, 1600, 1706, 1714, 1757, 1807, 1855, 1892, 1916, 1926, 2166, 2173, 2227, 2233, 2246, 2274, 2281, 2290, 2322, 2324, 2425, 2442, 2461, 2502, 2544, 2617, 2630, 2631, 2638, 2642, 2715,
        2749, 2773, 2791, 2898, 2926, 2969, 3091, 3119, 3168, 3192, 3277, 3281, 3326, 3370, 3433, 3503, 3535, 3560, 3595, 3622, 3635, 3769, 3816, 3858, 3891, 3940, 3946, 3951, 4025, 4089, 4097, 4154,
        4172, 4230, 4315, 4343, 4366, 4374, 4535, 4548, 4581, 4632, 4702, 4770, 4817, 4861, 5016, 5024, 5055, 5079, 5081, 5096, 5101, 5112, 5208, 5252, 5316, 5427, 5438, 5465, 5576, 5624, 5634, 5687,
        5689, 5699, 5723, 5762, 5904, 5906, 5953, 5988, 6004, 6011, 6030, 6054, 6056, 6154, 6169, 6197, 6205, 6257, 6267, 6450, 6565, 6591, 6609, 6616, 6622, 6629, 6644, 6735, 6737, 6854, 6913, 6917,


In [57]:
df.groupby('group_num')['HomePlanet'].size()

group_num
0001    1
0002    1
0003    2
0004    1
0005    1
       ..
9275    3
9276    1
9278    1
9279    1
9280    2
Name: HomePlanet, Length: 6217, dtype: int64

In [58]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts')

Unnamed: 0_level_0,counts
group_num,Unnamed: 1_level_1
0001,1
0002,1
0003,2
0004,1
0005,1
...,...
9275,3
9276,1
9278,1
9279,1


In [59]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index()

Unnamed: 0,group_num,counts
0,0001,1
1,0002,1
2,0003,2
3,0004,1
4,0005,1
...,...,...
6212,9275,3
6213,9276,1
6214,9278,1
6215,9279,1


In [60]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index().sort_values('counts')

Unnamed: 0,group_num,counts
0,0001,1
3872,5751,1
3871,5750,1
3870,5748,1
3869,5747,1
...,...,...
654,0984,8
3023,4498,8
2858,4256,8
5893,8796,8


In [61]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts').reset_index().sort_values('counts', ascending=False)

Unnamed: 0,group_num,counts
3023,4498,8
5487,8168,8
5851,8728,8
5893,8796,8
6000,8956,8
...,...,...
2334,3483,1
2333,3480,1
2332,3478,1
2328,3473,1


In [62]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates()

Unnamed: 0,group_num,counts
3023,4498,8
5487,8168,8
5851,8728,8
5893,8796,8
6000,8956,8
...,...,...
2334,3483,1
2333,3480,1
2332,3478,1
2328,3473,1


In [63]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates(subset='group_num')

Unnamed: 0,group_num,counts
3023,4498,8
5487,8168,8
5851,8728,8
5893,8796,8
6000,8956,8
...,...,...
2334,3483,1
2333,3480,1
2332,3478,1
2328,3473,1


In [64]:
df.groupby('group_num')['HomePlanet'].size().to_frame('counts') \
  .reset_index().sort_values('counts', ascending=False) \
  .drop_duplicates(subset='group_num').drop(columns='counts')

Unnamed: 0,group_num
3023,4498
5487,8168
5851,8728
5893,8796
6000,8956
...,...
2334,3483
2333,3480
2332,3478
2328,3473


In [65]:
mode_planet.head()

group_num   
0001       0    Europa
0002       0     Earth
0003       0    Europa
0004       0     Earth
0005       0     Earth
Name: HomePlanet, dtype: object

Final Replacements:

In [66]:
def fillna_mode(group, col='HomePlanet'):
    """Fill missing values of ``col`` in one group with that group's most frequent value.

    Written to be used as ``df.groupby(key, group_keys=False).apply(fillna_mode)``.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to a single group.
    col : str, default 'HomePlanet'
        Name of the column whose NaN values should be filled.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself when ``col`` holds no non-null value (there is nothing
        to infer from); otherwise a copy of ``group`` in which the NaN entries
        of ``col`` are replaced by the group's mode. The input is never mutated.
    """
    mode_series = group[col].mode()  # NaN is ignored by default
    if mode_series.empty:
        return group
    # Assign the filled column back explicitly on a copy: a chained
    # `group[col].fillna(..., inplace=True)` acts on a temporary object and is
    # a no-op under pandas Copy-on-Write.
    filled = group.copy()
    # `.iloc[0]` takes the first mode by position (ties yield several modes).
    filled[col] = filled[col].fillna(mode_series.iloc[0])
    return filled


In [67]:
# apply the function to each group
df = df.groupby('group_num', group_keys=False).apply(fillna_mode)

In [68]:
test_df['HomePlanet'].isnull().sum()

87

In [69]:
test_df = test_df.groupby('group_num', group_keys=False).apply(fillna_mode)

In [70]:
test_df['HomePlanet'].isnull().sum()

46

In [None]:
df.head(n=5)

In [None]:
df['HomePlanet'].isnull().sum()

#### Check Nulls

In [None]:
def get_null_counts(df):
    """Return a table of per-column null counts, largest first.

    The result has the columns ``column_name`` and ``null_counts``; the row for
    the ``Transported`` column is left out.
    """
    counts = df.isnull().sum().sort_values(ascending=False)
    table = counts.reset_index()
    table.columns = ['column_name', 'null_counts']
    return table[table['column_name'] != 'Transported']

null_= get_null_counts(df)
null_.style.background_gradient(cmap='summer')

In [None]:
# Null counts for the test set. Display `test_null` here, not the training table `null_`.
test_null = get_null_counts(test_df)
test_null.style.background_gradient(cmap='winter')

Convert Boolean values to 0s and 1s

In [None]:
def bool_switch(df, col_name):
    """Replace column ``col_name`` of ``df`` in place with its label-encoded values.

    A fresh ``LabelEncoder`` is fitted on the column for every call, and nothing
    is returned.

    NOTE(review): columns such as ``VIP`` may still contain NaN when this runs;
    how the encoder treats those entries is not visible here. Confirm.
    """
    column = f'{col_name}'
    df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
bool_switch(df, col_name='Transported')
bool_switch(df, col_name='VIP')
bool_switch(df, col_name='CryoSleep')
bool_switch(df, col_name="fam_board_together")

In [None]:
df.shape

In [None]:
bool_switch(test_df, col_name='VIP')
bool_switch(test_df, col_name='CryoSleep')
bool_switch(test_df, col_name="fam_board_together")

In [None]:
test_df.shape

### Preparing Data

Declare the dependent variable (the target, y)

In [None]:
#| export
dep_var = 'Transported'

In [None]:
# Peek at a few Destination values. `.sample` has to be called: a bare attribute
# reference only displays the bound method. The seed keeps the preview stable
# across re-runs.
df['Destination'].sample(5, random_state=42)

Add tabular processes to transform categorical variables to something similar to `pd.Categorical`, and fill in missing/na values

In [None]:
#| export
procs = [Categorify, FillMissing, Normalize]

In [None]:
#| export
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
test_cont, test_cat = cont_cat_split(test_df, 1, dep_var=dep_var)

In [None]:
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, 
                    y_block=CategoryBlock(), 
                    splits=RandomSplitter(valid_pct=0.2, seed=42)(df)).dataloaders(bs=128)

In [None]:
to.fill_strategy

In [None]:
# Build a second, independent TabularPandas for the test set, using the column
# split computed from test_df (test_cat / test_cont) and no target column.
# NOTE(review): `procs` is handed to a brand-new TabularPandas here, so the
# category codes, missing-value fills and normalisation statistics are
# presumably derived from test_df rather than reused from the training object.
# If so, the test features may not be encoded the same way as the features the
# model was trained on. Confirm, and consider deriving the test set from the
# training pipeline instead.
test_to = TabularPandas(test_df, procs, test_cat, test_cont, y_names=None, 
                    y_block=CategoryBlock(), 
                    splits=None).dataloaders(bs=128)

In [None]:
xs, y = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

In [None]:
test_xs = test_to.train.xs

In [None]:
xs.head()

In [None]:
xs.shape

In [None]:
test_xs.head()

In [None]:
test_xs.shape

In [None]:
# Correlate only the numeric/boolean columns: `df` still contains string columns
# (e.g. PassengerId, Cabin), which DataFrame.corr() rejects on pandas >= 2.0.
plt.figure(figsize=(10,10))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
def rf(xs, y, n_estimators=40, max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit a ``RandomForestClassifier`` on ``xs``/``y`` and return the fitted model.

    Parameters
    ----------
    xs, y
        Training features and target, passed straight to ``fit``.
    n_estimators, max_features, min_samples_leaf
        Forwarded to ``RandomForestClassifier`` under the same names.
    **kwargs
        Any further ``RandomForestClassifier`` options (for example
        ``random_state``). These override the defaults set here; previously
        they were accepted but silently ignored.

    Returns
    -------
    The fitted classifier (``n_jobs=-1`` and ``oob_score=True`` unless overridden).
    """
    params = dict(n_jobs=-1, n_estimators=n_estimators,
                  max_features=max_features, min_samples_leaf=min_samples_leaf,
                  oob_score=True)
    # Merge rather than pass twice, so overriding e.g. oob_score does not raise
    # a "multiple values for keyword argument" error.
    params.update(kwargs)
    return RandomForestClassifier(**params).fit(xs, y)

In [None]:
m = rf(xs,y)

In [None]:
def pred_acc(m, valid_xs=valid_xs):
    """Return the accuracy of model ``m`` on ``valid_xs``.

    Predictions are scored against the notebook-level ``valid_y``. The default
    for ``valid_xs`` is whatever ``valid_xs`` was bound to when this cell ran.
    """
    return accuracy_score(valid_y, m.predict(valid_xs))

In [None]:
pred_acc(m)

In [None]:
def rf_feat_importances(m, df):
    """Pair each column of ``df`` with ``m.feature_importances_``, most important first.

    Returns a DataFrame with the columns ``cols`` and ``imp``, sorted by ``imp``
    in descending order.
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

In [None]:
fi = rf_feat_importances(m, xs)
fi

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of ``imp`` per ``cols`` entry and return the plot object."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

In [None]:
plot_fi(fi)

In [None]:
cluster_columns(xs)

In [None]:
to_keep = fi[fi.imp > 0.005].cols
len(to_keep), len(fi)

In [None]:
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
test_xs_imp = test_xs[to_keep]

In [None]:
len(test_xs_imp)

In [None]:
m = rf(xs_imp, y)

In [None]:
pred_acc(m, valid_xs=valid_xs_imp)

In [None]:
plot_fi(rf_feat_importances(m, xs_imp))

In [None]:
cluster_columns(xs_imp)

In [None]:
def get_oob(df):
    """Fit a fresh random forest on ``df`` against the notebook-level ``y``
    and return its out-of-bag score."""
    forest = RandomForestClassifier(
        n_estimators=40,
        min_samples_leaf=15,
        max_features=0.5,
        n_jobs=-1,
        oob_score=True,
    )
    return forest.fit(df, y).oob_score_

In [None]:
get_oob(xs_imp)


In [None]:
{c:get_oob(xs_imp.drop(c, axis=1)) for c in xs_imp.columns}

In [None]:
{c:get_oob(xs.drop(c, axis=1)) for c in xs.columns}

In [None]:
xs_final = xs_imp
valid_xs_final = valid_xs_imp

In [None]:
test_xs_final = test_xs_imp

In [None]:
xs_final.head()

In [None]:
pred_acc(m, valid_xs=valid_xs_final)

In [None]:
valid_xs_final.columns

In [None]:
# test_xs = [test_xs.drop([x], axis=1) for x in test_xs if x not in valid_xs_final]

In [None]:
# for x in test_xs.columns:
#     if x not in valid_xs_final.columns:
#         test_xs = test_xs.drop([x], axis=1)

In [None]:
len(test_xs_final)

In [None]:
len(test_xs.columns), len(valid_xs_final.columns)

In [None]:
test_xs.columns, valid_xs_final.columns

In [None]:
preds = m.predict(test_xs_final)

In [None]:
preds

In [None]:
sample_df.head()

In [None]:
sample_df['Transported'] = preds.astype(bool)

In [None]:
sample_df.value_counts()

In [None]:
sub_df = sample_df

In [None]:
sub_df

In [None]:
sub_df.to_csv('submission.csv', index=False)