In [8]:
import numpy as np                                     # mathematical essentials
import pandas as pd                                    # data science essentials

from sklearn.metrics import confusion_matrix           # confusion matrix
from sklearn.metrics import roc_auc_score              # auc score
from sklearn.model_selection import train_test_split   # train-test split
from sklearn.model_selection import GridSearchCV       # GridSearchC
from sklearn.feature_selection import SelectFromModel  

from sklearn.linear_model import LogisticRegression    # logistic regression
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier         # classification trees

In [9]:
data = pd.read_excel('./GOT_character_predictions.xlsx')


In [10]:
data.head()

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,...,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,...,1.0,0.0,0.0,,0,0,,11,0.605351,0
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,...,,,,1.0,1,1,97.0,1,0.896321,1
2,3,Addison Hill,Ser,,,,,,House Swyft,,...,,,,,0,1,,0,0.267559,1
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,...,,,,0.0,1,1,23.0,0,0.183946,0
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,...,,,,1.0,1,1,29.0,0,0.043478,1


In [13]:
data.isna().sum() 

S.No                             0
name                             0
title                         1008
culture                       1269
dateOfBirth                   1513
mother                        1925
father                        1920
heir                          1923
house                          427
spouse                        1670
book1_A_Game_Of_Thrones          0
book2_A_Clash_Of_Kings           0
book3_A_Storm_Of_Swords          0
book4_A_Feast_For_Crows          0
book5_A_Dance_with_Dragons       0
isAliveMother                 1925
isAliveFather                 1920
isAliveHeir                   1923
isAliveSpouse                 1670
isMarried                        0
isNoble                          0
age                           1513
numDeadRelations                 0
popularity                       0
isAlive                          0
dtype: int64

A quick explore the data indicate there are quite a lot of missing values

check if there is any missing values in y variable# 

In [15]:
data['isAlive'].isna().any()
# or np.where(df['isAlive'].isna())

False

Fortunately, there is no missing values in class labels. For those missing values in x variables, we could deal with accordingly (either impute the missing values or using models that accept null values)

#  Data Pre-processing

#### Removing redundant features

In [19]:
# check the unique number of instance for column 'name' in the data set
data['name'].nunique() == len(data)

True

In [20]:
# check the unique number of instance for column 'S.No' in the data set
data['S.No'].nunique() == len(data)

True

Since the datapoints in first two variables (ID and name) are all unique values, which indicates both variables are not associated with the class labels. I would like to remove these two variable in advance.

In [25]:
data_new = data.loc[:,~data.columns.isin(['S.No','name'])]

####  Checking datatypes

In [26]:
data_new.dtypes

title                          object
culture                        object
dateOfBirth                   float64
mother                         object
father                         object
heir                           object
house                          object
spouse                         object
book1_A_Game_Of_Thrones         int64
book2_A_Clash_Of_Kings          int64
book3_A_Storm_Of_Swords         int64
book4_A_Feast_For_Crows         int64
book5_A_Dance_with_Dragons      int64
isAliveMother                 float64
isAliveFather                 float64
isAliveHeir                   float64
isAliveSpouse                 float64
isMarried                       int64
isNoble                         int64
age                           float64
numDeadRelations                int64
popularity                    float64
isAlive                         int64
dtype: object

####  Converting the categorical variables into dummy code

In [27]:
# exclude the class labels
data_dummy = pd.get_dummies(data_new.iloc[:,:-1])

####  Impute missing values

Since the dataset has both numeric and categorical variables (even though converted to dummy code), I think mode would be more appropriate than mean or mdeian when imputing the missing values

In [28]:
# one simple/quick approach to impute the missing values, but this could lead to some bias 
data_imput = data_dummy.fillna(data.mode().iloc[0])

In [29]:
# double check to make sure there is no more missing data
np.where(data_imput.isna())

(array([], dtype=int64), array([], dtype=int64))

####  Stratified train test split

In [30]:
X, y = data_imput.values, data_new.iloc[:,-1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=219, stratify=y, test_size=0.10)

In [31]:
# convert the train/test data back to dataframe
data_train = pd.DataFrame(X_train, columns=data_dummy.columns)
data_train.loc[:,'isAlive'] = list(y_train)

data_test = pd.DataFrame(X_test, columns=data_dummy.columns)
data_test.loc[:,'isAlive'] = list(y_test)

####   Feature selection

In [32]:
# Fit the model (Gradient Boosting Model)
etc = GradientBoostingClassifier(random_state=219)
etc.fit(X_train, y_train)
print(etc.feature_importances_)

[0.28034408 0.025923   0.01164872 ... 0.         0.         0.        ]


In [34]:
# create dataframe to store the scores
dfscores = pd.DataFrame(etc.feature_importances_)
dfcolumns = pd.DataFrame(data_train.columns[:-1])

# concatenate the two dataframes
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Attribute','Score']

then check the top 10 features with the most importance based on this tree classifier

In [35]:
featureScores = featureScores.sort_values(by=['Score'],ascending=False)
featureScores.head(10)

Unnamed: 0,Attribute,Score
0,dateOfBirth,0.280344
14,popularity,0.201428
4,book4_A_Feast_For_Crows,0.115423
12,age,0.057042
727,house_Night's Watch,0.036827
1,book1_A_Game_Of_Thrones,0.025923
13,numDeadRelations,0.02099
5,book5_A_Dance_with_Dragons,0.018277
291,culture_Free Folk,0.017659
405,house_Brave Companions,0.014913


In [36]:
# convert the top 10 features to a list
features_selected = featureScores['Attribute'].head(10).to_list()
features_selected

['dateOfBirth',
 'popularity',
 'book4_A_Feast_For_Crows',
 'age',
 "house_Night's Watch",
 'book1_A_Game_Of_Thrones',
 'numDeadRelations',
 'book5_A_Dance_with_Dragons',
 'culture_Free Folk',
 'house_Brave Companions']

In [37]:
X_train = data_train.loc[:, features_selected].values
y_train = data_train.iloc[:,-1].values
X_test = data_test.loc[:, features_selected].values
y_test = data_test.iloc[:,-1].values

# Modeling

####  logistic regression

In [38]:
lgm = LogisticRegression(random_state=219, max_iter = 10000, solver = 'newton-cg')
lgm.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=219, solver='newton-cg')

In [39]:
# Now generate the confusion matrix
cm_lg = confusion_matrix(y_test, lgm.predict(X_test))

In [40]:
ans1={}
ans1['model type'] = 'logistic regression'
ans1['train score'] = lgm.score(X_train,y_train)
ans1['test score'] = lgm.score(X_test,y_test)
ans1['AUC'] = roc_auc_score(y_test, lgm.predict(X_test))
print(f'{ans1}, \nConfusion Matrix:\n{cm_lg}')

{'model type': 'logistic regression', 'train score': 0.7909765848086807, 'test score': 0.8153846153846154, 'AUC': 0.6793103448275862}, 
Confusion Matrix:
[[ 20  30]
 [  6 139]]


#### decision tree

In [44]:
datas = DecisionTreeClassifier(random_state=219)

In [45]:
# provide the grid parameters for decision tree
tree_parameters = {'criterion':['gini','entropy'],
                   'max_depth':[3,4,5,6,7,8],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
                   'max_leaf_nodes':[None,10,30,50]}
grid_search_datas = GridSearchCV(datas, tree_parameters, cv=5, verbose=1)
grid_search_datas.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=219),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 4, 5, 6, 7, 8],
                         'max_leaf_nodes': [None, 10, 30, 50],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
             verbose=1)

In [47]:
# Now generate the confusion matrix
cm_datas_cv = confusion_matrix(y_test, grid_search_datas.predict(X_test))

In [48]:
ans2={}
ans2['model type'] = 'decision tree'
ans2['train score'] = grid_search_datas.score(X_train,y_train)
ans2['test score'] = grid_search_datas.score(X_test,y_test)
ans2['AUC'] = roc_auc_score(y_test, grid_search_datas.predict(X_test))
print(f'{ans2}, \nConfusion Matrix:\n{cm_datas_cv}')

{'model type': 'decision tree', 'train score': 0.8332381496287835, 'test score': 0.8769230769230769, 'AUC': 0.786206896551724}, 
Confusion Matrix:
[[ 30  20]
 [  4 141]]


#### random foreast 

In [None]:
rf = RandomForestClassifier(random_state=219)

In [54]:
# provide the grid parameters for random forest
forest_parameters = {'max_depth': [4,5,6,7,8],
                 'max_features': ['auto', 'sqrt'],
                 'min_samples_leaf': [1, 2, 4, 6],
                 'min_samples_split': [2, 5, 10]}

grid_search_rf = GridSearchCV(rf, forest_parameters, cv=5, verbose=1)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=219),
             param_grid={'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4, 6],
                         'min_samples_split': [2, 5, 10]},
             verbose=1)

In [55]:
# Now generate the confusion matrix
cm_rf_cv = confusion_matrix(y_test, grid_search_rf.predict(X_test))

In [56]:
ans3={}
ans3['model type'] = 'decision tree'
ans3['train score'] = grid_search_rf.score(X_train,y_train)
ans3['test score'] = grid_search_rf.score(X_test,y_test)
ans3['AUC'] = roc_auc_score(y_test, grid_search_rf.predict(X_test))
print(f'{ans3}, \nConfusion Matrix:\n{cm_rf_cv}')

{'model type': 'decision tree', 'train score': 0.8498001142204454, 'test score': 0.8512820512820513, 'AUC': 0.7493103448275862}, 
Confusion Matrix:
[[ 27  23]
 [  6 139]]


# Final Model Selected

Considering the overall AUC as the criteria, I would choose the decision tree model since it has the highest AUC score.

In [60]:
ans2={}
ans2['model type'] = 'decision tree'
ans2['train score'] = grid_search_datas.score(X_train,y_train)
ans2['test score'] = grid_search_datas.score(X_test,y_test)
ans2['AUC'] = roc_auc_score(y_test, grid_search_datas.predict(X_test))
print(f'{ans2}, \nConfusion Matrix:\n{cm_datas_cv}')

{'model type': 'decision tree', 'train score': 0.8332381496287835, 'test score': 0.8769230769230769, 'AUC': 0.786206896551724}, 
Confusion Matrix:
[[ 30  20]
 [  4 141]]
