# GAME OF THRONES Death Prediction

## Importing and Cleaning Data

This script will retrieve information from SQL database \got, different tables, join them together using Fuzzy Wuzzy and save it in a .csv file that will be used in the rest of the analysis to avoid running all these steps all over again.

This script should be executed just **one time** in the sql instance (i.e. EC2).  To repeat the rest of the analysis, if necessary, it can just be executed starting on next step ([EDA](02_GOT_EDA.ipynb))

In [27]:
#Library Section
import pandas as pd
import numpy as np
import patsy
#import seaborn as sns
#import matplotlib.pyplot as plt

from scipy import stats
from sqlalchemy import create_engine, text
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# from sklearn import model_selection
# from sklearn import decomposition
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn import svm
# from sklearn.learning_curve import learning_curve
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import roc_curve, auc
# from mpl_toolkits.mplot3d import Axes3D

In [28]:
cnx = create_engine('postgresql://ubuntu:ubuntu1@localhost:5432/got')

# Load Book Information

In [29]:
query_books='''
SELECT DISTINCT
    COALESCE(C.name, C2.name) AS name,
    CASE WHEN C.isAlive = 1 THEN 0
        WHEN C.isAlive = 0 THEN 1
        WHEN C2.death_bk IS NULL THEN 0
        ELSE 1
    END isdead_bk,
    
    -- Boolean original features
    COALESCE(C.male, C2.gender) ismale,
    COALESCE(C.book1, C2.book1) book1,
    COALESCE(C.book2, C2.book2) book2  ,
    COALESCE(C.book3, C2.book3) book3  ,
    COALESCE(C.book4, C2.book4) book4  ,
    COALESCE(C.book5, C2.book5) book5  ,
    C.isAliveMother  isAliveMother_bk,
    C.isAliveFather  isAliveFather_bk,
    C.isAliveHeir  isAliveHeir_bk,
    C.isAliveSpouse  isAliveSpouse_bk,
    C.isMarried  isMarried_bk,
    COALESCE(C.isNoble, C2.nobility) isNoble_bk  ,
    C.boolDeadRelations  boolDeadRelations_bk,
    C.isPopular  isPopular_bk,
    C2.Death_Bk  ,
    C2.Apperance_Chp,
    
    -- Boolean Transformed Features
    CASE WHEN C.mother IS NULL THEN 0 ELSE 1 END as hasMom_bk,
    CASE WHEN C.father IS NULL THEN 0 ELSE 1 END as hasDad_bk,
    CASE WHEN C.heir IS NULL THEN 0 ELSE 1 END as hasHeir_bk ,
    CASE WHEN C.spouse IS NULL THEN 0 ELSE 1 END as hasSpouse_bk  ,
    
    -- Numerical Original Features
    CASE WHEN C.age < 0 THEN NULL ELSE C.age END age_bk,
    C.numDeadRelations numDeadRelations_bk,
    C.popularity popularity_bk,
    
    -- Categorical Transformed Features
    COALESCE(SUBSTRING(C.culture, 1, 5), 'None') culture_bk,
    CASE WHEN COALESCE(C.house, C2.allegiances) IS NULL THEN 'UNKNOWN' 
        WHEN H.house IS NULL THEN 'OTHER' 
        ELSE UPPER(COALESCE(C.house, C2.allegiances)) END as house_bk

FROM raw01_character AS C
LEFT OUTER JOIN (SELECT
    CASE WHEN UPPER(T2.allegiances) = 'NONE' THEN UPPER(T1.house)
        WHEN UPPER(T2.allegiances) NOT LIKE 'HOUSE%' THEN 'HOUSE '||UPPER(T2.allegiances)
        WHEN T2.allegiances IS NULL THEN COALESCE(UPPER(T1.house), 'NONE')
    ELSE UPPER(T2.allegiances) END house,
    count(*)
    FROM raw01_character T1
    FULL OUTER JOIN raw01_character_death T2
    ON UPPER(T1.name) = UPPER(T2.Name)
    GROUP BY 1
    HAVING count(*) >= 10) AS H
ON UPPER(C.house) = UPPER(H.house)

FULL OUTER JOIN raw01_character_death C2
ON UPPER(C.name) = UPPER(C2.Name)
'''

In [30]:
book_df = pd.read_sql_query(text(query_books),cnx)
len(book_df)

2012

In [31]:
book_df.head()

Unnamed: 0,name,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,apperance_chp,hasmom_bk,hasdad_bk,hasheir_bk,hasspouse_bk,age_bk,numdeadrelations_bk,popularity_bk,culture_bk,house_bk
0,Abelar Hightower,0,1,0,0,0,0,0,,,...,,0,0,0,0,,0.0,0.0,,HOUSE HIGHTOWER
1,Addam,0,1,0,0,0,0,0,,,...,,0,0,0,0,,0.0,0.006689,,UNKNOWN
2,Addam Frey,0,1,0,0,0,0,0,,,...,,0,0,0,0,,0.0,0.026756,,HOUSE FREY
3,Addam Marbrand,0,1,1,1,1,1,1,,,...,56.0,0,0,0,0,,0.0,0.130435,,OTHER
4,Addam Osgrey,1,1,0,0,0,0,0,,,...,,0,0,0,0,13.0,0.0,0.036789,,HOUSE OSGREY


### Check for duplicates

In [32]:
book_df[book_df.name == 'Myles']

Unnamed: 0,name,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,apperance_chp,hasmom_bk,hasdad_bk,hasheir_bk,hasspouse_bk,age_bk,numdeadrelations_bk,popularity_bk,culture_bk,house_bk
1266,Myles,0,1,0,0,0,1,1,,,...,39.0,0,0,0,0,,0.0,0.023411,,HOUSE MARTELL
1267,Myles,0,1,0,0,0,1,1,,,...,2.0,0,0,0,0,,0.0,0.023411,,HOUSE MARTELL


In [33]:
# Check for duplicates rows by name
dup_dataset=((book_df.groupby('name')
              .isdead_bk.count().reset_index()
              .sort_values('isdead_bk', ascending=False))['isdead_bk']>1).sum()
dup_dataset

1

In [34]:
book_df=book_df.drop_duplicates('name')

In [35]:
len(book_df)

2011

# Load Tv Show Character Information

In [36]:
query_show='''
SELECT
    C.name  ,
    CASE WHEN C.isAlive = 'Deceased' THEN 1 ELSE 0 END isdead_shw  ,
    C.death_season  ,
    COALESCE(CAST(NULLIF(LEFT(C.time_S1,POSITION(':' IN C.time_S1)-1), '') AS INT)*60 + 
             CAST(NULLIF(RIGHT(C.time_S1,2), '') AS INT),0) time_s1,
    COALESCE(CAST(NULLIF(LEFT(C.time_S2,POSITION(':' IN C.time_S2)-1), '') AS INT)*60 + 
             CAST(NULLIF(RIGHT(C.time_S2,2), '') AS INT),0) time_s2,
    COALESCE(CAST(NULLIF(LEFT(C.time_S3,POSITION(':' IN C.time_S3)-1), '') AS INT)*60 + 
             CAST(NULLIF(RIGHT(C.time_S3,2), '') AS INT),0) time_s3,
    COALESCE(CAST(NULLIF(LEFT(C.time_S4,POSITION(':' IN C.time_S4)-1), '') AS INT)*60 + 
             CAST(NULLIF(RIGHT(C.time_S4,2), '') AS INT),0) time_s4,
    C.total_episode_num,
    CASE WHEN C.season1 = 'TRUE' THEN 1 ELSE 0 END season1,
    CASE WHEN C.season2 = 'TRUE' THEN 1 ELSE 0 END season2 ,
    CASE WHEN C.season3 = 'TRUE' THEN 1 ELSE 0 END season3  ,
    CASE WHEN C.season4 = 'TRUE' THEN 1 ELSE 0 END season4  ,
    COALESCE(H.house, 'Other') house_shw,
    C.age age_shw
FROM raw11_character_S6 C
LEFT OUTER JOIN (
    SELECT LEFT(
            CASE WHEN POSITION(E'\n' IN allegiance) = 0 THEN allegiance
            ELSE LEFT(allegiance,COALESCE(POSITION(E'\n' IN allegiance)-1,0)) 
            END, 15) house, 
        COUNT(*)
    FROM raw11_character_S6
    GROUP BY 1
    HAVING COUNT(*) > 3) AS H
    ON H.house = LEFT(
            CASE WHEN POSITION(E'\n' IN C.allegiance) = 0 THEN C.allegiance
            ELSE LEFT(C.allegiance,COALESCE(POSITION(E'\n' IN C.allegiance)-1,0)) 
            END, 15)
'''

In [37]:
show_df = pd.read_sql_query(text(query_show),cnx)
show_df.columns

Index(['name', 'isdead_shw', 'death_season', 'time_s1', 'time_s2', 'time_s3',
       'time_s4', 'total_episode_num', 'season1', 'season2', 'season3',
       'season4', 'house_shw', 'age_shw'],
      dtype='object')

In [38]:
len(show_df)

121

### Prepare Join (Fuzzy Wuzzy)

In [39]:
# One Example
w1 = 'Catelyn Stark' 
w2 = 'Catelyn Stark (Tully)'

fuzz.token_sort_ratio(w1, w2)

81

In [40]:
def fuzzy_match(x, choices, scorer, cutoff) :
    res = process.extractOne(x, choices=choices, scorer=scorer, score_cutoff=cutoff)
    if res is None :
        return None
    else :
        return res[0]

In [41]:
show_df['match_name'] = show_df.loc[:,'name'].apply(
    fuzzy_match,
    args = (book_df.loc[:, 'name'],
    fuzz.token_sort_ratio,
    60
    )
)
show_df[['name', 'match_name']]

Unnamed: 0,name,match_name
0,Tyrion Lannister,Tyrion Lannister
1,Jon Snow,Jon Snow
2,Daenerys Targaryen,Daenerys Targaryen
3,Cersei Lannister,Cersei Lannister
4,Arya Stark,Arya Stark
5,Sansa Stark,Sansa Stark
6,Jaime Lannister,Jaime Lannister
7,Theon Greyjoy,Theon Greyjoy
8,Eddard Stark,Eddard Stark
9,Catelyn Stark (Tully),Catelyn Stark


In [42]:
# Fast Review of Characters that still don't match
show_df[pd.isnull(show_df.match_name)]

Unnamed: 0,name,isdead_shw,death_season,time_s1,time_s2,time_s3,time_s4,total_episode_num,season1,season2,season3,season4,house_shw,age_shw,match_name
31,Talisa Maegyr,1,3.0,0,829,1116,0,12,0,1,1,0,House Stark,,
87,Dagmer Cleftjaw,1,2.0,0,517,0,0,5,0,1,0,0,House Greyjoy,,
99,The Spice King,1,2.0,0,416,0,0,3,0,1,0,0,Other,,
111,Lommy Greenhands,1,2.0,40,238,0,0,4,1,1,0,0,,,


In [43]:
show_df = show_df[pd.isnull(show_df.match_name)==False]
len(show_df)

117

In [44]:
# Check for duplicates rows by name
dup_dataset=((show_df.groupby('match_name')
              .name.count().reset_index()
              .sort_values('name', ascending=False))['name']>1).sum()
dup_dataset

2

In [45]:
stats.mode(show_df['match_name'])



ModeResult(mode=array(['Renly Baratheon'], dtype=object), count=array([2]))

In [46]:
print(len(show_df))
show_df=show_df.drop_duplicates('match_name')
print(len(show_df))

117
115


In [47]:
show_df[show_df.match_name == 'Renly Baratheon']

Unnamed: 0,name,isdead_shw,death_season,time_s1,time_s2,time_s3,time_s4,total_episode_num,season1,season2,season3,season4,house_shw,age_shw,match_name
41,Renly Baratheon,1,2.0,702,824,0,0,8,1,1,0,0,House Baratheon,,Renly Baratheon


# Load Tv Show Ratings Information

In [48]:
query_ratings = '''
SELECT
    C.name,
    SUM(rating)/341 sum_rating,
    SUM(viewers)/181.23 total_viewers
FROM raw03_character_tv C
INNER JOIN raw03_character_episode_tv CE
ON C.char_id = CE.char_id
INNER JOIN raw11_episodes_tv E
ON CE.episode = E.episode
GROUP BY 1
'''

In [49]:
rating_df = pd.read_sql_query(text(query_ratings),cnx)
rating_df.columns

Index(['name', 'sum_rating', 'total_viewers'], dtype='object')

In [50]:
len(rating_df)

372

In [51]:
len(show_df)

115

### Prepare Join (Fuzzy Wuzzy)

In [52]:
w1 = 'Ramsay Bolton' 
w2 = 'Ramsay Snow'

fuzz.ratio(w1, w2)

67

In [53]:
rating_df['match_name'] = rating_df.loc[:,'name'].apply(
    fuzzy_match,
    args = (show_df.loc[:, 'match_name'],
    fuzz.ratio,
    85
    )
)
rating_df[(pd.isnull(rating_df.match_name)==False) & (rating_df.name != rating_df.match_name)]

Unnamed: 0,name,sum_rating,total_viewers,match_name
32,Olenna Tyrell,0.223754,0.284776,Leona Tyrell
244,Bran Stark,0.620528,0.578602,Brandon Stark
331,Alton Lannister,0.071261,0.062407,Jason Lannister


In [54]:
rating_df = rating_df[pd.isnull(rating_df.match_name)==False]

In [55]:
# Check for duplicates rows by name
dup_dataset=((rating_df.groupby('match_name')
              .name.count().reset_index()
              .sort_values('name', ascending=False))['name']>1).sum()
dup_dataset

0

In [56]:
print(len(show_df))
show_df=show_df.drop_duplicates('match_name')
print(len(show_df))

115
115


# Merge Information (Left)

**(All characters from the books, and matching information from tv Show)**

In [57]:
left_df = pd.merge(book_df, show_df, how='left', left_on='name', right_on='match_name')
len(left_df)

2011

In [58]:
stats.mode(left_df['name_x'])



ModeResult(mode=array(['Abelar Hightower'], dtype=object), count=array([1]))

In [59]:
# Check for duplicates
((left_df.groupby('match_name')
              .name_x.count().reset_index()
              .sort_values('name_x', ascending=False))['name_x']>1).sum()

0

### LEFT JOIN + Show Ratings
**(All characters from the books, and matching information from tv Show)**

In [60]:
left_df = pd.merge(left_df, rating_df, how='left', left_on='name_x', right_on='match_name')
len(left_df)

2011

# Analysis of the Values (Remove Nan)

In [61]:
got_df=left_df
for c in got_df.columns[2:] :
    print(c, got_df[c].unique())

ismale [1 0]
book1 [0 1]
book2 [0 1]
book3 [0 1]
book4 [0 1]
book5 [0 1]
isalivemother_bk [nan  1.  0.]
isalivefather_bk [nan  0.  1.]
isaliveheir_bk [nan  0.  1.]
isalivespouse_bk [nan  1.  0.]
ismarried_bk [ 0. nan  1.]
isnoble_bk [1 0]
booldeadrelations_bk [ 0. nan  1.]
ispopular_bk [ 0. nan  1.]
death_bk [nan  5.  3.  4.  2.  1.]
apperance_chp [nan 56. 20. 49.  5. 21. 59. 11.  0. 50. 54. 18. 15. 38. 26.  4.  6. 65.
 36. 28. 10. 19. 12. 16. 35. 44. 24. 14. 29. 30. 32. 34.  2.  3. 75. 62.
 22. 57. 52. 13. 63. 37. 23. 51. 31. 42.  7. 41.  1. 25. 48.  8. 68. 55.
 33. 47.  9. 17. 46. 43. 73. 39. 72. 66. 53. 45. 60. 61. 40. 27. 71. 80.
 69. 64. 78. 58. 79. 70. 74. 67.]
hasmom_bk [0 1]
hasdad_bk [0 1]
hasheir_bk [0 1]
hasspouse_bk [0 1]
age_bk [ nan  13.  16.  12.  52.  50.  17. 100.   2.  69.  23.  20.  60.  41.
  36.  48.  40.  15.  45.  19.  10.  49.  56.  32.  14.  21.  76.  29.
  42.  30.  28.   0.  39.   1.   6.  22.  18.  38.  24.  85.  77.  63.
  35.  54.  65.  79.  80.  47.  26. 

In [62]:
#If a Character is not found in the Book Set, Is not popular
got_df['ispopular_bk'] = got_df['ispopular_bk'].fillna(0)

In [63]:
# Fill Other Nan values with mean value of that column
got_df['booldeadrelations_bk'] = got_df['booldeadrelations_bk'].fillna(int(np.mean(got_df['booldeadrelations_bk'])))
got_df['apperance_chp'] = got_df['apperance_chp'].fillna(int(np.mean(got_df['apperance_chp'])))
got_df['age_bk'] = got_df['age_bk'].fillna(int(np.mean(got_df['age_bk'])))
got_df['numdeadrelations_bk'] = got_df['numdeadrelations_bk'].fillna(int(np.mean(got_df['numdeadrelations_bk'])))
got_df['age_shw'] = got_df['age_shw'].fillna(int(np.mean(got_df['age_shw'])))
got_df['popularity_bk'] = got_df['popularity_bk'].fillna(int(np.mean(got_df['popularity_bk'])))

In [64]:
# Fill values for characters in books but not in show
got_df['isdead_shw'] = got_df['isdead_shw'].fillna(0)
got_df['time_s1'] = got_df['time_s1'].fillna(0)
got_df['time_s2'] = got_df['time_s2'].fillna(0)
got_df['time_s3'] = got_df['time_s3'].fillna(0)
got_df['time_s4'] = got_df['time_s4'].fillna(0)
got_df['total_episode_num'] = got_df['total_episode_num'].fillna(0)
got_df['season1'] = got_df['season1'].fillna(0)
got_df['season2'] = got_df['season2'].fillna(0)
got_df['season3'] = got_df['season3'].fillna(0)
got_df['season4'] = got_df['season4'].fillna(0)
got_df['house_shw'] = got_df['house_shw'].fillna('Other')

In [65]:
# Fill values for characters in books but not in show-ratings
got_df['sum_rating'] = got_df['sum_rating'].fillna(0)
got_df['total_viewers'] = got_df['total_viewers'].fillna(0)

In [66]:
for c in got_df.columns[1:] :
    print(c, got_df[c].unique())

isdead_bk [0 1]
ismale [1 0]
book1 [0 1]
book2 [0 1]
book3 [0 1]
book4 [0 1]
book5 [0 1]
isalivemother_bk [nan  1.  0.]
isalivefather_bk [nan  0.  1.]
isaliveheir_bk [nan  0.  1.]
isalivespouse_bk [nan  1.  0.]
ismarried_bk [ 0. nan  1.]
isnoble_bk [1 0]
booldeadrelations_bk [0. 1.]
ispopular_bk [0. 1.]
death_bk [nan  5.  3.  4.  2.  1.]
apperance_chp [28. 56. 20. 49.  5. 21. 59. 11.  0. 50. 54. 18. 15. 38. 26.  4.  6. 65.
 36. 10. 19. 12. 16. 35. 44. 24. 14. 29. 30. 32. 34.  2.  3. 75. 62. 22.
 57. 52. 13. 63. 37. 23. 51. 31. 42.  7. 41.  1. 25. 48.  8. 68. 55. 33.
 47.  9. 17. 46. 43. 73. 39. 72. 66. 53. 45. 60. 61. 40. 27. 71. 80. 69.
 64. 78. 58. 79. 70. 74. 67.]
hasmom_bk [0 1]
hasdad_bk [0 1]
hasheir_bk [0 1]
hasspouse_bk [0 1]
age_bk [ 36.  13.  16.  12.  52.  50.  17. 100.   2.  69.  23.  20.  60.  41.
  48.  40.  15.  45.  19.  10.  49.  56.  32.  14.  21.  76.  29.  42.
  30.  28.   0.  39.   1.   6.  22.  18.  38.  24.  85.  77.  63.  35.
  54.  65.  79.  80.  47.  26.  31. 

# Categorical variables Generation

### Culture in the Book

In [67]:
got_df.culture_bk.unique()

array(['None', 'Valyr', 'Storm', 'Ironm', 'Dothr', 'Ironb', 'North',
       'Tyros', 'Free ', 'Braav', 'Valem', 'Dorni', 'Reach', 'Norvo',
       'Assha', 'Ghisc', 'Weste', 'Meere', 'Summe', 'First', 'River',
       'Vale ', 'Astap', 'Myris', 'Lysen', 'Qarth', 'Lhaza', 'Andal',
       'Dorne', 'Siste', 'Pento', 'Wildl', 'north', 'Crann', 'Vale',
       'ironb', 'The R', 'weste', 'Naath', 'Rhoyn', 'free ', 'Qohor',
       'Ibben'], dtype=object)

In [68]:
culture_df=patsy.dmatrix('culture_bk',data=got_df,return_type='dataframe')
culture_df.head()

Unnamed: 0,Intercept,culture_bk[T.Assha],culture_bk[T.Astap],culture_bk[T.Braav],culture_bk[T.Crann],culture_bk[T.Dorne],culture_bk[T.Dorni],culture_bk[T.Dothr],culture_bk[T.First],culture_bk[T.Free ],...,culture_bk[T.Vale],culture_bk[T.Vale ],culture_bk[T.Valem],culture_bk[T.Valyr],culture_bk[T.Weste],culture_bk[T.Wildl],culture_bk[T.free ],culture_bk[T.ironb],culture_bk[T.north],culture_bk[T.weste]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### House in the Book

In [69]:
got_df.house_bk.unique()

array(['HOUSE HIGHTOWER', 'UNKNOWN', 'HOUSE FREY', 'OTHER',
       'HOUSE OSGREY', 'HOUSE TARGARYEN', 'HOUSE BRACKEN', 'HOUSE ARRYN',
       'HOUSE GREYJOY', "NIGHT'S WATCH", 'FAITH OF THE SEVEN',
       'HOUSE STARK', 'HOUSE TYRELL', 'HOUSE WHENT',
       'BROTHERHOOD WITHOUT BANNERS', 'HOUSE MARTELL', 'HOUSE BOTLEY',
       'HOUSE BARATHEON', 'HOUSE BOLTON', 'HOUSE OF LORAQ',
       'BRAVE COMPANIONS', 'HOUSE BAELISH', 'HOUSE TULLY',
       'HOUSE LANNISTER', 'HOUSE CLEGANE', 'STONE CROWS'], dtype=object)

In [70]:
# Remove special charactyers from column names
got_df['house_bk'].replace(["NIGHT'S WATCH"], 'NIGHTS WATCH', inplace=True)
got_df.house_bk.unique()

array(['HOUSE HIGHTOWER', 'UNKNOWN', 'HOUSE FREY', 'OTHER',
       'HOUSE OSGREY', 'HOUSE TARGARYEN', 'HOUSE BRACKEN', 'HOUSE ARRYN',
       'HOUSE GREYJOY', 'NIGHTS WATCH', 'FAITH OF THE SEVEN',
       'HOUSE STARK', 'HOUSE TYRELL', 'HOUSE WHENT',
       'BROTHERHOOD WITHOUT BANNERS', 'HOUSE MARTELL', 'HOUSE BOTLEY',
       'HOUSE BARATHEON', 'HOUSE BOLTON', 'HOUSE OF LORAQ',
       'BRAVE COMPANIONS', 'HOUSE BAELISH', 'HOUSE TULLY',
       'HOUSE LANNISTER', 'HOUSE CLEGANE', 'STONE CROWS'], dtype=object)

In [71]:
house_df=patsy.dmatrix('house_bk',data=got_df,return_type='dataframe')
house_df.head()

Unnamed: 0,Intercept,house_bk[T.BROTHERHOOD WITHOUT BANNERS],house_bk[T.FAITH OF THE SEVEN],house_bk[T.HOUSE ARRYN],house_bk[T.HOUSE BAELISH],house_bk[T.HOUSE BARATHEON],house_bk[T.HOUSE BOLTON],house_bk[T.HOUSE BOTLEY],house_bk[T.HOUSE BRACKEN],house_bk[T.HOUSE CLEGANE],...,house_bk[T.HOUSE OSGREY],house_bk[T.HOUSE STARK],house_bk[T.HOUSE TARGARYEN],house_bk[T.HOUSE TULLY],house_bk[T.HOUSE TYRELL],house_bk[T.HOUSE WHENT],house_bk[T.NIGHTS WATCH],house_bk[T.OTHER],house_bk[T.STONE CROWS],house_bk[T.UNKNOWN]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### House in the Show

In [72]:
got_df.house_shw.unique()

array(['Other', "The Night's Wat", 'House Lannister', 'House Stark',
       'House Greyjoy', 'None', 'House Targaryen', 'House Baratheon',
       'House Bolton'], dtype=object)

In [73]:
# Remove Special Characters from column names
got_df['house_shw'].replace(["The Night's Wat"], 'Nights Watch', inplace=True)
got_df.house_shw.unique()

array(['Other', 'Nights Watch', 'House Lannister', 'House Stark',
       'House Greyjoy', 'None', 'House Targaryen', 'House Baratheon',
       'House Bolton'], dtype=object)

In [74]:
houseshw_df=patsy.dmatrix('house_shw',data=got_df,return_type='dataframe')
houseshw_df.head()

Unnamed: 0,Intercept,house_shw[T.House Bolton],house_shw[T.House Greyjoy],house_shw[T.House Lannister],house_shw[T.House Stark],house_shw[T.House Targaryen],house_shw[T.Nights Watch],house_shw[T.None],house_shw[T.Other]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [75]:
got_df = got_df.join(culture_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,culture_bk[T.Vale],culture_bk[T.Vale ],culture_bk[T.Valem],culture_bk[T.Valyr],culture_bk[T.Weste],culture_bk[T.Wildl],culture_bk[T.free ],culture_bk[T.ironb],culture_bk[T.north],culture_bk[T.weste]
0,Abelar Hightower,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Addam,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Addam Frey,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Addam Marbrand,0,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Addam Osgrey,1,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
got_df.drop(columns=['Intercept'], inplace=True)

In [77]:
got_df = got_df.join(house_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,house_bk[T.HOUSE OSGREY],house_bk[T.HOUSE STARK],house_bk[T.HOUSE TARGARYEN],house_bk[T.HOUSE TULLY],house_bk[T.HOUSE TYRELL],house_bk[T.HOUSE WHENT],house_bk[T.NIGHTS WATCH],house_bk[T.OTHER],house_bk[T.STONE CROWS],house_bk[T.UNKNOWN]
0,Abelar Hightower,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Addam,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Addam Frey,0,1,0,0,0,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Addam Marbrand,0,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Addam Osgrey,1,1,0,0,0,0,0,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
got_df.drop(columns=['Intercept'], inplace=True)

In [79]:
got_df = got_df.join(houseshw_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,house_bk[T.UNKNOWN],Intercept,house_shw[T.House Bolton],house_shw[T.House Greyjoy],house_shw[T.House Lannister],house_shw[T.House Stark],house_shw[T.House Targaryen],house_shw[T.Nights Watch],house_shw[T.None],house_shw[T.Other]
0,Abelar Hightower,0,1,0,0,0,0,0,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Addam,0,1,0,0,0,0,0,,,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Addam Frey,0,1,0,0,0,0,0,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Addam Marbrand,0,1,1,1,1,1,1,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Addam Osgrey,1,1,0,0,0,0,0,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
got_df.drop(columns=['Intercept'], inplace=True)
got_df.columns

Index(['name_x', 'isdead_bk', 'ismale', 'book1', 'book2', 'book3', 'book4',
       'book5', 'isalivemother_bk', 'isalivefather_bk',
       ...
       'house_bk[T.STONE CROWS]', 'house_bk[T.UNKNOWN]',
       'house_shw[T.House Bolton]', 'house_shw[T.House Greyjoy]',
       'house_shw[T.House Lannister]', 'house_shw[T.House Stark]',
       'house_shw[T.House Targaryen]', 'house_shw[T.Nights Watch]',
       'house_shw[T.None]', 'house_shw[T.Other]'],
      dtype='object', length=121)

### DONE!  Save data to a .csv file and transfer to local machine

In [81]:
got_df.to_csv('csv/SQL_GOT_data.csv')

To pull this file from the local machine:  
> scp myaws:~/GOT/csv/SQL_GOT_data.csv ../csv

## INNER JOIN
**Characters that appear both in the Book and in the Tv Show**

In [82]:
got_df=pd.merge(book_df, show_df, how='inner', left_on='name', right_on='match_name')
got_df.columns

Index(['name_x', 'isdead_bk', 'ismale', 'book1', 'book2', 'book3', 'book4',
       'book5', 'isalivemother_bk', 'isalivefather_bk', 'isaliveheir_bk',
       'isalivespouse_bk', 'ismarried_bk', 'isnoble_bk',
       'booldeadrelations_bk', 'ispopular_bk', 'death_bk', 'apperance_chp',
       'hasmom_bk', 'hasdad_bk', 'hasheir_bk', 'hasspouse_bk', 'age_bk',
       'numdeadrelations_bk', 'popularity_bk', 'culture_bk', 'house_bk',
       'name_y', 'isdead_shw', 'death_season', 'time_s1', 'time_s2', 'time_s3',
       'time_s4', 'total_episode_num', 'season1', 'season2', 'season3',
       'season4', 'house_shw', 'age_shw', 'match_name'],
      dtype='object')

In [83]:
total = len(got_df)
total

115

# Analysis of the Values (Remove Nan)

In [84]:
for c in got_df.columns[2:] :
    print(c, got_df[c].unique())

ismale [1 0]
book1 [1 0]
book2 [1 0]
book3 [1 0]
book4 [1 0]
book5 [1 0]
isalivemother_bk [nan  1.  0.]
isalivefather_bk [nan  0.  1.]
isaliveheir_bk [nan  1.]
isalivespouse_bk [nan  1.  0.]
ismarried_bk [ 0. nan  1.]
isnoble_bk [1 0]
booldeadrelations_bk [ 0.  1. nan]
ispopular_bk [ 0.  1. nan]
death_bk [nan  2.  1.  4.  3.  5.]
apperance_chp [19. 35. 14. 30.  2. nan 15.  5. 42. 22. 28.  4. 23.  3.  0. 57. 11.  1.
 13. 38. 27. 29. 37. 24. 48. 21. 43. 56. 47. 18. 62. 10.  8. 61.  7. 59.
 51. 26. 12. 20. 64. 54. 34. 55. 53. 33. 70.  6. 44. 25.]
hasmom_bk [0 1]
hasdad_bk [0 1]
hasheir_bk [0 1]
hasspouse_bk [0 1]
age_bk [nan 32. 16. 69. 38. 23. 20. 41. 63. 35. 39. 21. 45. 36. 24. 22. 19. 81.
 51. 56. 34. 49. 14. 42. 17. 15. 43. 37. 84. 10. 30. 29. 27. 58. 97.]
numdeadrelations_bk [ 0.  8.  4.  5. nan  7.  9. 15.  1.  6.  2.  3. 10. 12. 11.]
popularity_bk [0.23076923 0.07023411 0.27759197 0.13043478 1.         0.73913043
 0.40133779 0.43478261 0.34782609        nan 0.47157191 0.34113712
 0

In [85]:
#If a Character is not found in the Book Set, Is not popular
got_df['ispopular_bk'] = got_df['ispopular_bk'].fillna(0)

In [86]:
# Fill Other Nan values with mean value of that column
got_df['booldeadrelations_bk'] = got_df['booldeadrelations_bk'].fillna(int(np.mean(got_df['booldeadrelations_bk'])))
got_df['apperance_chp'] = got_df['apperance_chp'].fillna(int(np.mean(got_df['apperance_chp'])))
got_df['age_bk'] = got_df['age_bk'].fillna(int(np.mean(got_df['age_bk'])))
got_df['numdeadrelations_bk'] = got_df['numdeadrelations_bk'].fillna(int(np.mean(got_df['numdeadrelations_bk'])))
got_df['age_shw'] = got_df['age_shw'].fillna(int(np.mean(got_df['age_shw'])))
got_df['popularity_bk'] = got_df['popularity_bk'].fillna(int(np.mean(got_df['popularity_bk'])))

In [89]:
# Fill values for characters in books but not in show
got_df['isdead_shw'] = got_df['isdead_shw'].fillna(0)
got_df['time_s1'] = got_df['time_s1'].fillna(0)
got_df['time_s2'] = got_df['time_s2'].fillna(0)
got_df['time_s3'] = got_df['time_s3'].fillna(0)
got_df['time_s4'] = got_df['time_s4'].fillna(0)
got_df['total_episode_num'] = got_df['total_episode_num'].fillna(0)
got_df['season1'] = got_df['season1'].fillna(0)
got_df['season2'] = got_df['season2'].fillna(0)
got_df['season3'] = got_df['season3'].fillna(0)
got_df['season4'] = got_df['season4'].fillna(0)
got_df['house_shw'] = got_df['house_shw'].fillna('Other')

In [90]:
for c in got_df.columns[1:] :
    print(c, got_df[c].unique())

isdead_bk [0 1]
ismale [1 0]
book1 [1 0]
book2 [1 0]
book3 [1 0]
book4 [1 0]
book5 [1 0]
isalivemother_bk [nan  1.  0.]
isalivefather_bk [nan  0.  1.]
isaliveheir_bk [nan  1.]
isalivespouse_bk [nan  1.  0.]
ismarried_bk [ 0. nan  1.]
isnoble_bk [1 0]
booldeadrelations_bk [0. 1.]
ispopular_bk [0. 1.]
death_bk [nan  2.  1.  4.  3.  5.]
apperance_chp [19. 35. 14. 30.  2. 23. 15.  5. 42. 22. 28.  4.  3.  0. 57. 11.  1. 13.
 38. 27. 29. 37. 24. 48. 21. 43. 56. 47. 18. 62. 10.  8. 61.  7. 59. 51.
 26. 12. 20. 64. 54. 34. 55. 53. 33. 70.  6. 44. 25.]
hasmom_bk [0 1]
hasdad_bk [0 1]
hasheir_bk [0 1]
hasspouse_bk [0 1]
age_bk [33. 32. 16. 69. 38. 23. 20. 41. 63. 35. 39. 21. 45. 36. 24. 22. 19. 81.
 51. 56. 34. 49. 14. 42. 17. 15. 43. 37. 84. 10. 30. 29. 27. 58. 97.]
numdeadrelations_bk [ 0.  8.  4.  5.  1.  7.  9. 15.  6.  2.  3. 10. 12. 11.]
popularity_bk [0.23076923 0.07023411 0.27759197 0.13043478 1.         0.73913043
 0.40133779 0.43478261 0.34782609 0.         0.47157191 0.34113712
 0.170

# Categorical variables Generation

### Culture in the Book

In [91]:
got_df.culture_bk.unique()

array(['None', 'North', 'Weste', 'River', 'Free ', 'Tyros', 'Valyr',
       'Lysen', 'Dothr', 'Valem', 'Dorni', 'Ghisc', 'Pento', 'Crann',
       'Astap', 'The R', 'Reach', 'Ironb', 'Assha', 'Lhaza', 'Naath',
       'Storm', 'Braav', 'Myris', 'Ironm', 'Qarth', 'Wildl'], dtype=object)

In [92]:
culture_df=patsy.dmatrix('culture_bk',data=got_df,return_type='dataframe')
culture_df.head()

Unnamed: 0,Intercept,culture_bk[T.Astap],culture_bk[T.Braav],culture_bk[T.Crann],culture_bk[T.Dorni],culture_bk[T.Dothr],culture_bk[T.Free ],culture_bk[T.Ghisc],culture_bk[T.Ironb],culture_bk[T.Ironm],...,culture_bk[T.Qarth],culture_bk[T.Reach],culture_bk[T.River],culture_bk[T.Storm],culture_bk[T.The R],culture_bk[T.Tyros],culture_bk[T.Valem],culture_bk[T.Valyr],culture_bk[T.Weste],culture_bk[T.Wildl]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### House in the Book

In [93]:
got_df.house_bk.unique()

array(['OTHER', 'BROTHERHOOD WITHOUT BANNERS', 'HOUSE STARK', 'UNKNOWN',
       'HOUSE TULLY', 'HOUSE LANNISTER', 'HOUSE TARGARYEN',
       'HOUSE CLEGANE', "NIGHT'S WATCH", 'HOUSE BARATHEON', 'HOUSE ARRYN',
       'HOUSE TYRELL', 'HOUSE GREYJOY', 'HOUSE MARTELL', 'HOUSE FREY',
       'HOUSE BAELISH', 'HOUSE BOLTON', 'BRAVE COMPANIONS'], dtype=object)

In [94]:
# Remove special charactyers from column names
got_df['house_bk'].replace(["NIGHT'S WATCH"], 'NIGHTS WATCH', inplace=True)
got_df.house_bk.unique()

array(['OTHER', 'BROTHERHOOD WITHOUT BANNERS', 'HOUSE STARK', 'UNKNOWN',
       'HOUSE TULLY', 'HOUSE LANNISTER', 'HOUSE TARGARYEN',
       'HOUSE CLEGANE', 'NIGHTS WATCH', 'HOUSE BARATHEON', 'HOUSE ARRYN',
       'HOUSE TYRELL', 'HOUSE GREYJOY', 'HOUSE MARTELL', 'HOUSE FREY',
       'HOUSE BAELISH', 'HOUSE BOLTON', 'BRAVE COMPANIONS'], dtype=object)

In [95]:
house_df=patsy.dmatrix('house_bk',data=got_df,return_type='dataframe')
house_df.head()

Unnamed: 0,Intercept,house_bk[T.BROTHERHOOD WITHOUT BANNERS],house_bk[T.HOUSE ARRYN],house_bk[T.HOUSE BAELISH],house_bk[T.HOUSE BARATHEON],house_bk[T.HOUSE BOLTON],house_bk[T.HOUSE CLEGANE],house_bk[T.HOUSE FREY],house_bk[T.HOUSE GREYJOY],house_bk[T.HOUSE LANNISTER],house_bk[T.HOUSE MARTELL],house_bk[T.HOUSE STARK],house_bk[T.HOUSE TARGARYEN],house_bk[T.HOUSE TULLY],house_bk[T.HOUSE TYRELL],house_bk[T.NIGHTS WATCH],house_bk[T.OTHER],house_bk[T.UNKNOWN]
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### House in the Show

In [96]:
got_df.house_shw.unique()

array(["The Night's Wat", 'Other', 'House Lannister', 'House Stark',
       'House Greyjoy', 'None', 'House Targaryen', 'House Baratheon',
       'House Bolton'], dtype=object)

In [97]:
# Remove Special Characters from column names
got_df['house_shw'].replace(["The Night's Wat"], 'Nights Watch', inplace=True)
got_df.house_shw.unique()

array(['Nights Watch', 'Other', 'House Lannister', 'House Stark',
       'House Greyjoy', 'None', 'House Targaryen', 'House Baratheon',
       'House Bolton'], dtype=object)

In [98]:
houseshw_df=patsy.dmatrix('house_shw',data=got_df,return_type='dataframe')
houseshw_df.head()

Unnamed: 0,Intercept,house_shw[T.House Bolton],house_shw[T.House Greyjoy],house_shw[T.House Lannister],house_shw[T.House Stark],house_shw[T.House Targaryen],house_shw[T.Nights Watch],house_shw[T.None],house_shw[T.Other]
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [99]:
got_df = got_df.join(culture_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,culture_bk[T.Qarth],culture_bk[T.Reach],culture_bk[T.River],culture_bk[T.Storm],culture_bk[T.The R],culture_bk[T.Tyros],culture_bk[T.Valem],culture_bk[T.Valyr],culture_bk[T.Weste],culture_bk[T.Wildl]
0,Alliser Thorne,0,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alysane Mormont,0,0,0,0,0,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Amory Lorch,1,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Anguy,0,1,1,0,1,1,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arya Stark,0,0,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
got_df.drop(columns=['Intercept'], inplace=True)

In [101]:
got_df = got_df.join(house_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,house_bk[T.HOUSE GREYJOY],house_bk[T.HOUSE LANNISTER],house_bk[T.HOUSE MARTELL],house_bk[T.HOUSE STARK],house_bk[T.HOUSE TARGARYEN],house_bk[T.HOUSE TULLY],house_bk[T.HOUSE TYRELL],house_bk[T.NIGHTS WATCH],house_bk[T.OTHER],house_bk[T.UNKNOWN]
0,Alliser Thorne,0,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Alysane Mormont,0,0,0,0,0,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Amory Lorch,1,1,1,1,1,1,1,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Anguy,0,1,1,0,1,1,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arya Stark,0,0,1,1,1,1,1,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
got_df.drop(columns=['Intercept'], inplace=True)

In [103]:
got_df = got_df.join(houseshw_df)
got_df.head()

Unnamed: 0,name_x,isdead_bk,ismale,book1,book2,book3,book4,book5,isalivemother_bk,isalivefather_bk,...,house_bk[T.UNKNOWN],Intercept,house_shw[T.House Bolton],house_shw[T.House Greyjoy],house_shw[T.House Lannister],house_shw[T.House Stark],house_shw[T.House Targaryen],house_shw[T.Nights Watch],house_shw[T.None],house_shw[T.Other]
0,Alliser Thorne,0,1,1,1,1,1,1,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Alysane Mormont,0,0,0,0,0,1,1,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Amory Lorch,1,1,1,1,1,1,1,,,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Anguy,0,1,1,0,1,1,0,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Arya Stark,0,0,1,1,1,1,1,,,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [104]:
got_df.drop(columns=['Intercept'], inplace=True)
got_df.columns

Index(['name_x', 'isdead_bk', 'ismale', 'book1', 'book2', 'book3', 'book4',
       'book5', 'isalivemother_bk', 'isalivefather_bk', 'isaliveheir_bk',
       'isalivespouse_bk', 'ismarried_bk', 'isnoble_bk',
       'booldeadrelations_bk', 'ispopular_bk', 'death_bk', 'apperance_chp',
       'hasmom_bk', 'hasdad_bk', 'hasheir_bk', 'hasspouse_bk', 'age_bk',
       'numdeadrelations_bk', 'popularity_bk', 'culture_bk', 'house_bk',
       'name_y', 'isdead_shw', 'death_season', 'time_s1', 'time_s2', 'time_s3',
       'time_s4', 'total_episode_num', 'season1', 'season2', 'season3',
       'season4', 'house_shw', 'age_shw', 'match_name', 'culture_bk[T.Astap]',
       'culture_bk[T.Braav]', 'culture_bk[T.Crann]', 'culture_bk[T.Dorni]',
       'culture_bk[T.Dothr]', 'culture_bk[T.Free ]', 'culture_bk[T.Ghisc]',
       'culture_bk[T.Ironb]', 'culture_bk[T.Ironm]', 'culture_bk[T.Lhaza]',
       'culture_bk[T.Lysen]', 'culture_bk[T.Myris]', 'culture_bk[T.Naath]',
       'culture_bk[T.None]', 'cultur

### DONE!  Save data to a .csv file and transfer to local machine

In [105]:
got_df.to_csv('csv/SQLINNER_GOT_data.csv')

To pull this file from the local machine:  
> scp myaws:~/GOT/csv/SQLINNER_GOT_data.csv ../csv
