In [2]:
import numpy as np

# data processing
import pandas as pd


# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn.tree import DecisionTreeClassifier


In [3]:
data_train = pd.read_csv('../data/train.csv')
data_test = pd.read_csv('../data/test.csv')
data_gender = pd.read_csv('../data/gender_submission.csv')

In [4]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
alldata = [data_train, data_test]

In [7]:
data_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# data_train.corr()

In [9]:
data_train.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [10]:
data_train.columns.values.shape

(12,)

In [11]:
data_train['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [12]:
data_train['Embarked'].isna().sum()

2

In [13]:
common_value = 'S'

for dataset in alldata:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

In [14]:
data_train['Embarked'].isna().sum()

0

In [15]:
for dataset in alldata:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)

In [16]:
genders = {'male': 0, 'female': 1}

for dataset in alldata:
    dataset['Sex'] = dataset['Sex'].map(genders)

In [17]:
ports = {"S": 0, "C": 1, "Q": 2}

for dataset in alldata:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

In [18]:
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8,,0


In [19]:
data_train = data_train.drop(['PassengerId','Name','Ticket','Cabin','Fare'], axis=1)
data_test  = data_test.drop(['PassengerId','Name','Ticket','Cabin','Fare'], axis=1)

In [20]:
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.0,1,0,0
1,1,1,1,38.0,1,0,1
2,1,3,1,26.0,0,0,0
3,1,1,1,35.0,1,0,0
4,0,3,0,35.0,0,0,0


In [21]:
data_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,0,34.5,0,0,2
1,3,1,47.0,1,0,0
2,2,0,62.0,0,0,2
3,3,0,27.0,0,0,0
4,3,1,22.0,1,1,0


In [22]:
data_test.insert(loc=0, column='Survived', value=data_gender['Survived'].values)



In [23]:
data_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,34.5,0,0,2
1,1,3,1,47.0,1,0,0
2,0,2,0,62.0,0,0,2
3,0,3,0,27.0,0,0,0
4,1,3,1,22.0,1,1,0


In [24]:
data_test.shape

(418, 7)

In [25]:
data_train['Age'].isna().sum()

177

In [26]:
np.random.seed(0)

In [27]:
mean = data_train["Age"].mean()
std = data_train["Age"].std()
list_isna = data_train['Age'].isna()
for line in range(list_isna.shape[0]):
    if(list_isna[line]==True):
      #print("Antes",line,data_train.iloc[line]["Age"])
      rand_age = np.random.randint(mean - std, mean + std, 1)
      #print(rand_age)
      data_train.at[line,'Age']=rand_age
      #print("Depois",line,data_train.iloc[line]["Age"])


In [28]:

data_train["Age"].isnull().sum()

0

In [29]:
data_test["Age"].isnull().sum()

86

In [30]:
mean = data_test["Age"].mean()
std = data_test["Age"].std()
list_isna = data_test['Age'].isna()
for line in range(list_isna.shape[0]):
    if(list_isna[line]==True):
      #print("Antes",line,data_test.iloc[line]["Age"])
      rand_age = np.random.randint(mean - std, mean + std, 1)
      #print(rand_age)
      data_test.at[line,'Age']=rand_age
      #print("Depois",line,data_test.iloc[line]["Age"])

In [31]:
data_test["Age"].isnull().sum()

0

In [32]:
data_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.0,1,0,0
1,1,1,1,38.0,1,0,1
2,1,3,1,26.0,0,0,0
3,1,1,1,35.0,1,0,0
4,0,3,0,35.0,0,0,0


In [33]:
data_train["Age"] = data_train["Age"].astype(int)
data_test["Age"] = data_test["Age"].astype(int)

In [34]:
X = data_train.drop(['Age'], axis=1).values
Y = data_train['Age'].values

In [35]:
#https://mljar.com/blog/visualize-decision-tree/

In [36]:
attributes = data_train.columns.values.tolist()
del attributes[0]
print(attributes)

#decisoes = np.unique(data_train['Survived'].values).tolist()
#print(decisoes)
#decisoes = map(str, decisoes)
#print(decisoes)
decisoes = ['0','1']


['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']


In [37]:
# X_test = data_test.values
# Y_pred = clf.predict(X_test)

In [38]:
# Y_test = data_gender['Survived'].values
# print(Y_test)

In [39]:
#from sklearn.metrics import accuracy_score

#accuracy_score(Y_test, Y_pred)

In [40]:
#from sklearn.metrics import balanced_accuracy_score
#balanced_accuracy_score(Y_test, Y_pred)

In [41]:
data_train.columns.values

array(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'],
      dtype=object)

In [42]:
data_train.drop(['Survived'], axis=1).values

array([[ 3,  0, 22,  1,  0,  0],
       [ 1,  1, 38,  1,  0,  1],
       [ 3,  1, 26,  0,  0,  0],
       ...,
       [ 3,  1, 34,  1,  2,  0],
       [ 1,  0, 26,  0,  0,  1],
       [ 3,  0, 32,  0,  0,  2]])

In [43]:
data_train['Survived'].values

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [44]:
aux = np.where(data_gender['Survived'].values==0)[0]
aux.shape

(266,)

In [45]:
data_gender.shape

(418, 2)

In [46]:
data_gender.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [47]:
data_test['Survived'] = data_gender['Survived']

In [48]:
data_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,34,0,0,2
1,1,3,1,47,1,0,0
2,0,2,0,62,0,0,2
3,0,3,0,27,0,0,0
4,1,3,1,22,1,1,0


In [49]:
data_test.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
Survived,1.0,-0.108615,1.0,-0.003778,0.099943,0.15912,0.126779
Pclass,-0.108615,1.0,-0.108615,-0.43029,0.001087,0.018721,0.031096
Sex,1.0,-0.108615,1.0,-0.003778,0.099943,0.15912,0.126779
Age,-0.003778,-0.43029,-0.003778,1.0,-0.06748,-0.050601,0.057969
SibSp,0.099943,0.001087,0.099943,-0.06748,1.0,0.306895,-0.100603
Parch,0.15912,0.018721,0.15912,-0.050601,0.306895,1.0,-0.125164
Embarked,0.126779,0.031096,0.126779,0.057969,-0.100603,-0.125164,1.0


In [50]:
# 1- juntar todos os dados em apenas uma planilha;
# 2- separar os dados em k-folds, onde k=5;
# 3- realizar tarefa de regressão de AGE utilizando modelos lineares;
# 4- Utilizar as medidas de avaliação MSE, MAE e MAPE, guardando numa lista cada valor de k para cada medida;
# 5- Criar gráficos de barras ou candlestick com média e desvio padrão de cada medida;
# 6- Repetir mesmos passos 1--5 para o algoritmo de árvores de decisão;




In [51]:
full_data = pd.concat([data_train, data_test])
full_data.shape

(1309, 7)

In [52]:
nulos = np.where(full_data['Age'].values==0)
print(nulos)
full_data.values[nulos,3] = 1

(array([  78,  305,  469,  644,  755,  803,  831, 1092, 1141, 1172, 1198,
       1245]),)


In [53]:
np.where(full_data['Age'].values==0)

(array([  78,  305,  469,  644,  755,  803,  831, 1092, 1141, 1172, 1198,
        1245]),)

In [54]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5,random_state=0,shuffle=True)

In [55]:
for i, (train_index, test_index) in enumerate(kf.split(full_data)):
  print(f"Fold {i}:")
  print(f"  Train Size={train_index.shape}")
  print(f"  Test Size={test_index.shape}")
  print(f"  Train: index={train_index}")
  print(f"  Test:  index={test_index}")

  Y_train = full_data.values[train_index][:,3]
  X_train = full_data.values[train_index]
  X_train = np.delete(X_train, 3, 1)


  Y_test = full_data.values[test_index][:,3]
  X_test = full_data.values[test_index]
  X_test = np.delete(X_test, 3, 1)


Fold 0:
  Train Size=(1047,)
  Test Size=(262,)
  Train: index=[   0    2    3 ... 1304 1306 1307]
  Test:  index=[   1    5    8    9   14   18   27   31   34   39   45   47   49   52
   55   58   61   75   80   85   92  108  124  140  141  142  152  156
  159  161  184  186  192  194  198  202  204  211  215  224  227  231
  233  240  241  247  251  254  268  270  276  283  295  298  299  302
  303  310  312  317  319  320  328  330  342  349  358  359  361  381
  386  390  393  397  406  408  412  416  418  420  422  425  426  427
  434  435  436  438  442  443  445  446  453  458  459  461  466  472
  474  477  478  482  483  486  491  503  505  506  511  512  513  523
  528  531  533  535  539  541  549  554  557  569  571  589  597  601
  608  609  610  618  625  632  638  641  642  649  651  656  658  669
  678  693  695  717  725  733  743  753  759  760  762  768  769  772
  773  785  790  792  793  794  795  796  798  813  814  817  833  839
  846  853  857  858  874  875  87

In [57]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)

reg.fit(X_train, Y_train)

reg.predict([[1, 1]])

[[0 3 0 1 0 0]
 [1 1 1 1 0 1]
 [1 3 1 0 0 0]
 ...
 [0 3 0 0 0 0]
 [0 3 0 0 0 0]
 [0 3 0 1 1 1]]
[22 38 26 ... 38 27 16]


ValueError: X has 2 features, but Lasso is expecting 6 features as input.