# Decision Tree 
## Train & Test

---

In [26]:
import numpy as np
import pandas as pd 
from sklearn import tree
from sklearn import preprocessing
dataset = pd.read_csv('train.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Checking Null values

In [27]:
dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

### Droping the unnecessary records

In [28]:
dataset.drop(['PassengerId','Name','Ticket','Cabin'],axis=1,inplace=True)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### Convert Sex into 0 or 1

In [29]:
label_encoder = preprocessing.LabelEncoder()
dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

---

# Decision Tree

In [30]:
predictors = pd.DataFrame([dataset['Sex'],dataset['Age'],dataset['Fare']]).T

In [31]:
tree_model = tree.DecisionTreeClassifier(max_depth=6)
tree_model.fit(X=predictors, y=dataset['Survived'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### - Graphviz

In [7]:
with open('Dtree1.dot','w') as f :
    f = tree.export_graphviz(tree_model, feature_names=['Sex','Age','Fare'], out_file=f);

### - Accuracy

In [8]:
Accuracy = tree_model.score(X=predictors, y=dataset['Survived'])
Accuracy

0.8301462317210349

In [9]:
print('Accuracy of 3 IDV and 1 DV ')
print('Accuracy :',Accuracy*100,'%')

Accuracy of 3 IDV and 1 DV 
Accuracy : 83.01462317210348 %


---

## Predict the dataset

In [32]:
dataset_test = pd.read_csv('test.csv')
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


### - Checking Null values

In [11]:
dataset.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### - Convert Sex and Embarked into numeric

In [33]:
label_encoder2 = preprocessing.LabelEncoder()
dataset['Sex'] = label_encoder2.fit_transform(dataset['Sex'])

In [34]:
dataset['Embarked'] = label_encoder2.fit_transform(dataset['Embarked'])

In [35]:
dataset['Embarked'].head()

0    2
1    0
2    2
3    2
4    2
Name: Embarked, dtype: int32

In [23]:
dataset['Sex'].head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [37]:
dataset_test["Sex"] = label_encoder2.fit_transform(dataset_test["Sex"])

In [38]:
test_features1 = dataset_test[['Age', 'Sex', 'Fare']]
test_features1.head()

Unnamed: 0,Age,Sex,Fare
0,34.5,1,7.8292
1,47.0,0,7.0
2,62.0,1,9.6875
3,27.0,1,8.6625
4,22.0,0,12.2875


##  predict

In [40]:
test_preds = tree_model.predict(test_features1)

In [42]:
predicted_output = pd.DataFrame({'passengerId':dataset_test['PassengerId'],'Survived':test_preds})

### - Creating output csv file

In [43]:
predicted_output.to_csv('Output.csv',index=False)

### Output csv file

In [45]:
data_output = pd.read_csv('Output.csv')
data_output.head()

Unnamed: 0,passengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1
