In [2]:
# Objective: to build a model from historical data of students
# and their response to different technology uses. 
# Then you use the trained decision tree to predict the class of an unknown student

# Importing libraries
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [6]:
df = pd.read_csv(r"C:\Users\lconc\OneDrive\Desktop\Data Science\datathreevar.csv")

In [7]:
df.head()

Unnamed: 0,RESPONDENT_AGE,SEX,GPA,HRSPC,VG,HRSCELL,HIGHLOWGPA
0,1,2,4,3,9,5,1
1,1,2,8,3,9,9,0
2,2,1,3,3,3,5,1
3,2,1,6,4,9,9,0
4,2,1,9,1,1,1,0


In [9]:
# X will be feature matrix and y will be response vector or target(high or low GPA)
X = df[['RESPONDENT_AGE', 'SEX', 'HRSPC', 'GPA', 'VG','HRSCELL']].values
X[0:5]

array([[1, 2, 3, 4, 9, 5],
       [1, 2, 3, 8, 9, 9],
       [2, 1, 3, 3, 3, 5],
       [2, 1, 4, 6, 9, 9],
       [2, 1, 1, 9, 1, 1]], dtype=int64)

In [10]:
y = df["HIGHLOWGPA"]
y[0:5]

0    1
1    0
2    1
3    0
4    0
Name: HIGHLOWGPA, dtype: int64

In [11]:
# Setting up decision tree - split dataset in train and test
from sklearn.model_selection import train_test_split

In [12]:
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

In [14]:
# Create an instance of the DecisionTreeClassifier called studentHighLowGPA
studentHighLowGPA = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
studentHighLowGPA # it shows the default parameters

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
# Fit the data with the training feature matrix X_trainset and training response vector y_trainset 
studentHighLowGPA.fit(X_trainset,y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
# Making some predictions
predTree = studentHighLowGPA.predict(X_testset)

In [17]:
# Print 
print (predTree [0:5])
print (y_testset [0:5])

[1 0 1 0 0]
610     1
889     0
1767    1
774     0
361     0
Name: HIGHLOWGPA, dtype: int64


In [18]:
# Evaluating
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  1.0


In [19]:
# Visualizing tree
!conda install -c conda-forge pydotplus -y
!conda install -c conda-forge python-graphviz -y

Collecting package metadata: ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\Anaconda3

  added / updated specs:
    - pydotplus


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.6.14               |           py37_0         2.1 MB  conda-forge
    graphviz-2.38.0            |    h6538335_1011        41.0 MB  conda-forge
    pydotplus-2.0.2            |             py_2          23 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        43.1 MB

The following NEW packages will be INSTALLED:

  graphviz           conda-forge/win-64::graphviz-2.38.0-h6538335_1011
  pydotplus          conda-forge/noarch::pydotplus-2.0.2-py_2

The following packages will be UPDATED:

  conda                      pkgs/main::conda-4.6.11-py37_0 --> con



  current version: 4.6.11
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda



EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: C:\ProgramData\Anaconda3




Collecting package metadata: ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\Anaconda3

  added / updated specs:
    - python-graphviz


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python-graphviz-0.10.1     |             py_0          17 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          17 KB

The following NEW packages will be INSTALLED:

  graphviz           conda-forge/win-64::graphviz-2.38.0-h6538335_1011
  python-graphviz    conda-forge/noarch::python-graphviz-0.10.1-py_0

The following packages will be UPDATED:

  conda                      pkgs/main::conda-4.6.11-py37_0 --> conda-forge::conda-4.6.14-py37_0

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi                              



  current version: 4.6.11
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda



EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: C:\ProgramData\Anaconda3




In [22]:
from sklearn.externals.six import StringIO
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [24]:
dot_data = StringIO()
filename = "studentHighLowGPA.png"
featureNames = df.columns[0:6]
targetNames = df["HIGHLOWGPA"].unique().tolist()
out=tree.export_graphviz(studentHighLowGPA,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

TypeError: can only concatenate str (not "numpy.int64") to str