In [1]:
# Import Python libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot



In [2]:
# Import the Python machine learning libraries we need
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Import some convenience functions. These can be found on the course GitHub
from functions import *

In [4]:
# Read the CSV file into a DataFrame that the rest of the notebook works from
dataset = pd.read_csv(filepath_or_buffer="learn.csv")

In [5]:
# Inspect up to the first 12 rows of the data set
dataset.head(12)

Unnamed: 0,id,country,lifeexp,unemployment,happiness
0,0,Albania,77.6,6.09,Low
1,1,Bulgaria,75.0,3.24,Low
2,2,Iran,75.8,2.11,Low
3,3,Ukraine,71.9,1.53,Low
4,4,South Africa,61.8,7.52,Low
5,5,Ukraine,71.9,1.53,Low
6,6,Austria,81.4,1.43,High
7,7,Croatia,77.3,5.53,High
8,8,Denmark,80.7,1.36,High
9,9,Portugal,80.8,4.37,High


In [6]:
# Inspect the data shape as a (rows, columns) tuple
dataset.shape

(12, 5)

In [7]:
# Inspect descriptive statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
dataset.describe()

Unnamed: 0,id,lifeexp,unemployment
count,12.0,12.0,12.0
mean,5.5,75.741667,2.910833
std,3.605551,5.375104,2.434526
min,0.0,61.8,0.06
25%,2.75,74.225,1.4125
50%,5.5,77.2,1.82
75%,8.25,78.375,4.66
max,11.0,81.4,7.52


In [8]:
# Separate the predictor columns (X) from the target label column (y),
# then preview the predictors
X = dataset.loc[:, ["lifeexp", "unemployment"]]
y = dataset.loc[:, "happiness"]
X.head(12)

Unnamed: 0,lifeexp,unemployment
0,77.6,6.09
1,75.0,3.24
2,75.8,2.11
3,71.9,1.53
4,61.8,7.52
5,71.9,1.53
6,81.4,1.43
7,77.3,5.53
8,80.7,1.36
9,80.8,4.37


In [9]:
# Preview up to the first 12 target labels
y.head(12)

0      Low
1      Low
2      Low
3      Low
4      Low
5      Low
6     High
7     High
8     High
9     High
10    High
11    High
Name: happiness, dtype: object

In [10]:
# Partition the features and labels into training and test subsets.
# test_size is the fraction of rows held out for testing; seed fixes the
# shuffle so the same split is produced on every run.
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [11]:
# Show the feature rows assigned to the training split
X_train

Unnamed: 0,lifeexp,unemployment
0,77.6,6.09
1,75.0,3.24
11,77.6,0.06
8,80.7,1.36
3,71.9,1.53
6,81.4,1.43
9,80.8,4.37
4,61.8,7.52


In [12]:
# Show the feature rows assigned to the test split
X_test

Unnamed: 0,lifeexp,unemployment
7,77.3,5.53
10,77.1,0.16
2,75.8,2.11
5,71.9,1.53


In [13]:
# Show the labels that accompany the training rows
y_train

0      Low
1      Low
11    High
8     High
3      Low
6     High
9     High
4      Low
Name: happiness, dtype: object

In [14]:
# Show the labels that accompany the test rows
y_test

7     High
10    High
2      Low
5      Low
Name: happiness, dtype: object

In [15]:
# Select algorithm.
# Pin random_state: the tree builder randomly permutes the features when it
# searches for splits, so without a fixed seed equally good splits can be
# chosen differently from run to run and the fitted tree (and the reported
# accuracy) may change on re-execution. Reuses the `seed` defined alongside
# the train/test split so the whole notebook is driven by one value.
model = DecisionTreeClassifier(random_state=seed)

In [16]:
# Fit the model to the training split only; the test rows stay unseen
model.fit(X_train, y_train)

In [17]:
# Score the fitted model on the same rows it was trained on
predictions = model.predict(X_train)
print(accuracy_score(y_true=y_train, y_pred=predictions))

1.0


In [18]:
# Predict labels for the held-out test rows (this overwrites the training
# predictions stored under the same name)
predictions = model.predict(X_test)

In [19]:
# Show the predicted labels for the test rows
predictions

array(['Low', 'High', 'Low', 'Low'], dtype=object)

In [20]:
# Fraction of test rows whose predicted label matches the true label
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.75


In [None]:
# Put each test row next to its true label and the model's prediction
# so individual misclassifications are easy to spot
df = X_test.assign(Actual=y_test, Prediction=predictions)
df

Unnamed: 0,lifeexp,unemployment,Actual,Prediction
7,77.3,5.53,High,Low
10,77.1,0.16,High,High
2,75.8,2.11,Low,Low
5,71.9,1.53,Low,Low
