### **Classification using scikit-learn and keras (with pandas)**

In [1]:
# Set-up
import csv
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras import Sequential
from keras.layers import Dense
from numpy.random import seed
import tensorflow

In [2]:
# Upload all data files - must be on local computer
# Cities.csv, Players.csv, Titanic.csv
# If running notebook on local computer:
#   No need to run this cell (it will generate an error)
#   Make sure data files are in same workspace as notebook
from google.colab import files
uploaded = files.upload()

Saving Cities.csv to Cities.csv
Saving Players.csv to Players.csv
Saving Titanic.csv to Titanic.csv


### Prepare Cities data for classification
Predict <i>temperature category</i> from other features

In [3]:
# Read Cities.csv into dataframe, add column for temperature category
# Note: For a dataframe D and integer i, D.loc[i] is the i-th row of D
f = open('Cities.csv')
cities = pd.read_csv(f)
cats = []
for i in range(len(cities)):
    if cities.loc[i]['temperature'] < 5:
        cats.append('cold')
    elif cities.loc[i]['temperature'] < 9:
        cats.append('cool')
    elif cities.loc[i]['temperature'] < 15:
        cats.append('warm')
    else: cats.append('hot')
cities['category'] = cats
print("cold:", len(cities[(cities.category == 'cold')]))
print("cool:", len(cities[(cities.category == 'cool')]))
print("warm:", len(cities[(cities.category == 'warm')]))
print("hot:", len(cities[(cities.category == 'hot')]))

cold: 17
cool: 92
warm: 79
hot: 25


In [4]:
# Create training and test sets for cities data
numitems = len(cities)
percenttrain = 0.85
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
citiesTrain = cities[0:numtrain]
citiesTest = cities[numtrain:]

Training set 181 items
Test set 32 items


### K-nearest-neighbors classification

In [15]:
features = ['longitude', 'latitude']
neighbors = 10
classifier = KNeighborsClassifier(neighbors)
classifier.fit(citiesTrain[features], citiesTrain['category'])
predictions = classifier.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
    #print('Predicted:', predictions[i], ' Actual:', citiesTest.loc[numtrain+i]['category'])
    if predictions[i] == citiesTest.loc[numtrain+i]['category']: correct +=1
print('Accuracy:', correct/numtest)
# Comment out print, try different values for neighbors, different features

Accuracy: 1.0


### <font color="green">**Your Turn: K-nearest-neighbors on World Cup data**</font>
<font color="green">Predict <i>position</i> from one or more of <i>minutes, shots, passes, tackles, saves</i></font>

In [6]:
# This cell does all the set-up, including reordering the data to avoid team bias.
f = open('Players.csv')
players = pd.read_csv(f)
players = players.sort_values(by='surname')
players = players.reset_index(drop=True)
numitems = len(players)
percenttrain = 0.92
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
playersTrain = players[0:numtrain]
playersTest = players[numtrain:]

Training set 547 items
Test set 48 items


In [43]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['shots', 'tackles']
neighbors = 30
classifier = KNeighborsClassifier(neighbors)
classifier.fit(playersTrain[features], playersTrain['position'])
predictions = classifier.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', playersTest.loc[numtrain+i]['position'])
    if predictions[i] == playersTest.loc[numtrain+i]['position']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.7083333333333334


### <font color="green">**Your Turn Extra: K-nearest-neighbors on Titanic data**</font>
<font color="green">Predict <i>survived</i> from one or more of <i>gender, age, class, fare, embarked</i></font>

In [8]:
# This cell does all the set-up
f = open('Titanic.csv')
titanic = pd.read_csv(f)
# Convert gender and embarked to numeric values and missing ages to average age
agesNN = []
for i in range(len(titanic)):
  if not pd.isnull(titanic.loc[i]['age']):
    agesNN.append(titanic.loc[i]['age'])
avgage = np.average(agesNN)
ages = []
genders = []
embarkeds = []
#filling up missing datas
for i in range(len(titanic)):
    if pd.isnull(titanic.loc[i]['age']): ages.append(avgage)
    else: ages.append(titanic.loc[i]['age'])
    if titanic.loc[i]['gender'] == 'M': genders.append(0)
    else: genders.append(1)
    if titanic.loc[i]['embarked'] == 'Cherbourg': embarkeds.append(0)
    elif titanic.loc[i]['embarked'] == 'Southampton': embarkeds.append(1)
    else: embarkeds.append(2)
titanic['age'] = ages
titanic['gender'] = genders
titanic['embarked'] = embarkeds
# Create training and test sets
numitems = len(titanic)
percenttrain = 0.92
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
titanicTrain = titanic[0:numtrain]
titanicTest = titanic[numtrain:]

Training set 819 items
Test set 72 items


In [49]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['gender', 'class']
neighbors = 10
classifier = KNeighborsClassifier(neighbors)
classifier.fit(titanicTrain[features], titanicTrain['survived'])
predictions = classifier.predict(titanicTest[features])
# Calculate accuracy
numtrain = len(titanicTrain)
numtest = len(titanicTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', titanicTest.loc[numtrain+i]['survived'])
    if predictions[i] == titanicTest.loc[numtrain+i]['survived']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.8611111111111112


### Decision tree classification

In [61]:
features = ['longitude','latitude']
split = 10
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # split parameter is optional
dt.fit(citiesTrain[features], citiesTrain['category'])
predictions = dt.predict(citiesTest[features])
print(predictions)
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', citiesTest.loc[numtrain+i]['category'])
    if predictions[i] == citiesTest.loc[numtrain+i]['category']: correct +=1
print('Accuracy:', correct/numtest)
# Try different values for split, different features

['warm' 'hot' 'warm' 'hot' 'cold' 'cold' 'cool' 'warm' 'warm' 'cold'
 'cold' 'warm' 'cool' 'warm' 'warm' 'cool' 'warm' 'hot' 'cold' 'cold'
 'cold' 'hot' 'warm' 'warm' 'cool' 'cool' 'cool' 'warm' 'warm' 'cool'
 'warm' 'cool']
Accuracy: 0.75


### "Forest" of decision trees

In [None]:
features = ['longitude', 'latitude']
split = 10
trees = 10
rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(citiesTrain[features], citiesTrain['category'])
predictions = rf.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', citiesTest.loc[numtrain+i]['category'])
    if predictions[i] == citiesTest.loc[numtrain+i]['category']: correct +=1
print('Accuracy:', correct/numtest)
# Try different values for split and trees, different features

### <font color="green">**Your Turn: Decision tree and forest of trees on World Cup data**</font>

In [None]:
# SINGLE TREE
# Try different features and different values for split.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split = 10
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # parameter is optional
dt.fit(playersTrain[features], playersTrain['position'])
predictions = dt.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', playersTest.loc[numtrain+i]['position'])
    if predictions[i] == playersTest.loc[numtrain+i]['position']: correct +=1
print('Accuracy:', correct/numtest)

In [None]:
# FOREST OF TREES
# Try different features and different values for split and trees.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split = 10
trees = 10
rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(playersTrain[features], playersTrain['position'])
predictions = rf.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', playersTest.loc[numtrain+i]['position'])
    if predictions[i] == playersTest.loc[numtrain+i]['position']: correct +=1
print('Accuracy:', correct/numtest)

### <font color="green">**Your Turn Extra: Decision tree and forest of trees on Titanic data**</font>

In [63]:
# SINGLE TREE
# Try different features and different values for split.
# What's the highest accuracy you can get?
features = ['gender', 'age', 'class', 'fare', 'embarked']
split = 10
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # parameter is optional
dt.fit(titanicTrain[features], titanicTrain['survived'])
predictions = dt.predict(titanicTest[features])
# Calculate accuracy
numtrain = len(titanicTrain)
numtest = len(titanicTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', titanicTest.loc[numtrain+i]['position'])
    if predictions[i] == titanicTest.loc[numtrain+i]['survived']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.8055555555555556


In [64]:
features = ['gender', 'age', 'class', 'fare', 'embarked']
split = 10
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split) # parameter is optional
dt.fit(titanicTrain[features], titanicTrain['survived'])
predictions = dt.predict(titanicTest[features])
# Calculate accuracy
numtrain = len(titanicTrain)
numtest = len(titanicTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', titanicTest.loc[numtrain+i]['position'])
    if predictions[i] == titanicTest.loc[numtrain+i]['survived']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.8055555555555556


In [65]:
# FOREST OF TREES
# Try different features and different values for split and trees.
# What's the highest accuracy you can get?
features = ['gender', 'age', 'class', 'fare', 'embarked']
split = 10
trees = 10
rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(titanicTrain[features], titanicTrain['survived'])
predictions = rf.predict(titanicTest[features])
# Calculate accuracy
numtrain = len(titanicTrain)
numtest = len(titanicTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', titanicTest.loc[numtrain+i]['position'])
    if predictions[i] == titanicTest.loc[numtrain+i]['survived']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.7916666666666666


### Naive Bayes classification

In [67]:
features = ['latitude']
nb = GaussianNB()
nb.fit(citiesTrain[features], citiesTrain['category'])
predictions = nb.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', citiesTest.loc[numtrain+i]['category'])
    if predictions[i] == citiesTest.loc[numtrain+i]['category']: correct +=1
print('Accuracy:', correct/numtest)
# Try different features

Accuracy: 0.84375


### <font color="green">**Your Turn: Naive Bayes on World Cup data**</font>

In [68]:
# Try different features. What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
nb = GaussianNB()
nb.fit(playersTrain[features], playersTrain['position'])
predictions = nb.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', playersTest.loc[numtrain+i]['position'])
    if predictions[i] == playersTest.loc[numtrain+i]['position']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.6875


### <font color="green">**Your Turn Extra: Naive Bayes on Titanic data**</font>

In [73]:
# Try different features. What's the highest accuracy you can get?
features = ['gender','class',]
nb = GaussianNB()
nb.fit(titanicTrain[features], titanicTrain['survived'])
predictions = nb.predict(titanicTest[features])
# Calculate accuracy
numtrain = len(titanicTrain)
numtest = len(titanicTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', titanicTest.loc[numtrain+i]['position'])
    if predictions[i] == titanicTest.loc[numtrain+i]['survived']: correct +=1
print('Accuracy:', correct/numtest)

Accuracy: 0.8194444444444444


### Neural network classification

In [74]:
features = ['longitude', 'latitude']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'yes'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(citiesTrain[features])
featurevals_test = sc.fit_transform(citiesTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(cities['category'])
labels_train = encoder.transform(citiesTrain['category'])
labels_test = encoder.transform(citiesTest['category'])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))
# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])
# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)
# Try different values for num_layers, num_epochs, batch size, layer_outputs, and different features

Epoch 1/10
10/10 - 1s - loss: 1.3516 - accuracy: 0.3978 - 1s/epoch - 100ms/step
Epoch 2/10
10/10 - 0s - loss: 1.2857 - accuracy: 0.6298 - 17ms/epoch - 2ms/step
Epoch 3/10
10/10 - 0s - loss: 1.2210 - accuracy: 0.6740 - 28ms/epoch - 3ms/step
Epoch 4/10
10/10 - 0s - loss: 1.1626 - accuracy: 0.6685 - 19ms/epoch - 2ms/step
Epoch 5/10
10/10 - 0s - loss: 1.0949 - accuracy: 0.6740 - 18ms/epoch - 2ms/step
Epoch 6/10
10/10 - 0s - loss: 1.0288 - accuracy: 0.6630 - 19ms/epoch - 2ms/step
Epoch 7/10
10/10 - 0s - loss: 0.9570 - accuracy: 0.6685 - 20ms/epoch - 2ms/step
Epoch 8/10
10/10 - 0s - loss: 0.8957 - accuracy: 0.6630 - 24ms/epoch - 2ms/step
Epoch 9/10
10/10 - 0s - loss: 0.8453 - accuracy: 0.6575 - 19ms/epoch - 2ms/step
Epoch 10/10
10/10 - 0s - loss: 0.8020 - accuracy: 0.6740 - 19ms/epoch - 2ms/step
Final accuracy on training data: 0.6740331649780273
Accuracy on test data: 0.65625


### <font color="green">**Your Turn: Neural network on World Cup data**</font>

In [None]:
# Try different features and different values for num_layers, num_epochs,
#  batch size, and layer_outputs.
# What's the highest accuracy you can get?
# Note: Although some randomness is removed by setting seeds in the code,
#  you may still see somewhat different accuracy on different runs;
#  changing the order of the features can also affect accuracy
features = ['minutes', 'shots', 'tackles', 'saves']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'no'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(playersTrain[features])
featurevals_test = sc.fit_transform(playersTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(players['position'])
labels_train = encoder.transform(playersTrain['position'])
labels_test = encoder.transform(playersTest['position'])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))
# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])
# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)

### <font color="green">**Your Turn Extra: Neural network on Titanic data**</font>

In [None]:
# Try different features and different values for num_layers, num_epochs,
#  batch size, and layer_outputs.
# What's the highest accuracy you can get?
# Note: Although some randomness is removed by setting seeds in the code,
#  you may still see somewhat different accuracy on different runs;
#  changing the order of the features can also affect accuracy
features = ['gender', 'age', 'class', 'fare', 'embarked']
num_layers = 5 # including input and output, so must be >= 2
num_epochs = 10 # number of iterations over training data
batchsize = 20 # size of each batch during one iteration
layer_outputs = 32 # dimensionality of output of each layer
epoch_tracing = 'no'
# Normalize feature values
sc = StandardScaler()
featurevals_train = sc.fit_transform(titanicTrain[features])
featurevals_test = sc.fit_transform(titanicTest[features])
# Encode labels
encoder = LabelEncoder()
encoder.fit(titanic['survived'])
labels_train = encoder.transform(titanicTrain['survived'])
labels_test = encoder.transform(titanicTest['survived'])
# Set up neural-net classifier
seed(1) # to eliminate some randomness
tensorflow.random.set_seed(1) # to eliminate more randomness
classifier = Sequential()
# Input layer
classifier.add(Dense(layer_outputs, activation='relu', input_dim=len(features)))
# Hidden layers
for i in range(num_layers-2):
    classifier.add(Dense(layer_outputs, activation='relu',))
# Output layer - first arg is number of labels, softmax for multi-class classification
classifier.add(Dense(4, activation='softmax'))
classifier.compile(optimizer ='adam', loss='sparse_categorical_crossentropy', metrics =['accuracy'])
# Fit to training data
if epoch_tracing == 'yes': v = 2
else: v = 0
hist = classifier.fit(featurevals_train, labels_train, batch_size=batchsize, epochs=num_epochs, verbose=v)
print('Final accuracy on training data:', hist.history['accuracy'][-1])
# Evaluate on test data
test_acc = classifier.evaluate(featurevals_test, labels_test, verbose=0)[1]
print('Accuracy on test data:', test_acc)