# Agenda

# - Machine Learning

# - Iterators and Generators

# ---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

# Some Definitions 

## - Classification Problem
### Will a student drop out of their major within the next year?

## - Samples
### (student, course, quarter) combinations

## - Features
### (current GPA, number of courses currently enrolled, demographics, etc.)

## - Labels
### 1 if student will drop out of major within next year, else 0

## - Training Set
### 70% of data

## - Testing Set
### 30% of data

# Generate Training Data

In [None]:
# Two 2-D Gaussian clusters of n points each, both with unit spread:
# the first is centred at (0, 0), the second at (5, 5).
n = 100

x1, y1 = (np.random.normal(loc=0, scale=1, size=n) for _ in range(2))
x2, y2 = (np.random.normal(loc=5, scale=1, size=n) for _ in range(2))

In [None]:
plt.scatter(x1, y1, color='b', s=2)
plt.scatter(x2, y2, color='r', s=2)

In [None]:
# Sample matrix with x in column 0 and y in column 1; the first cluster's
# rows come before the second cluster's.
train_data = np.column_stack((np.concatenate([x1, x2]), np.concatenate([y1, y2])))

In [None]:
train_data[:10]

In [None]:
# Labels follow the row order of the data: 0 for the first n rows, 1 for the rest.
train_labels = np.zeros(2 * n)
train_labels[n:] = 1

In [None]:
train_labels

# Classify

In [None]:
# Fit a support vector classifier with a linear kernel to the training points.
clf = SVC(kernel = 'linear')
clf.fit(train_data, train_labels)

In [None]:
train_preds = clf.predict(train_data)

In [None]:
print('Accuracy on Training Data:', np.mean(train_preds == train_labels)*100)

# Generate Some Testing Data

In [None]:
n = 100

# Fresh draws from the same two Gaussian clusters that were used for training.
x1, y1 = (np.random.normal(loc=0, scale=1, size=n) for _ in range(2))
x2, y2 = (np.random.normal(loc=5, scale=1, size=n) for _ in range(2))

# Sample matrix: x in column 0, y in column 1, first cluster's rows first.
test_data = np.column_stack((np.concatenate([x1, x2]), np.concatenate([y1, y2])))

# Labels follow the row order: 0 for the first cluster, 1 for the second.
test_labels = np.zeros(2 * n)
test_labels[n:] = 1

In [None]:
test_preds = clf.predict(test_data)

In [None]:
print('Accuracy on Testing Data:', np.mean(test_preds== test_labels)*100)

# Visualize

In [None]:
def draw_separating_line(clf, x):
    """Plot the decision boundary of a fitted two-feature linear classifier.

    The boundary is the set of points satisfying
    ``coef_[0, 0] * x + coef_[0, 1] * y + intercept_[0] == 0``. It is drawn
    on the current matplotlib axes as a dashed black line.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``coef_`` (indexed as ``[0, 0]`` and ``[0, 1]``) and
        ``intercept_`` (indexed as ``[0]``), as the ``SVC(kernel='linear')``
        models in this notebook do.
    x : array-like
        Horizontal-axis values at which the boundary is evaluated.
    """
    w_x, w_y = clf.coef_[0, 0], clf.coef_[0, 1]
    bias = clf.intercept_[0]

    if w_y == 0:
        # The boundary does not depend on y, so it cannot be written as
        # y = f(x); dividing by w_y would only produce inf/nan values.
        if w_x != 0:
            plt.axvline(-bias / w_x, color='k', linestyle='--')
        # With both weights zero there is no boundary to draw at all.
        return

    y = -(w_x * x + bias) / w_y
    plt.plot(x, y, color='k', linestyle='--')

In [None]:
plt.scatter(x1, y1, color='b', s=2)
plt.scatter(x2, y2, color='r', s=2)
draw_separating_line(clf, np.linspace(-2,6,100))

# Nonlinear Case

In [None]:
n = 1000

# Inner cluster: a standard 2-D Gaussian blob around the origin.
x1, y1 = (np.random.normal(loc=0, scale=1, size=n) for _ in range(2))

# Outer cluster: a noisy ring, sampled in polar form (radius near 10, angle
# drawn with a wide spread) and then converted to Cartesian coordinates.
r = np.random.normal(loc=10, scale=.5, size=n)
theta = np.random.normal(loc=0, scale=10, size=n)
x2, y2 = r * np.cos(theta), r * np.sin(theta)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(x1, y1, color='b', s=2)
plt.scatter(x2, y2, color='r', s=2)

In [None]:
# Sample matrix with x in column 0 and y in column 1; the inner blob's rows
# come before the ring's rows.
train_data = np.column_stack((np.concatenate([x1, x2]), np.concatenate([y1, y2])))

In [None]:
# Labels follow the row order of the data: 0 for the first n rows, 1 for the rest.
train_labels = np.zeros(2 * n)
train_labels[n:] = 1

# Try to Classify

In [None]:
# Same linear-kernel classifier as before, now fitted to the blob-and-ring
# data in its raw (x, y) coordinates.
clf = SVC(kernel = 'linear')
clf.fit(train_data, train_labels)

In [None]:
train_preds = clf.predict(train_data)

In [None]:
print('Accuracy on Training Data:', np.mean(train_preds == train_labels)*100)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(x1, y1, color='b', s=2)
plt.scatter(x2, y2, color='r', s=2)
draw_separating_line(clf, np.linspace(x2.min(),x2.max(),100))
plt.xlim(-12,12)
plt.ylim(-12,12)

# Transform 

In [None]:
train_data[:10]

In [None]:
# Split the two-column sample matrix into its x and y coordinate vectors.
x_vals, y_vals = train_data.T

In [None]:
transformed_train_data = np.empty_like(train_data)

In [None]:
# Polar-style features: r is each point's distance from the origin.
r_vals = np.sqrt(x_vals**2 + y_vals**2)
# NOTE(review): arctan of the ratio y/x only yields angles in [-pi/2, pi/2]
# (opposite quadrants are folded together) and divides by zero when x == 0.
# np.arctan2(y_vals, x_vals) would give the full polar angle, but the later
# plot uses ylim(-2, 2), which fits the folded range - confirm before changing.
theta_vals = np.arctan(y_vals/x_vals)

In [None]:
transformed_train_data[:,0] = r_vals
transformed_train_data[:,1] = theta_vals

In [None]:
transformed_train_data[:10]

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(r_vals[:n], theta_vals[:n], color='b', s=2)
plt.scatter(r_vals[n:], theta_vals[n:], color='r', s=2)
plt.xlabel('r', fontsize=20)
plt.ylabel('theta', fontsize=20)

# Classify

In [None]:
# Linear-kernel classifier fitted to the transformed (r, theta) features
# instead of the raw (x, y) coordinates.
clf = SVC(kernel = 'linear')
clf.fit(transformed_train_data, train_labels)

In [None]:
train_preds = clf.predict(transformed_train_data)

In [None]:
print('Accuracy on Training Data:', np.mean(train_preds == train_labels)*100)

# Visualize

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(r_vals[:n], theta_vals[:n], color='b', s=2)
plt.scatter(r_vals[n:], theta_vals[n:], color='r', s=2)
plt.xlabel('r', fontsize=20)
plt.ylabel('theta', fontsize=20)
draw_separating_line(clf, np.linspace(r_vals.min(),r_vals.max(),100))
plt.xlim(-1, 12)
plt.ylim(-2,2)

# Apply this to card games

In [None]:
def generate_hands(n_hands, hand_size):
    """Deal ``n_hands`` independent random hands from a 52-card deck.

    Cards are the integers 0..51. Within a hand no card repeats (sampling is
    without replacement), but every hand is dealt from a fresh full deck, so
    the same card may appear in different hands.

    Parameters
    ----------
    n_hands : int
        Number of hands to deal.
    hand_size : int
        Cards per hand; ``np.random.choice`` raises ``ValueError`` when this
        exceeds 52.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_hands, hand_size)``.
    """
    deck_size = 52
    dealt = [
        np.random.choice(deck_size, hand_size, replace=False)
        for _ in range(n_hands)
    ]
    # The explicit dtype and reshape keep the result two-dimensional even when
    # n_hands is 0; np.array([]) alone is a 1-D float array, which breaks
    # column indexing such as hands[:, 0].
    return np.array(dealt, dtype=int).reshape(n_hands, hand_size)

In [None]:
hands = generate_hands(5, 2)

In [None]:
hands

In [None]:
hands = generate_hands(1000, 2)

## Goal: figure out if hand is pair or not

In [None]:
# Reduce every card id to its rank (id modulo 13); a hand is a pair when the
# ranks of its two cards match.
ranks = hands % 13
pair_hands = hands[ranks[:, 0] == ranks[:, 1]]
non_pair_hands = hands[~(ranks[:, 0] == ranks[:, 1])]

In [None]:
# 1 for a pair, 0 otherwise. The builtin int replaces the np.int alias, which
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
labels = (ranks[:, 0] == ranks[:, 1]).astype(int)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(pair_hands[:,0], pair_hands[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_hands[:,0], non_pair_hands[:,1], color='r', s=20)
plt.xlabel('first card', fontsize=20)
plt.ylabel('second card', fontsize=20)

In [None]:
clf = SVC(kernel = 'linear')
clf.fit(hands, labels)

In [None]:
preds = clf.predict(hands)

In [None]:
print('Accuracy on Training Data:', np.mean(preds == labels)*100)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(pair_hands[:,0], pair_hands[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_hands[:,0], non_pair_hands[:,1], color='r', s=20)
plt.xlabel('first card', fontsize=20)
plt.ylabel('second card', fontsize=20)
draw_separating_line(clf, np.linspace(0, 52 ,100))

# What accuracy would we get by always guessing "not a pair"?

In [None]:
1 - np.mean(labels)

# Let's think of a transformation

In [None]:
ranks = hands % 13

In [None]:
hands[:5]

In [None]:
ranks[:5]

In [None]:
pair_ranks = ranks[ranks[:,0] == ranks[:,1]]
non_pair_ranks = ranks[ranks[:,0] != ranks[:,1]]

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(pair_ranks[:,0], pair_ranks[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_ranks[:,0], non_pair_ranks[:,1], color='r', s=20)
plt.xlabel('first card rank', fontsize=20)
plt.ylabel('second card rank', fontsize=20)

In [None]:
pair_ranks[:3]

In [None]:
pair_hands[:3]

In [None]:
clf = SVC(kernel = 'linear')
clf.fit(ranks, labels)

In [None]:
preds = clf.predict(ranks)

In [None]:
print('Accuracy on Training Data:', np.mean(preds == labels)*100)

In [None]:
# Scatter of the rank features (pairs as blue crosses, non-pairs as red dots)
# with the fitted linear boundary drawn on top.
plt.figure(figsize=(8,8))
plt.scatter(pair_ranks[:,0], pair_ranks[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_ranks[:,0], non_pair_ranks[:,1], color='r', s=20)
plt.xlabel('first card rank', fontsize=20)
plt.ylabel('second card rank', fontsize=20)
# Ranks only span 0..12, so evaluate the boundary over that range rather than
# the 0..52 card-id range used for the raw-hands plot.
draw_separating_line(clf, np.linspace(0, 12, 100))

# Hmm ... how can we get a good separation?

In [None]:
rank_diffs = ranks[:,1] - ranks[:,0]

In [None]:
# Feature matrix: first card's rank in column 0, signed rank gap in column 1.
diffs = np.column_stack((ranks[:, 0], rank_diffs))

In [None]:
diffs[:5]

In [None]:
pair_rank_diffs = diffs[diffs[:,1] == 0]
non_pair_rank_diffs = diffs[diffs[:,1] != 0]

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(pair_rank_diffs[:,0], pair_rank_diffs[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_rank_diffs[:,0], non_pair_rank_diffs[:,1], color='r', s=20)
plt.xlabel('first card rank', fontsize=20)
plt.ylabel('difference between ranks', fontsize=20)

## Slight adjustment

In [None]:
# Use the magnitude of the rank gap, so pairs (gap 0) sit at the lowest value
# of the second feature instead of in the middle of its range.
rank_diffs = abs(ranks[:, 0] - ranks[:, 1])

# Feature matrix: first card's rank in column 0, absolute rank gap in column 1.
diffs = np.column_stack((ranks[:, 0], rank_diffs))

In [None]:
pair_rank_diffs = diffs[diffs[:,1] == 0]
non_pair_rank_diffs = diffs[diffs[:,1] != 0]

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(pair_rank_diffs[:,0], pair_rank_diffs[:,1], color='b', s=50, marker='x')
plt.scatter(non_pair_rank_diffs[:,0], non_pair_rank_diffs[:,1], color='r', s=20)
plt.xlabel('first card rank', fontsize=20)
plt.ylabel('absolute difference between ranks', fontsize=20)

In [None]:
clf = SVC(kernel = 'linear')
clf.fit(diffs, labels)

In [None]:
preds = clf.predict(diffs)

In [None]:
print('Accuracy on Training Data:', np.mean(preds == labels)*100)

# Generator Expressions

In [None]:
from sys import getsizeof

In [None]:
standard_list_of_nums = [i for i in range(100)]
generator_list_of_nums = (i for i in range(100))

In [None]:
type(standard_list_of_nums)

In [None]:
type(generator_list_of_nums)

## Similar Functionality

In [None]:
total = 0
for item in standard_list_of_nums:
    total += item
total

In [None]:
total = 0
for item in standard_list_of_nums:
    total += item
total

In [None]:
# First pass over the generator: each value is produced once, on demand.
total = 0
for item in generator_list_of_nums:
    total += item
total

In [None]:
# Second pass over the same generator object: the previous loop exhausted it,
# so the loop body never runs and total stays 0 (unlike the list above).
total = 0
for item in generator_list_of_nums:
    total += item
total

## Different Indexing Behaviour

In [None]:
standard_list_of_nums[0]

In [None]:
# Intentional error: generators do not support indexing, so this raises
# TypeError ('generator' object is not subscriptable).
generator_list_of_nums[0]

## Size Comparison

In [None]:
getsizeof(standard_list_of_nums)

In [None]:
getsizeof(generator_list_of_nums)

In [None]:
standard_list_of_nums = [i for i in range(1000000)]
generator_list_of_nums = (i for i in range(1000000))

In [None]:
getsizeof(standard_list_of_nums)

In [None]:
getsizeof(generator_list_of_nums)

## Efficiency

In [None]:
%%timeit
standard_list_of_nums = [i for i in range(1000000)]

In [None]:
%%timeit
generator_list_of_nums = (i for i in range(1000000))

In [None]:
%%timeit
tot = 0
for item in standard_list_of_nums:
    tot += item

In [None]:
%%timeit
# The generator is rebuilt inside the timed cell because it can only be
# consumed once. The list-summing cell above iterates a list built beforehand,
# so this timing additionally includes creating the generator.
generator_list_of_nums = (i for i in range(1000000))
tot = 0
for item in generator_list_of_nums:
    tot += item

# Exercise for you - find a data transform to separate these classes using an SVM

In [None]:
# Number of samples per class; every sample count below is derived from it.
n = 1000

# Class one: a single Gaussian blob centred at (3, 3) whose two coordinates
# are strongly positively correlated (covariance 1.8 against variance 2).
class_one_data = np.random.multivariate_normal([3,3], [[2,1.8],[1.8,2]], n)
x1 = class_one_data[:,0]
y1 = class_one_data[:,1]

# Class two: n points split between two small round blobs centred at (7, -1)
# and (-1, 7). The second blob takes the remainder so odd n still totals n.
class_two_data = np.concatenate([
    np.random.multivariate_normal([7,-1], [[.5,0],[0,.5]], n // 2),
    np.random.multivariate_normal([-1,7], [[.5,0],[0,.5]], n - n // 2),
], axis=0)
x2 = class_two_data[:,0]
y2 = class_two_data[:,1]

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(x1, y1, color='b', s=2)
plt.scatter(x2, y2, color='r', s=2)