In [1]:
"""
Generate synthetic data to test algorithms on
"""
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
X, y = make_classification(n_samples=2000, n_features=200, n_informative=2, 
                    n_redundant=2, n_repeated=0, n_classes=2, 
                    n_clusters_per_class=2, weights=None, 
                    flip_y=0.01, class_sep=1.0, hypercube=True, 
                    shift=1, scale=1.0, shuffle=True, 
                    random_state=1234)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1400, 200), (600, 200), (1400,), (600,))

In [3]:
clf = LinearSVC(random_state=1234, tol=1e-5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)




In [12]:
import time
import random

from sklearn.manifold import TSNE

n_sne = 100

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=500)
tsne_results = tsne.fit_transform(X_test)


[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 600 samples in 0.004s...
[t-SNE] Computed neighbors for 600 samples in 0.146s...
[t-SNE] Computed conditional probabilities for sample 600 / 600
[t-SNE] Mean sigma: 3.618709
[t-SNE] KL divergence after 250 iterations with early exaggeration: 95.706757
[t-SNE] Error after 500 iterations: 2.063532


In [13]:
import matplotlib.pyplot as plt
%matplotlib notebook

plt.scatter(tsne_results[:,0], tsne_results[:,1], c=y_test)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x180baa39518>

In [5]:
trainData = np.hstack((y_train[:,np.newaxis], X_train))
trainDf = pd.DataFrame(data=trainData, index=None)
trainDf.to_csv("synthetic_train_binary.csv", index=None)
testData = np.hstack((y_test[:,np.newaxis], X_test))
testDf = pd.DataFrame(data=testData, index=None)
testDf.to_csv("synthetic_test_binary.csv", index=None)

In [44]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
from sklearn import svm

xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
                     np.linspace(-3, 3, 500))
np.random.seed(0)
X = np.random.randn(300, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# fit the model
clf = svm.NuSVC()
clf.fit(X, Y)

# plot the decision function for each datapoint on the grid
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
           origin='lower', cmap=plt.cm.PuOr_r)
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
                       linetypes='--')
plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
            edgecolors='k')
plt.xticks(())
plt.yticks(())
plt.axis([-3, 3, -3, 3])


<IPython.core.display.Javascript object>

  s)


[-3, 3, -3, 3]

In [16]:
# Make circular data
import matplotlib.pyplot as plt
import random
from random import shuffle

num_samples = 10000
x1, x2, y_ = [], [], []
for i in range(num_samples):
    x = [random.uniform(0, 1), random.uniform(0, 1)]
    temp = x[0]**x[0] + x[1]**x[1]
    if i < num_samples//2: 
        y = 1
    else:
        y = 0
    x1.append(x[0])
    x2.append(x[1])
    y_.append(y)

combined = list(zip(x1, x2, y_))
random.shuffle(combined)

x1[:], x2[:], y_[:] = zip(*combined)

x1_train, x2_train, y_train = x1[:7000], x2[:7000], y_[:7000]
x1_test, x2_test, y_test = x1[7000:], x2[7000:], y_[7000:]

df_all_train = pd.DataFrame(data={'y': y_train, 'x1':x1_train, 'x2':x2_train}) 
df_all_test = pd.DataFrame(data={'y': y_test, 'x1':x1_test, 'x2':x2_test})

df1_train = pd.DataFrame(data={'y': y_train, 'x1':x1_train})
df1_test = pd.DataFrame(data={'y': y_test, 'x1':x1_test})

df2_train = pd.DataFrame(data={'y': y_train, 'x1':x2_train})
df2_test = pd.DataFrame(data={'y': y_test, 'x1':x2_test})

print(y)
df_all_train.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_train.csv", index=None, header=None)
df_all_test.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_test.csv", index=None, header=None)

df1_train.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_train_0.csv", index=None, header=None)
df1_test.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_test_0.csv", index=None, header=None)

df2_train.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_train_1.csv", index=None, header=None)
df2_test.to_csv("../dl4j-examples/dl4j-examples/data/convex/convex_test_1.csv", index=None, header=None)

0


In [17]:
y_

[0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,


In [27]:
import numpy as np
documents = ['he knows her but she does not know him',
              'he and i do not know who that is',
              'she knows him but he does not know that'
            ]

all_words = ' '.join(documents).lower().split()
from collections import Counter

vocab = list(set(all_words))
N = len(documents)
tfidf_mat = np.zeros(shape=(N,len(vocab)))
for i, doc in enumerate(documents):
    freqs = Counter(doc.lower().split())
    for j in range(len(vocab)):
        tfidf_mat[i][j] = freqs[vocab[j]]*np.log(N/word_counts[vocab[j]])

print(tfidf_mat)
    




[[0.         0.40546511 0.40546511 0.         0.         0.40546511
  0.         0.         0.         1.09861229 0.40546511 0.
  0.         0.40546511 0.        ]
 [0.40546511 0.         0.         1.09861229 1.09861229 0.
  0.         1.09861229 1.09861229 0.         0.         0.
  0.         0.         1.09861229]
 [0.40546511 0.40546511 0.40546511 0.         0.         0.40546511
  0.         0.         0.         0.         0.40546511 0.
  0.         0.40546511 0.        ]]


In [30]:
all_words = ' '.join(documents).lower().split()
word_counts = Counter(all_words)
vocab = list(set(all_words))
DF = {}
for word in vocab:
    for doc in documents:
        words = doc.split()
        if word in words:
            DF.add()

N = len(documents)
tfidf_mat = np.zeros(shape=(N,len(vocab)))
for i, doc in enumerate(documents):
    freqs = Counter(doc.lower().split())
    for j in range(len(vocab)):
        tfidf_mat[i][j] = freqs[vocab[j]]*np.log(N/len(DF[vocab[j]]))

print(tfidf_mat)

print(DF)

[[0.         0.40546511 0.40546511 0.         0.         0.40546511
  0.         0.         0.         1.09861229 0.40546511 0.
  0.         0.40546511 0.        ]
 [0.40546511 0.         0.         1.09861229 1.09861229 0.
  0.         1.09861229 1.09861229 0.         0.         0.
  0.         0.         1.09861229]
 [0.40546511 0.40546511 0.40546511 0.         0.         0.40546511
  0.         0.         0.         0.         0.40546511 0.
  0.         0.40546511 0.        ]]
{'he': {0, 1, 2}, 'knows': {0, 2}, 'her': {0}, 'but': {0, 2}, 'she': {0, 2}, 'does': {0, 2}, 'not': {0, 1, 2}, 'know': {0, 1, 2}, 'him': {0, 2}, 'and': {1}, 'i': {1}, 'do': {1}, 'who': {1}, 'that': {1, 2}, 'is': {1}}


Fast Food Orders
Recommended Time: 5 minutes

One of the most common formats for storing data is the data frame. Data frames store tabular data in a spreadsheet-like format, where each row represents a data observation and each column represents a particular feature. In Python, we use the DataFrame object from pandas to represent data frames.

You'll write Python 3 code in the code editor to complete the calculate_avg_spend function. The first input argument (orders_df), is a DataFrame object with two columns: 'CustomerName' and 'Order'. Each row of the DataFrame represents a particular fast food order for a customer. The 'CustomerName' field contains the name of the customer while the 'Order' field contains the item he or she ordered.

The second input argument (items_df), is a DataFrame object with two columns: 'ItemName' and 'Price'. Each row of the DataFrame represents the price of a particular fast food item. The 'ItemName' field contains the unique name of the item while the 'Price' field contains the price of the item in dollars.

An example of the inputs to the function would look like the following:

orders_df
   CustomerName         Order
0    John Smith  Cheeseburger
1     Lenny Lee          Soda
2  Ava Williams     Hamburger
3    Holly Jack     Ice Cream
4    John Smith          Taco
5    Holly Jack     Hamburger
6  Ava Williams         Fries
7    John Smith         Salad

items_df
       ItemName  Price
0  Cheeseburger   9.99
1          Soda   1.35
2     Hamburger   8.79
3     Ice Cream   4.99
4          Taco   3.50
5         Fries   4.00
6         Salad   5.19

Your calculate_avg_spend function should return a DataFrame with two columns (in the following order): 'CustomerName' and 'AvgSpend'. Each row of the output DataFrame should have a unique 'CustomerName' value, which represents the name of the customer. The 'AvgSpend' column represents the average amount of dollars the customer spent on fast food items.

Your output DataFrame should be sorted by 'CustomerName', in alphabetical order. For the above example, the output DataFrame would be:

   CustomerName  AvgSpend
0  Ava Williams  6.395000
1    Holly Jack  6.890000
2    John Smith  6.226667
3     Lenny Lee  1.350000

Feel free to make use of the NumPy and pandas libraries in your code. When you finish coding, click the button to get results of the function run on an input set of images. Any output from the testing (print statements, errors, etc.) will be displayed in the console, as well as results from the test cases. The number of test inputs your algorithm is correct on will also be displayed in the console after running your code. Note that when you submit your interview we will test your algorithm on hidden input sets.

In [None]:
orders_df = pd.merge(orders_df, items_df, how = "left", left_on="Order", right_on="ItemName")
orders_df = orders_df.groupby("CustomerName").mean()["Price"].reset_index().rename(columns={"Price":"AvgSpend"})

Non-Cheeseburger Purchasers
Recommended Time: 10 minutes

In this problem you'll be working with a SQL database that contains two tables: Customers and Orders. The Customers table has the following fields:

    ID (int): Unique integer ID for the customer
    FirstName (text): First name of the customer
    LastName (text): Last name of the customer
    City (text): City the customer is from

The Orders table has the following fields:

    OrderID (int): Unique integer ID for the fast food order
    CustomerID (text): Integer ID corresponding to the ID field in the Customers table, represents the customer that made the order
    Purchase (text): Name of the order purchased (e.g. "Cheeseburger", "Onion Rings", etc.)
    Price (float): Price of the order

Your task will be to write a SQL query in the code editor, which returns all customers that have never made a cheeseburger order. A cheeseburger order corresponds to a row in the Orders table where the Purchase field is equal to "Cheeseburger". Remember that the CustomerID field represents the ID of the customer who made the order.

The rows that your SQL query returns will come from the Customers table. Each row will have ID as the first column, FirstName as the second column, and LastName as the third column (omit the City field). Return the table rows sorted by the ID field, in ascending order.

When you finish writing your SQL query, click the button to run the query on the database and return the results. You will be informed whether or not your query is correct, and any table rows extracted by your SQL query will be shown.

In [None]:
SELECT ID, FirstName, LastName FROM 

(SELECT * from Customers C left join Orders A ON C.ID == A.CustomerID
    WHERE A.Purchase == "Cheeseburger" 
UNION ALL 
(SELECT * from Orders A left join Customers C ON  A.CustomerID == C.ID
        WHERE A.Purchase == "Cheeseburger")) B if B.OrderID IS NULL ORDER BY ID ASC; 

In [None]:
SELECT ID, FirstName, LastName from Customers C left join Orders A WHERE C.ID == A.CustomerID IF C.ID NOT IN
(
SELECT DISTINCT CustomerID 
    from 
        (SELECT CustomerID, OrderID from Orders if Purchase == "Cheeseburger"))

In [None]:
SELECT d.type,
         d.color,
         c.type,
         c.color
FROM dogs d
LEFT JOIN cats c USING(color)
UNION ALL
SELECT d.type,
         d.color,
         c.type,
         c.color
FROM cats c
LEFT JOIN dogs d USING(color)
WHERE d.color IS NULL;