# Numpy and Pandas
NumPy and pandas are Python packages used for data manipulation. They are extremely useful and work well with Python's machine learning packages (specifically, scikit-learn and TensorFlow).

In [1]:
# imports
import numpy as np
import pandas as pd

## Pandas Basics

In [2]:
# make dataframe from csv file
# The path is relative, so the file is looked up in the kernel's current
# working directory.
df = pd.read_csv('customer_data.csv')
# A bare name on the last line of a cell is rendered as the cell's output.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44,72000,No
1,Spain,27,48000,Yes
2,Germany,30,54000,No
3,Spain,38,61000,No
4,Germany,40,50000,Yes
5,France,35,58000,Yes
6,Spain,29,52000,No
7,France,48,79000,Yes
8,Germany,50,83000,No
9,France,37,67000,Yes


In [3]:
# get dataframe column
# Two spellings of the same lookup: attribute access and bracket (key) access.
# The cell evaluates to a tuple holding the same Series twice.
# Bracket access is the general form: attribute access only works when the
# column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method.
df.Country, df['Country']

(0     France
 1      Spain
 2    Germany
 3      Spain
 4    Germany
 5     France
 6      Spain
 7     France
 8    Germany
 9     France
 Name: Country, dtype: object, 0     France
 1      Spain
 2    Germany
 3      Spain
 4    Germany
 5     France
 6      Spain
 7     France
 8    Germany
 9     France
 Name: Country, dtype: object)

In [4]:
# find min, max, mean salary
# Use the Series' own vectorized reductions instead of the Python builtins:
# min()/max() walk the column element by element in Python and give
# order-dependent answers if a NaN is present, whereas the pandas reductions
# skip missing values by default.
df.Salary.min(), df.Salary.max(), df.Salary.mean()

(48000, 83000, 62400.0)

In [5]:
# get dataframe row
# Three spellings that all select the first row by integer position.
# Only the last expression of a cell is displayed, so the first two are
# evaluated and their results discarded.
df.iloc[0]       # row 0, returned as a Series
df.iloc[0, :]    # row 0, all columns (explicit row/column form)
df.iloc[0][:]    # row 0, followed by a full slice of the resulting Series

Country      France
Age              44
Salary        72000
Purchased        No
Name: 0, dtype: object

In [6]:
# index by entry
# A boolean mask such as (df.Purchased == 'Yes') placed inside df[...] keeps
# only the rows where the mask is True.
# Only the last expression of a cell is displayed: the filtered frame and the
# 'Yes' mean are computed but not shown.
df[df.Purchased == 'Yes']                    # rows with Purchased == 'Yes'
np.mean(df[df.Purchased == 'Yes'].Salary)    # mean salary of those rows
np.mean(df[df.Purchased == 'No'].Salary)     # mean salary where Purchased == 'No' (the displayed value)

64400.0

In [7]:
# get dataframe values as a numpy array
# to_numpy() is the accessor the pandas documentation recommends over the
# older .values attribute. Because the frame mixes string and integer
# columns, the resulting array has dtype=object.
df.to_numpy()

array([['France', 44, 72000, 'No'],
       ['Spain', 27, 48000, 'Yes'],
       ['Germany', 30, 54000, 'No'],
       ['Spain', 38, 61000, 'No'],
       ['Germany', 40, 50000, 'Yes'],
       ['France', 35, 58000, 'Yes'],
       ['Spain', 29, 52000, 'No'],
       ['France', 48, 79000, 'Yes'],
       ['Germany', 50, 83000, 'No'],
       ['France', 37, 67000, 'Yes']], dtype=object)

## Numpy Basics

In [8]:
# make numpy arrays with three different shapes from the same values
values = [4, 5, 6]
a = np.array(values)                   # 1-D: shape (3,)
b = np.array([values])                 # a single row: shape (1, 3)
c = np.array([[v] for v in values])    # a single column: shape (3, 1)

In [9]:
# 1-D array: a single axis holding three elements, shape (3,)
a

array([4, 5, 6])

In [10]:
# 2-D array with a single row, shape (1, 3) -- note the double brackets in the output
b

array([[4, 5, 6]])

In [11]:
# 2-D array with a single column, shape (3, 1)
c

array([[4],
       [5],
       [6]])

In [12]:
# see array sizes, shapes
# size is the total element count; shape is the length along each axis.
# The sizes tuple is evaluated but not shown, because a cell displays only
# its last expression.
tuple(arr.size for arr in (a, b, c))
tuple(arr.shape for arr in (a, b, c))

((3,), (1, 3), (3, 1))

In [24]:
# my favorite functions
# None of these calls modifies `a`; each one returns a result that is
# discarded, except the last expression, which is the one displayed.
a.reshape(3, -1)    # 3 rows; -1 lets numpy infer the remaining dimension
np.where(a == 5)    # indices at which the condition holds
np.ones((3, 3))     # 3x3 array filled with ones
np.zeros((3, 3))    # 3x3 array filled with zeros
a.astype(float)     # copy of `a` converted to float
np.isnan(a)         # element-wise NaN test

array([False, False, False])