# Comparison between Pandas and NumPy

| Pandas | Numpy |
| :---- | :---- |
| Tabular Data | Numerical Data |
| Performs better with 500K rows or more | Performs better with 50K rows or fewer |
| more memory consumption | less memory consumption |
| 2d table called DataFrame | Multi-dimensional Array |
| Indexing of pandas Series is slow | Indexing of NumPy arrays is fast |
| Explicitly defined index associated with the values | Implicitly defined integer index |

## NumPy

In [None]:
import numpy as np

#### 1D Array

In [None]:
# Create a one-dimensional ndarray from a Python list and inspect it.
x = np.array([1, 2, 3, 4, 5])

# Show the values, the Python type of the object, and its shape tuple.
print('Contents:', x)
print('Type:', type(x))
print('Dimensions:', x.shape)

#### 2D Array

In [None]:
# Create a two-dimensional ndarray from nested lists: one inner list per row.
y = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])

# Show the values, the Python type of the object, and its (rows, columns) shape.
print('Contents:', y)
print('Type:', type(y))
print('Dimensions:', y.shape)

#### Arange

In [None]:
# np.arange(start, stop, step): evenly spaced integers, start inclusive,
# stop exclusive.

# Integers 0 through 9.
x = np.arange(0, 10)
print(x)

# Multiples of 100 from 100 through 900.
x = np.arange(100, 1000, 100)
print(x)

# Odd numbers from 1 through 13.
x = np.arange(1, 14, 2)
print(x)

#### Linspace

In [None]:
# np.linspace(start, stop, num): `num` evenly spaced values with both
# endpoints included.
x = np.linspace(1, 10, num=7)
print(x)

#### Reshape

In [None]:
# Start from the 15 integers 0..14 in a flat array.
x = np.arange(15)
print(x)

# Rearrange the same 15 values into 3 rows of 5 columns.
x = x.reshape(3, 5)
print(x)

#### Identity matrix

In [None]:
# 3x3 identity matrix: ones on the main diagonal, zeros elsewhere.
mat = np.eye(N=3)

print(mat)

#### Diagonal matrix

In [None]:
# Given a 1-D sequence, np.diag builds a square matrix with those values
# (10, 20, ..., 60) on the main diagonal and zeros everywhere else.
diag = np.diag([10 * step for step in range(1, 7)])

print(diag)

#### Slicing

In [None]:
# Slicing a 4x5 array holding the integers 0..19.
arr = np.arange(20).reshape(4, 5)
print('1: ', arr)

# Rows 3-4 and columns 2-5 (1-based), i.e. row indices 2..3, column indices 1..4.
x = arr[2:4, 1:5]
print('2: ',x)

# Same selection with open-ended slices: both run to the end of their axis.
x = arr[2:, 1:]
print('3: ',x)

# Third row (index 2); omitting the column slice selects every column.
x = arr[2]
print('4: ',x)

# Third column (index 2) across all rows.
x = arr[:, 2]
print('5: ',x)


#### Random

In [None]:
# Random sampling with the np.random module.

# 3x3 array of floats drawn uniformly from [0, 1).
arr = np.random.random(size=(3, 3))
print('1: ',arr)

# np.random.randint(low, high, size=shape): integers with low inclusive
# and high exclusive.
arr = np.random.randint(low=4, high=15, size=(3, 2))
print('2: ',arr)

# np.random.normal(loc, scale, size=shape): loc is the mean and scale the
# standard deviation of the normal distribution.
arr = np.random.normal(loc=0, scale=1, size=(10, 10))
print('3: ',arr)

#### Delete

In [None]:
# np.delete(ndarray, elements, axis) returns a new array with the given
# indices removed; the input array is left untouched.

x = np.array([1, 2, 3, 4, 5])

# Remove the 1st and 5th elements (indices 0 and 4).
x = np.delete(x, obj=[0, 4])

y = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# Remove the 1st row: axis 0 addresses rows.
a = np.delete(y, obj=0, axis=0)

# Remove the 1st and last columns: axis 1 addresses columns.
b = np.delete(y, obj=[0, 2], axis=1)

#### Append

In [None]:
# np.append(ndarray, elements, axis) returns a new array with the values
# added at the end; x and y are the arrays left over from the Delete cell.

# Append the single value 6.
x = np.append(x, values=6)

# Append 7 and 8 in one call.
x = np.append(x, values=[7, 8])

# Append one row: the new values need the same number of columns as y.
v = np.append(y, values=[[10, 11, 12]], axis=0)

# Append one column: the new values need the same number of rows as y.
q = np.append(y, values=[[13], [14], [15]], axis=1)

#### Insert

In [None]:
# np.insert(ndarray, index, elements, axis) returns a new array with the
# values placed before the given index.

x = np.array([1, 2, 5, 6, 7])
y = np.array([
    [1, 2, 3],
    [7, 8, 9],
])

# Insert 3 and 4 before index 2 (the third position).
x = np.insert(x, 2, values=[3, 4])

# Insert a new row before row index 1.
w = np.insert(y, 1, values=[4, 5, 6], axis=0)

# Insert a column before column index 1; the scalar 5 fills every row.
v = np.insert(y, 1, values=5, axis=1)

#### Elements along the Diagonal

In [None]:
# np.diag is dual-purpose: given a 1-D array it BUILDS a matrix with those
# values on a diagonal; given a 2-D array it EXTRACTS a diagonal.
# A 2-D matrix is used here so the calls really extract elements.
x = np.arange(25).reshape(5, 5)
print(x)

# Extract the elements along the main diagonal.
d0 = np.diag(x)
print(d0)

# Extract the diagonal directly above the main one (k=1).
d1 = np.diag(x, k=1)
print(d1)

# Extract the diagonal directly below the main one (k=-1).
d2 = np.diag(x, k=-1)
print(d2)

#### Sorting

In [None]:
# Note the double brackets: x is 2-D with a single row, shape (1, 9).
x = np.array([[1, 2, 3, 6, 7, 8, 4, 5, 6]])
print(x)

# np.sort as a function returns a sorted copy and leaves x untouched.
# The default axis is the last one, so the values inside the row are sorted.
s = np.sort(x, axis=-1)
print(s)

# ndarray.sort as a method sorts x itself, in place.
x.sort(axis=-1)

# Unique values of x: np.unique flattens its input and already returns the
# values in ascending order, so no additional sort is needed.
s = np.unique(x)
print(s)

# Sort along axis 0, i.e. within each column. x has only one row, so this
# leaves the values where they are.
s = np.sort(x, axis=0)
print(s)

# Sort along axis 1, i.e. within each row.
s = np.sort(x, axis=1)
print(s)

#### Math Functions

In [None]:
# Element-wise arithmetic on two 1-D arrays of equal length.
x = np.array([1, 2, 3, 4])
y = np.array([5.5, 6.5, 7.5, 8.5])
print(np.add(x, y))
print(np.subtract(x, y))
print(np.multiply(x, y))
print(np.divide(x, y))

# The same functions work element-wise on 2-D arrays of equal shape.
# The results are printed like the 1-D ones above; as bare expressions they
# would be computed and then thrown away.
X = np.array([1, 2, 3, 4]).reshape(2, 2)
Y = np.array([5.5, 6.5, 7.5, 8.5]).reshape(2, 2)
print(np.add(X, Y))
print(np.subtract(X, Y))
print(np.multiply(X, Y))
print(np.divide(X, Y))

# Element-wise exponential, square root and power of the 1-D array x.
print(np.exp(x))
print(np.sqrt(x))
print(np.power(x, 2))

#### Statistical

In [None]:
# Summary statistics of the 2-D array X.
# mean() with no axis averages every element; axis 0 gives one average per
# column and axis 1 one average per row.
print('Average of all elements in X:', X.mean())
print('Average of all elements in the columns of X:', X.mean(0))
print('Average of all elements in the rows of X:', X.mean(1))
print()
# The remaining statistics are all taken over every element of X.
print('Sum of all elements in X:', X.sum())
print('Standard Deviation of all elements in X:', X.std())
print('Median of all elements in X:', np.median(X))
print('Maximum value of all elements in X:', X.max())
print('Minimum value of all elements in X:', X.min())

## Pandas

In [None]:
import pandas as pd

### Series
1D array-like object that can hold many data types.

`pd.Series(data, index)`

#### Create Series

In [None]:
# A Series pairs each value with an index label; the values may be of
# mixed types (integers and strings here).
groceries = pd.Series(
    data=[30, 6, 'Yes', 'No'],
    index=['eggs', 'apples', 'milk', 'bread'],
)

In [None]:
# Inspect the basic attributes of the groceries Series.
# shape is a tuple of lengths, ndim the number of dimensions and size the
# total number of elements.
print('Shape:', groceries.shape)
print('Dimension:', groceries.ndim)
print('Total', groceries.size, 'elements')
# values holds the stored data and index the labels attached to it.
print('Content:', groceries.values)
print('Indices:', groceries.index)

#### Access Element

In [None]:
# Show the data and the labels before selecting from them.
print('Content:', groceries.values)
print('Indices:', groceries.index)

## Index Labels
# single index label
print('How many eggs do we need to buy:', groceries['eggs'])

# access multiple index labels
print('Do we need milk and bread:\n', groceries[['milk', 'bread']])

# use loc to access multiple index labels
print('How many eggs and apples do we need to buy:\n', groceries.loc[['eggs', 'apples']])

## Numerical Indices
# Positional access goes through .iloc. Plain [] with integers on a
# label-indexed Series is deprecated (pandas 2.1) and raises KeyError in
# pandas 3.0, because [] treats its keys as labels.

# use multiple numerical indices
print('How many eggs and apples do we need to buy:\n', groceries.iloc[[0, 1]])

# use a negative numerical index
print('Do we need bread:\n', groceries.iloc[[-1]])

# use a single numerical index
print('How many eggs do we need to buy:', groceries.iloc[0])

# iloc stands for integer location; it also accepts a list of positions
print('Do we need milk and bread:\n', groceries.iloc[[2, 3]])

#### Deletion

In [None]:
# Series.drop returns a new Series without the given label. The result is
# not kept here, so the print shows groceries is unchanged.
groceries.drop(labels='apples')
print(groceries)
# With inplace=True the label is removed from groceries itself.
groceries.drop(labels='apples', inplace=True)
print(groceries)

### DataFrame
Pandas DataFrames are two-dimensional data structures with labeled rows and columns, that can hold many data types.

#### Axes

In [None]:
# Two columns of three observations each.
data = {
    "calories": [420, 380, 390],
    "duration": [50, 40, 45],
}

# Load the data into a DataFrame object.
df = pd.DataFrame(data)

print(df)

# Understanding axes. Each result is printed; as bare expressions the sums
# would be computed and thrown away.

# Sums "down" the 0 axis (rows): one total per column.
print(df.sum())

# Equivalent, since axis=0 is the default.
print(df.sum(axis=0))

# Sums "across" the 1 axis (columns): one total per row.
print(df.sum(axis=1))

In [None]:
# Load the full dataset.
df = pd.read_csv('../datasets/data.csv')

# Limit how many rows are read: only the first 10 data rows.
df2 = pd.read_csv('../datasets/data.csv', nrows=10)

# skiprows takes 0-based line numbers of the file: line 0 (the header) is
# kept, lines 1 and 2 (the first two data rows) are skipped.
df3 = pd.read_csv('../datasets/data.csv', skiprows=[1, 2])

# Random 75% / 25% train/test split. The fixed seed makes the split
# reproducible from one run to the next.
train = df.sample(frac=0.75, random_state=42)

# Everything that was not sampled into train becomes the test set.
test = df.drop(train.index)

In [None]:
# Create a dictionary of pandas Series: one Series per person, indexed by item.
items = {
    'Bob': pd.Series(data=[245, 25, 55], index=['bike', 'pants', 'watch']),
    'Alice': pd.Series(data=[40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants']),
}

# Uncomment to confirm that items is a plain dictionary.
# print(type(items))  # <class 'dict'>

# Create a DataFrame from the dictionary of Series. The row index is the
# union of the Series indices; items a person does not have become NaN.
shopping_carts = pd.DataFrame(items)
shopping_carts

# Create a DataFrame that only has a subset of the columns.
bob_shopping_cart = pd.DataFrame(items, columns=['Bob'])
bob_shopping_cart

# Create a DataFrame that only has selected row labels.
sel_shopping_cart = pd.DataFrame(items, index=['pants', 'book'])
sel_shopping_cart

# Combine both of the above: selected rows for selected columns.
alice_sel_shopping_cart = pd.DataFrame(items, index=['glasses', 'bike'], columns=['Alice'])
# Display the frame that was just built (the cell previously re-displayed
# shopping_carts here).
alice_sel_shopping_cart


In [None]:
# Build a DataFrame from a list of dictionaries: each dictionary is one row
# and its keys become the column labels.
items2 = [
    {'bikes': 20, 'pants': 30, 'watches': 35},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5},
]

# Without an explicit index the rows are labelled 0, 1, ...
store_items = pd.DataFrame(items2)

# With an explicit index each row gets the given label.
store_items = pd.DataFrame(items2, index=['store 1', 'store 2'])

store_items

In [None]:
# Titles and authors, aligned by position.
books = pd.Series(data=[
    'Great Expectations',
    'Of Mice and Men',
    'Romeo and Juliet',
    'The Time Machine',
    'Alice in Wonderland',
])
# 'H. G. Wells' previously carried a stray leading space, which would break
# equality comparisons and grouping on the author name.
authors = pd.Series(data=[
    'Charles Dickens',
    'John Steinbeck',
    'William Shakespeare',
    'H. G. Wells',
    'Lewis Carroll',
])

# Ratings per user. The Series have different lengths; rows a user did not
# rate end up as NaN once everything is combined into one DataFrame.
user_1 = pd.Series(data=[3.2, np.nan, 2.5])
user_2 = pd.Series(data=[5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data=[2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data=[4, 3.5, 4, 5, 4.2])

# Create a dictionary with the data given above: column label -> Series.
a_dict = {
    'Author': authors,
    'Book Title': books,
    'User 1': user_1,
    'User 2': user_2,
    'User 3': user_3,
    'User 4': user_4,
}

# Use the dictionary to create a pandas DataFrame.
book_ratings = pd.DataFrame(a_dict)
book_ratings

In [None]:
# Convert the DataFrame to a NumPy array via the .values attribute.
# NOTE(review): the pandas documentation recommends DataFrame.to_numpy()
# over .values for this conversion.
book_ratings_numpy = book_ratings.values
book_ratings_numpy

#### Access Elements

In [None]:
# Selecting data from the store_items DataFrame.
print()
# A list of column labels selects columns and returns a DataFrame.
print('How many bikes are in each store:\n', store_items[['bikes']])
print()
print('How many bikes and pants are in each store:\n', store_items[['bikes', 'pants']])
print()
# .loc with a list of row labels selects rows.
print('What items are in Store 1:\n', store_items.loc[['store 1']])
print()
# A single cell: .loc takes the row label first, then the column label.
print('How many bikes are in Store 2:', store_items.loc['store 2', 'bikes'])

In [None]:
# Add a new column by assigning a list with one value per row; np.nan marks
# a missing value for the second row.
store_items['shirts'] = [15,np.nan]

# Add a column computed from two existing columns (element-wise sum).
store_items['suits'] = store_items['shirts'] + store_items['pants']

store_items

In [None]:
# Build a one-row DataFrame for the new store. It is labelled 'store 3' to
# follow the existing 'store 1' / 'store 2' labels, and so that the later
# rename of 'store 3' finds a row to rename.
new_store = pd.DataFrame([{'bikes': 80, 'pants': 1, 'watches': 15}], index=['store 3'])

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Columns missing from new_store are filled with NaN in the new row.
store_items = pd.concat([store_items, new_store])

# Insert a 'shoes' column at column position 3, one value per row.
store_items.insert(3, 'shoes', [8, 5, 0])

store_items

In [None]:
# Rename labels with DataFrame.rename, which returns a new DataFrame.
# `columns` maps old column labels to new ones and `index` does the same
# for row labels; labels missing from the frame are ignored.
store_items = store_items.rename(
    columns={'bikes': 'hats'},
    index={'store 3': 'last store'},
)

In [None]:
# Display the current contents of store_items.
store_items

In [None]:
# Dealing with NaN values (missing data)

# One dictionary per store. A key missing from a dictionary becomes NaN in
# that store's row of the DataFrame.
items2 = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes': 8, 'suits': 45},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5, 'shirts': 2, 'shoes': 5, 'suits': 7},
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes': 10},
]

# Create the DataFrame and label each row with its store.
store_items = pd.DataFrame(items2, index=['store 1', 'store 2', 'store 3'])

store_items

In [None]:
# For each column, report whether it contains at least one missing value.
# NOTE(review): this inspects `df`, while the surrounding NaN cells work on
# `store_items` — confirm which frame was intended.
df.isnull().any()

In [None]:
# Number of NaN values in each column: isnull() marks missing cells as True
# and sum() counts the True values per column.
store_items.isnull().sum()

In [None]:
# Total number of NaN values in the whole DataFrame: the first sum() counts
# per column, the second adds the column counts together.
store_items.isnull().sum().sum()

In [None]:
# Number of non-NaN values in each column. count() must be called on the
# data itself: isnull() yields a frame of booleans with no missing cells, so
# counting that frame always gives the number of rows.
store_items.count()

In [None]:
# dropna returns a new DataFrame and leaves store_items untouched.
# axis=0 drops every row that contains a NaN; the result is not kept here.
store_items.dropna(axis = 0)
# axis=1 drops every column that contains a NaN; the result is not kept here.
store_items.dropna(axis = 1)
# store_items still holds all of its rows and columns, since neither call
# above modified it.
store_items

In [None]:
# Quick inspection of df.
# NOTE(review): these are bare expressions; when run as one notebook cell
# only the last one is rendered (df.info() prints on its own).
# first rows
df.head()
# last rows
df.tail()
# summary statistics
df.describe()
# max value in each column
df.max()

# display the memory usage of a DataFrame
# total usage
df.info()
# usage by column
df.memory_usage()

In [None]:
# Pairwise correlation between the columns of df.
# NOTE(review): from pandas 2.0 on, corr() raises on non-numeric columns
# unless numeric_only=True is passed — check against the CSV's columns.
df.corr()

In [None]:
# Display store_items before grouping it.
store_items

In [None]:
# Group the stores by their 'shoes' value and average 'watches' within
# each group.
store_items.groupby('shoes')['watches'].mean()

#### Reading Files

In [None]:
# Load the comma-separated resume file and use its 'ID' column as the row
# index.
df = pd.read_csv('../datasets/Resume.csv', index_col='ID')
df

In [None]:
# Display the DataFrame.
df

In [None]:
# Python type of the loaded object.
type(df)

In [None]:
# First rows of the DataFrame (5 by default).
df.head()

In [None]:
# Last rows of the DataFrame (5 by default).
df.tail()

In [None]:
# Row labels of the DataFrame.
df.index

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# The data as a NumPy array, without the row and column labels.
df.values

In [None]:
# Summary statistics; by default only the numeric columns are described.
df.describe()

In [None]:
# Summary restricted to the object-dtype columns.
df.describe(include=['object'])

In [None]:
# Summary of every column regardless of dtype; statistics that do not apply
# to a column are shown as NaN.
df.describe(include='all')