## Loading and Saving to External files

In [1]:
import numpy as np
import pandas as pd

In [5]:
# Build a small integer array and persist it in NumPy's binary .npy format.
arr = np.arange(10)
# np.save appends the '.npy' extension, so this writes 'my_array.npy' to the
# current working directory, where it persists in the file system.
np.save('my_array', arr)
# To bundle several arrays into a single '.npz' archive use np.savez; note that
# np.savez does NOT compress -- np.savez_compressed is the compressing variant.
arr  # last expression of the cell, so the notebook actually renders the array

In [6]:
# Load the array back from the .npy file produced by np.save; NumPy handles
# the file access itself, so no explicit open()/close() is needed here.
arr2 = np.load(file='my_array.npy')
arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
# Write a shuffled integer array to a text file of comma-separated values (CSV).
np.random.seed(20)  # fixed seed so the shuffle is reproducible on every run
r = np.arange(10)
np.random.shuffle(r)  # shuffles in place: r now holds 0..9 in a random order
# savetxt's default format writes every value in floating-point notation
# ('%.18e'), which is why the integers come back as floats when reloaded.
np.savetxt('my_arr.csv', r, delimiter=',')
r  # last expression of the cell, so the notebook renders the shuffled array

In [18]:
# Read the CSV back. The delimiter is passed explicitly so that it matches the
# one used when the file was written; without it the load only works because a
# 1-D array is stored as one value per line. loadtxt returns floats by default.
d = np.loadtxt('my_arr.csv', delimiter=',')
d

array([7., 1., 8., 5., 0., 2., 6., 9., 4., 3.])

### Loading Large CSV data

In [22]:
# pandas can load very large CSV data sets (limited only by available
# resources). The path is relative, so the file must be accessible from the
# notebook's working directory.
df = pd.read_csv('data/salaries.csv')
# Only the last expression of a cell is rendered automatically, so the first
# and last rows are shown explicitly with display(), which the Jupyter/IPython
# kernel provides as a built-in.
display(df.head(), df.tail())
# Summary statistics give a quick feel for the numeric columns.
df.describe()

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


In [31]:
# Explore the dataframe. Only the final expression of a cell is rendered
# automatically, so the intermediate results are passed to display(), which
# the Jupyter/IPython kernel provides as a built-in.
display(
    df.columns,            # column labels
    df.size,               # total number of data points (rows * columns)
    df.sum(),              # per-column totals
    df.max(),              # per-column maxima (df.min() gives the minima)
    df['salary'].mean(),   # mean of a single column
)
df['phd']  # selecting a single column returns a Series

0     56
1     12
2     23
3     40
4     20
      ..
73    18
74    19
75    17
76    28
77    23
Name: phd, Length: 78, dtype: int64