## Data Cleaning and Preparation

In [2]:
#Handling missing data
import pandas as pd
import numpy as np

# A small Series of strings with one missing entry (np.nan) to experiment on.
words = ['advark', 'artwork', np.nan, 'avocado']
string_data = pd.Series(words)
# Boolean mask: True wherever the entry is missing (isna is the same check as isnull).
string_data.isna()
# Related helpers:
#   drop the missing entries  -> string_data.dropna()
#   fill the missing entries  -> string_data.fillna('not_available')
#   replace a specific value  -> string_data.replace('advark', string_to_use)

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# Renaming axis indexes
# 3x4 frame of the integers 0..11, labelled by state (rows) and ordinal words (columns).
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)


def transform(x):
    """Return the first four characters of label ``x``, upper-cased."""
    return x[:4].upper()


# Index.map returns a new Index; the frame is unchanged until it is assigned back.
short_labels = data.index.map(transform)
print(short_labels)

data.index = short_labels
print(data)

# rename() produces a modified copy: title-case row labels, upper-case column labels.
data.rename(index=str.title, columns=str.upper)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [13]:
# Detecting and filtering outliers
# 1000 rows x 4 columns of standard-normal samples (unseeded, so values differ on every run).
data = pd.DataFrame(np.random.randn(1000, 4))
# Summary statistics; evaluated here but only a cell's last expression is displayed.
data.describe()
# Select every row that has at least one value above 3 or below -3.
# `axis` is passed by keyword: DataFrame.any() accepts keyword-only arguments in pandas >= 2.0,
# so the positional form `.any(1)` raises TypeError there.
data[(np.abs(data) > 3).any(axis=1)]
# Cap the values outside [-3, 3]: np.sign gives +1 or -1 according to each value's sign,
# so outliers are replaced by +3 or -3 respectively.
data[np.abs(data) > 3] = np.sign(data) * 3
print(data.head(10))

          0         1         2         3
0 -1.504148 -0.358449  0.367153  0.444124
1  0.732885 -0.349412 -1.048279 -0.626143
2 -1.037455 -0.212979 -0.552562 -1.474081
3 -1.531693  0.369302  0.624645 -0.062769
4 -0.815544 -0.404814 -0.448397 -0.989170
5  0.673040 -2.890697 -1.561674  0.178177
6 -0.355101  0.053555  0.816111  0.535184
7 -0.907369 -1.851136  0.484271  1.130043
8  1.377142  1.924302 -0.953519 -0.367355
9  0.964484  0.602929  0.846602  1.794003


In [16]:
# String object methods
val = 'a,b,d,s,  g'
# Splitting on commas keeps any surrounding whitespace in each piece.
raw_parts = val.split(',')
print(raw_parts)
# str.strip removes leading/trailing whitespace, e.g. the two spaces before 'g'.
pieces = list(map(str.strip, raw_parts))
print(pieces)

['a', 'b', 'd', 's', '  g']
['a', 'b', 'd', 's', 'g']


## Data Wrangling: Join, Combine and Reshape

In [22]:
# Hierarchical indexing: lets one axis carry several index levels.
outer_level = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_level = [1, 2, 3, 1, 3, 1, 2, 2, 3]
# Passing a list of two label lists builds a two-level MultiIndex.
data = pd.Series(np.random.randn(9), index=[outer_level, inner_level])
print(data, '\n')
print(data.index, '\n')
# unstack() pivots the inner level into columns; data.unstack().stack() reverses it.
print(data.unstack())

a  1    1.634916
   2    1.423921
   3    0.884919
b  1   -2.506914
   3   -0.869352
c  1   -0.828598
   2   -0.095356
d  2   -0.020519
   3    0.763712
dtype: float64


MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]]) 

          1         2         3
a  1.634916  1.423921  0.884919
b -2.506914       NaN -0.869352
c -0.828598 -0.095356       NaN
d       NaN -0.020519  0.763712


In [29]:
# Combining and merging datasets
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})
# Left join on 'key': every row of df1 is kept; keys absent from df2 ('c') get NaN in data2.
merged_df = df1.merge(df2, on='key', how='left')
print(df1)
print(df2, '\n')

print(merged_df)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6
  key  data2
0   a      0
1   b      1
2   d      2 

  key  data1  data2
0   b      0    1.0
1   b      1    1.0
2   a      2    0.0
3   c      3    NaN
4   a      4    0.0
5   a      5    0.0
6   b      6    1.0


In [31]:
# DataFrame.join() merges on the index: handy for combining frames whose indexes are the
# same or similar but whose columns do not overlap.

left2 = pd.DataFrame(
    [[1., 2.], [3., 4.], [5., 6.]],
    index=['a', 'c', 'e'],
    columns=['Ohio', 'Nevada'],
)
right2 = pd.DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [13, 14]],
    index=['b', 'c', 'd', 'e'],
    columns=['Missouri', 'Alabama'],
)
# Index-on-index outer merge: keeps the union of both indexes.
merged_on_index = left2.merge(right2, how='outer', left_index=True, right_index=True)
print(merged_on_index, '\n')
# The same result expressed with join
left2.join(right2, how='outer')


   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0 



Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [32]:
# Combining arrays along an axis
arr = np.arange(12).reshape(3, 4)
# axis=1 places the two copies side by side, giving a 3x8 result.
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

## Plotting and Visualization

In [36]:
 # IPython magic: select matplotlib's interactive "notebook" backend so plots can be interacted with inside the Jupyter notebook.
%matplotlib notebook
import matplotlib.pyplot as plt
# Plot the integers 0..9 as a single line.
# NOTE: this rebinds `data`, which earlier cells used for pandas objects.
data = np.arange(10)
plt.plot(data)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x1de93bd6240>]

In [39]:
# Figures and subplots
fig = plt.figure()  # create a new, empty figure
# add_subplot(2, 2, k): treat the figure as a 2x2 grid (four slots) and create the k-th subplot.
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4)
# Assign to `_` so the histogram's return value is not echoed as cell output.
_ = ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
#ax2.axis('off')

# A more convenient way of doing the same task: plt.subplots (plural) creates the figure
# and the whole grid of axes in one call and returns them as (fig, axes).
# plt.subplot (singular) is a different function that returns a single Axes and does not
# accept two positional arguments, so it cannot be unpacked this way.
fig, axes = plt.subplots(2, 3)  # 2 rows, 3 columns

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1de9462e6d8>