In [1]:
import pandas as pd
import numpy as np

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan , 'avocado'])

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Toy frame for the missing-data examples: column 'a' is complete,
# 'b' has three gaps and 'c' has one (given as None).
data = {
    'a': [1, 2, 3, 4],
    'b': [np.nan, 2, np.nan, np.nan],
    'c': [1.2, None, 3.3, 4.4],
}
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,a,b,c
0,1,,1.2
1,2,2.0,
2,3,,3.3
3,4,,4.4


In [7]:
series = pd.Series([1,2.2,3,np.nan,None,5.432])

In [8]:
series.isna()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [9]:
series[series.notnull()]

0    1.000
1    2.200
2    3.000
5    5.432
dtype: float64

In [10]:
series.dropna()

0    1.000
1    2.200
2    3.000
5    5.432
dtype: float64

In [11]:
df

Unnamed: 0,a,b,c
0,1,,1.2
1,2,2.0,
2,3,,3.3
3,4,,4.4


In [12]:
df.dropna(axis=1,how="all")

Unnamed: 0,a,b,c
0,1,,1.2
1,2,2.0,
2,3,,3.3
3,4,,4.4


In [13]:
# Fill the gaps in each of the three columns with that column's own mean.
df.fillna({col: df[col].mean() for col in ('a', 'b', 'c')})

Unnamed: 0,a,b,c
0,1,2.0,1.2
1,2,2.0,2.966667
2,3,2.0,3.3
3,4,2.0,4.4


In [14]:
df.fillna(df.mean())

Unnamed: 0,a,b,c
0,1,2.0,1.2
1,2,2.0,2.966667
2,3,2.0,3.3
3,4,2.0,4.4


In [15]:
dup = pd.Series([1,1,2,3,4,5,6,7,7])

In [16]:
dup.duplicated()

0    False
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8     True
dtype: bool

In [17]:
dup.drop_duplicates()

0    1
2    2
3    3
4    4
5    5
6    6
7    7
dtype: int64

In [18]:
# Two-column frame whose last row repeats the row before it
# (used by the duplicated() example).
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [19]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [20]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [21]:
data['ash'] = [10,1,2,3,4,5,6]

In [22]:
data

Unnamed: 0,k1,k2,ash
0,one,1,10
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [23]:
# Food/weight sample; the 'food' labels mix letter case ('Pastrami', 'Bacon').
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon',
             'Pastrami', 'corned beef', 'Bacon',
             'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

In [24]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [25]:
# Lookup table: lower-case food name -> animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [26]:
data['food'] = data['food'].str.lower()

In [27]:
data['aminal'] = data['food'].map(meat_to_animal)

In [28]:
data

Unnamed: 0,food,ounces,aminal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [29]:
# 3x4 frame holding 0..11, labelled by state (rows) and ordinal word (columns).
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [30]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [37]:
def fun(x):
    """Return ``x.upper()``, i.e. the upper-cased form of a string label."""
    return x.upper()

In [42]:
data.index.map(fun)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [43]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [50]:
def fun1(x):
    """Return the first element of ``x`` (the first character of a string label)."""
    return x[0]


# Shorten every row and column label to its first character.  'two' and
# 'three' both become 't', so the renamed frame carries a duplicate column label.
# Rebinding the name (instead of inplace=True) leaves the previous frame object intact.
data = data.rename(index=fun1, columns=fun1)

In [51]:
data

Unnamed: 0,o,t,t.1,f
O,0,1,2,3
C,4,5,6,7
N,8,9,10,11


In [56]:
# Integers 1-4 followed by 7-16 (5 and 6 are absent), used as input for binning.
n = pd.Series(list(range(1, 5)) + list(range(7, 17)))

In [57]:
n

0      1
1      2
2      3
3      4
4      7
5      8
6      9
7     10
8     11
9     12
10    13
11    14
12    15
13    16
dtype: int64

In [58]:
bins = [1,5,10,15,20]

In [70]:
# Bucket each value of n into the labelled intervals defined by bins.
# Intervals are right-closed by default, so the first one is (1, 5]: the value 1
# itself lies outside every bin and comes back as NaN.
binning = pd.cut(n, bins=bins, labels=['one', 'two', 'three', 'four'])

In [71]:
binning.dtype

CategoricalDtype(categories=['one', 'two', 'three', 'four'], ordered=True)

In [72]:
# Count how many values landed in each bin.  Use the Series method: the
# top-level pd.value_counts function is deprecated.
binning.value_counts()

three    5
two      4
one      3
four     1
dtype: int64

In [74]:
binning

0       NaN
1       one
2       one
3       one
4       two
5       two
6       two
7       two
8     three
9     three
10    three
11    three
12    three
13     four
dtype: category
Categories (4, object): [one < two < three < four]

In [78]:
# 5x5 frame of uniform random draws from [0, 1); no seed is set, so the values
# differ from run to run.
a = pd.DataFrame(np.random.rand(5, 5), columns=list('abcde'))

In [79]:
a

Unnamed: 0,a,b,c,d,e
0,0.532711,0.726117,0.445131,0.023865,0.800603
1,0.778485,0.298212,0.001928,0.382255,0.751219
2,0.512331,0.106249,0.511247,0.115928,0.574015
3,0.48262,0.732134,0.398209,0.678821,0.000288
4,0.37603,0.896617,0.973497,0.631646,0.235632


In [81]:
# describe must be *called* to get the summary statistics; the bare attribute
# only echoes the bound-method repr.  Re-run the cell to regenerate its output.
a.describe()

In [91]:
# Values of column 'b' whose absolute value exceeds 0.5, picked with a single
# .loc selection instead of chained indexing.
a.loc[a['b'].abs() > 0.5, 'b']

0    0.726117
3    0.732134
4    0.896617
Name: b, dtype: float64

In [100]:
# Values of column 'b' greater than 0.5, selected in one .loc step.
a.loc[a['b'].gt(0.5), 'b']

0    0.726117
3    0.732134
4    0.896617
Name: b, dtype: float64

In [121]:
a[a['b']>0.5]

Unnamed: 0,a,b,c,d,e
0,3.0,3.0,3.0,3.0,3.0
3,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0


In [122]:
# Cap every value in the rows where column 'b' exceeds 0.5 at magnitude 3,
# keeping each value's sign.  This is what yields the 3.0 entries displayed for
# those rows; multiplying the sign by the selected values themselves would
# leave these (positive) numbers unchanged.
# NOTE(review): the cap of 3 is inferred from the recorded outputs - confirm.
rows_over_half = a['b'] > 0.5
a.loc[rows_over_half] = np.sign(a.loc[rows_over_half]) * 3

In [123]:
a

Unnamed: 0,a,b,c,d,e
0,3.0,3.0,3.0,3.0,3.0
1,0.778485,0.298212,0.001928,0.382255,0.751219
2,0.512331,0.106249,0.511247,0.115928,0.574015
3,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0


In [125]:
np.random.permutation(10)

array([7, 6, 5, 2, 8, 9, 0, 4, 3, 1])

In [139]:
# Relabel columns 'a'..'e' with the integers 1..5.
a = a.rename(columns={name: pos for pos, name in enumerate('abcde', start=1)})

In [140]:
a

Unnamed: 0,1,2,3,4,5
0,3.0,3.0,3.0,3.0,3.0
1,0.778485,0.298212,0.001928,0.382255,0.751219
2,0.512331,0.106249,0.511247,0.115928,0.574015
3,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0


In [145]:
sampler = np.random.permutation(5)
sampler

array([4, 0, 3, 1, 2])

In [151]:
a.iloc[sampler]

Unnamed: 0,1,2,3,4,5
4,3.0,3.0,3.0,3.0,3.0
0,3.0,3.0,3.0,3.0,3.0
3,3.0,3.0,3.0,3.0,3.0
1,0.778485,0.298212,0.001928,0.382255,0.751219
2,0.512331,0.106249,0.511247,0.115928,0.574015


In [152]:
a.sample(n=3)

Unnamed: 0,1,2,3,4,5
4,3.0,3.0,3.0,3.0,3.0
1,0.778485,0.298212,0.001928,0.382255,0.751219
0,3.0,3.0,3.0,3.0,3.0


In [191]:
# Small frame with a categorical 'key' column to one-hot encode and a numeric
# 'data1' column counting 0..5.
df = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})


In [192]:
a = pd.get_dummies(df['key'],prefix='key')
a

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [193]:
df = df.join(a)

In [194]:
df.pop('key')

0    b
1    b
2    a
3    c
4    a
5    b
Name: key, dtype: object

In [195]:
df

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [197]:
mnames = ['movie_id', 'title', 'genres']

In [251]:
# The '::' separator is longer than one character, which only the Python
# parsing engine supports; naming the engine explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
movies = pd.read_table(
    r'C:\Users\ashwi\Desktop\Ashwin\pydata-book\datasets\movielens\movies.dat',
    sep='::', header=None, names=mnames, engine='python',
)


In [252]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [255]:
# Flatten the pipe-separated genre strings of all movies into one list of labels.
genres = [label for entry in movies.genres for label in entry.split('|')]
# Distinct labels in order of first appearance.  The list is converted to an
# object array first because passing a plain list to pd.unique is deprecated.
gen = pd.unique(np.asarray(genres, dtype=object))

In [264]:
zero_matrix = np.zeros((len(movies), len(gen)))

In [266]:
dummies = pd.DataFrame(zero_matrix, columns=gen)

In [272]:
# Set a 1 in row i of the indicator matrix for every genre listed for movie i.
# The loop variable is deliberately not named `gen`: that name holds the array
# of unique genres that sizes and labels `dummies`, and rebinding it to a
# string here would break a re-run of the cells that read it.
for i, genre_field in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(genre_field.split('|'))
    dummies.iloc[i, indices] = 1

In [270]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [273]:
movies.join(dummies)

Unnamed: 0,movie_id,title,genres,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
val = 'a,b,c,         ashwin'

In [20]:
[x.strip() for x in val.split(',')]

['a', 'b', 'c', 'ashwin']

In [22]:
val.replace(',',':')

'a:b:c:         ashwin'

In [24]:
val.endswith('n')

True