In [1]:
import os
import sys

import numpy as np
import pandas as pd
from numpy.random import randn

In [2]:
# Sample values and the string labels that will later serve as their index.
data = [10, 20, 30]
labels = list('abc')


In [3]:
# Convert the plain Python list into a NumPy array and display it.
arr = np.array(data)
arr

array([10, 20, 30])

In [4]:
# Build a Series from the list; with no index supplied the entries are labelled 0, 1, 2.
s = pd.Series(data)
s

0    10
1    20
2    30
dtype: int64

In [5]:
# Rebuild the Series, this time using the string labels as its index
# instead of the default integer positions.
s = pd.Series(data=data, index=labels)
# End the cell on the bare expression rather than print(): the notebook
# displays the value of a cell's last expression on its own.
s

a    10
b    20
c    30
dtype: int64


In [6]:
# Look up a single value by its index label.
print(s['a'])

10


In [8]:
# Seed NumPy's global random generator so the matrix below is reproducible.
np.random.seed(42)

# 5 rows x 4 columns of standard-normal draws from that seeded generator.
rand_mat = np.random.randn(5, 4)
rand_mat

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986],
       [-0.23415337, -0.23413696,  1.57921282,  0.76743473],
       [-0.46947439,  0.54256004, -0.46341769, -0.46572975],
       [ 0.24196227, -1.91328024, -1.72491783, -0.56228753],
       [-1.01283112,  0.31424733, -0.90802408, -1.4123037 ]])

In [9]:
# Wrap the random matrix in a DataFrame; rows and columns get default integer labels.
df = pd.DataFrame(rand_mat)
df

Unnamed: 0,0,1,2,3
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304


In [10]:
# Select the column labelled 0 (a default integer label here); the result is a Series.
df[0]

0    0.496714
1   -0.234153
2   -0.469474
3    0.241962
4   -1.012831
Name: 0, dtype: float64

In [13]:
# Same values, now with rows labelled A-E; columns keep their default integer labels.
df = pd.DataFrame(rand_mat, index=list('ABCDE'))
df

Unnamed: 0,0,1,2,3
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [14]:
# Fully labelled frame: rows A-E and columns W-Z over the same random values.
df = pd.DataFrame(
    rand_mat,
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [15]:
# Select a single column by label; the result is a Series named after the column.
df['Y']

A    0.647689
B    1.579213
C   -0.463418
D   -1.724918
E   -0.908024
Name: Y, dtype: float64

In [16]:
# Indexing with a list of labels selects several columns and returns a DataFrame.
cols = list('XY')
df[cols]

Unnamed: 0,X,Y
A,-0.138264,0.647689
B,-0.234137,1.579213
C,0.54256,-0.463418
D,-1.91328,-1.724918
E,0.314247,-0.908024


In [17]:
# Attribute-style access: returns the 'Z' column as a Series.
df.Z

A    1.523030
B    0.767435
C   -0.465730
D   -0.562288
E   -1.412304
Name: Z, dtype: float64

In [21]:
# Show the frame with 'x_' prepended to every column label.
# The result is only displayed, not assigned, so df keeps its W-Z column names.
df.add_prefix('x_')

Unnamed: 0,x_W,x_X,x_Y,x_Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [22]:
# Apply several aggregations at once: each listed function becomes one row,
# computed separately for every column.
df.agg(['sum', 'mean'])

Unnamed: 0,W,X,Y,Z
sum,-0.977782,-1.428874,-0.869458,-0.149856
mean,-0.195556,-0.285775,-0.173892,-0.029971


In [25]:
# Print a structural summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 372.0+ bytes


In [26]:
# Preview the leading rows; this frame has only five rows, so all of them are shown.
df.head()

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [28]:
# Element-wise comparison: yields a DataFrame of booleans with the same labels as df.
bool_df = df > 0
bool_df

Unnamed: 0,W,X,Y,Z
A,True,False,True,True
B,False,False,True,True
C,False,True,False,False
D,True,False,False,False
E,False,True,False,False


In [29]:
# Indexing with a boolean DataFrame keeps the frame's shape; cells where the mask
# is False come back as missing values (blank in the output below).
df[bool_df]

Unnamed: 0,W,X,Y,Z
A,0.496714,,0.647689,1.52303
B,,,1.579213,0.767435
C,,0.54256,,
D,0.241962,,,
E,,0.314247,,


In [30]:
# Comparing a single column yields a boolean Series with one flag per row.
df['X'] > 0

A    False
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [31]:
# Indexing with a boolean Series filters rows: only rows where X is positive
# (C and E here) are returned, with all columns kept.
df[df['X'] > 0]

Unnamed: 0,W,X,Y,Z
C,-0.469474,0.54256,-0.463418,-0.46573
E,-1.012831,0.314247,-0.908024,-1.412304


In [32]:
# Label-based row selection: returns row 'A' as a Series indexed by the column names.
df.loc['A']

W    0.496714
X   -0.138264
Y    0.647689
Z    1.523030
Name: A, dtype: float64

In [34]:
# Position-based row selection: position 0 is row 'A', so this matches df.loc['A'].
df.iloc[0]

W    0.496714
X   -0.138264
Y    0.647689
Z    1.523030
Name: A, dtype: float64

In [36]:
# Combine two row conditions element-wise with `&`; each comparison is wrapped in
# its own parentheses because `&` binds more tightly than the comparison operators.
df[
    (df['W'] > 0)
    & (df['Z'] > 1)
]

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303


In [38]:
# Small sales table: one row per salesperson, two people per company.
# NOTE: this rebinds both `data` and `df`, replacing the objects used in earlier cells.
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)
df = pd.DataFrame(data)
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [40]:
# groupby on its own returns a DataFrameGroupBy object (its repr is shown below),
# not an aggregated table.
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7dcb05815f10>

In [41]:
# Keep the grouped object, then total the Sales column within each company.
# The result is a Series indexed by Company.
gb = df.groupby('Company')
gb['Sales'].sum()

Company
FB      593
GOOG    320
MSFT    464
Name: Sales, dtype: int64

In [43]:
def times_two(num):
    """Return ``num`` multiplied by 2.

    Works for any operand that supports ``* 2`` (numbers are doubled;
    sequences such as strings or lists are repeated twice).
    """
    doubled = num * 2
    return doubled

In [50]:
# Run times_two on every Sales value and store the results in a new 'sales_2'
# column. This adds the column to df itself rather than to a copy.
df['sales_2'] = df['Sales'].apply(times_two)
df

Unnamed: 0,Company,Person,Sales,sales_2
0,GOOG,Sam,200,400
1,GOOG,Charlie,120,240
2,MSFT,Amy,340,680
3,MSFT,Vanessa,124,248
4,FB,Carl,243,486
5,FB,Sarah,350,700


In [51]:
# Remove the helper column again; the frame returned by drop is rebound to df.
df = df.drop(columns='sales_2')
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [52]:
# Show the rows ordered by Sales, smallest first; the result is displayed, not assigned.
df.sort_values(by='Sales')

Unnamed: 0,Company,Person,Sales
1,GOOG,Charlie,120
3,MSFT,Vanessa,124
0,GOOG,Sam,200
4,FB,Carl,243
2,MSFT,Amy,340
5,FB,Sarah,350
