# NumPy: Numerical Python

NumPy is the cornerstone of scientific computing in the Python community.

* The `numpy` package (module) is used in almost all numerical computations using Python. It is a package that provides high-performance vector, matrix, and higher-dimensional data structures for Python. It is implemented in C and Fortran, and the computational performance is very efficient.


In [1]:
# import the library
import numpy as np  #now you can use np as the abbreviation for numpy. 

##  1. NumPy Arrays
The core object in NumPy is the array (`numpy.ndarray`). Arrays are handled much like Python lists, but they come with many additional functions.

In [2]:
# start from an ordinary Python list
my_list = [1, 2, 7, 18, 4]
my_list

[1, 2, 7, 18, 4]

In [3]:
# convert the Python list into a NumPy array
my_array = np.array(my_list)
my_array

array([ 1,  2,  7, 18,  4])

In [4]:
# check the data type of my_list
type(my_list)

list

In [5]:
# check the data type for my_array
type(my_array)

numpy.ndarray

In [6]:
# Built-in constructors
# np.arange(a, b, c) returns evenly spaced values from the half-open interval [a, b), c apart.
# The step c is the difference between two consecutive values; it defaults to 1 when omitted.
np.arange(0, 4)

array([0, 1, 2, 3])

In [7]:
# step c = 3: values start at 0 and stop before reaching 10
np.arange(0, 10, 3)

array([0, 3, 6, 9])

In [8]:
# np.zeros(n) returns a new array of the given shape, filled with zeros.
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [9]:
# passing a tuple as the shape generates a 5*5 array filled with zeros
np.zeros((5, 5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [10]:
# use dtype=int if you want all values to be integers
np.zeros((5, 5), dtype=int)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [11]:
# Return a new array of given shape and type, filled with ones.
np.ones(5)

array([1., 1., 1., 1., 1.])

In [12]:
# a tuple shape works for np.ones too: a 5*5 array filled with ones
np.ones((5, 5))

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [13]:
# rand: draw random samples from a uniform distribution from 0 to 1
y = np.random.rand(3)
y

array([0.35439072, 0.02437259, 0.63007634])

In [14]:
# two arguments give a 3*3 array of samples from the uniform distribution from 0 to 1
x = np.random.rand(3, 3)
x

array([[0.70383076, 0.81394   , 0.76931436],
       [0.88832202, 0.3737811 , 0.47416959],
       [0.4396101 , 0.22561131, 0.32282499]])

In [15]:
# Tip: type `np.random.` and press Tab to browse the available functions

In [16]:
# randn: draw random samples from the standard normal distribution (mean 0 and std 1)
np.random.randn(5)

array([-1.74162762,  0.95683044, -0.15282362,  0.4675587 , -1.4683796 ])

In [17]:
# two arguments give a 5*5 array of standard normal samples
np.random.randn(5, 5)

array([[ 1.25487712,  0.77393837, -0.46627387, -0.48920982,  0.87150751],
       [-1.56511044, -0.74850147,  0.1092666 ,  0.40238865,  0.08737294],
       [ 1.50056235,  0.85014037,  0.23438452, -0.70409758,  1.45063604],
       [ 0.82982006,  2.56920153,  0.64178998, -0.63943294, -0.9794917 ],
       [-1.37424461, -1.07312177,  1.71668945, -0.91889073, -0.331414  ]])

In [18]:
# randint(a, b): draw a random integer from a (included) to b (excluded)
np.random.randint(1, 5)

4

In [19]:
# generate 20 random numbers, each of them a random integer drawn from [1, 5)
np.random.randint(1, 5, 20)

array([2, 1, 4, 2, 2, 3, 3, 4, 4, 1, 1, 4, 3, 1, 4, 2, 3, 4, 4, 3])

In [20]:
# run this cell multiple times: each run gives different random numbers
np.random.randn(5)

array([ 0.72251164, -0.79830767,  0.84793415, -0.48114253,  2.04130115])

In [21]:
# a seed fixes the random state, so the numbers drawn right after it are the same on every run
np.random.seed(100)
np.random.randn(5)

array([-1.74976547,  0.3426804 ,  1.1530358 , -0.25243604,  0.98132079])

In [22]:
# You can choose another seed; once a seed is set, the random numbers generated after it stay the same.
# However, seed(100) generates different numbers from seed(123).
np.random.seed(123)
np.random.randn(5)

array([-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471, -0.57860025])

In [23]:
# A seed fixes the generator state only at the moment it is set: the first draw after it
# is reproducible, but the next draw continues the stream and gives different numbers.
np.random.seed(100)
x = np.random.randn(5)
print(x)

# np.random.seed(100)  # uncomment to re-seed, which makes y repeat x
y = np.random.randn(5)
print(y)

[-1.74976547  0.3426804   1.1530358  -0.25243604  0.98132079]
[ 0.51421884  0.22117967 -1.07004333 -0.18949583  0.25500144]


In [24]:
# Array methods: start from the integers 1 to 9
my_array = np.arange(1, 10)
my_array

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [25]:
# reshape the nine values above into a 3*3 array (3 rows, 3 columns)
new_array = my_array.reshape(3, 3)
new_array

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [26]:
# add extra values to the end of my_array; the result of np.append is assigned back to my_array
my_array = np.append(my_array, [100, -100])
my_array

array([   1,    2,    3,    4,    5,    6,    7,    8,    9,  100, -100])

In [27]:
# calculate the maximum and minimum of my_array
print("max value: ", my_array.max())
print("min value: ", my_array.min())


max value:  100
min value:  -100


## 2. Numpy indexing and extraction

As with lists, you access the elements of a NumPy array by indexing and slicing.

In [28]:
# a fresh array holding the integers 1 to 10
my_array = np.arange(1, 11)
my_array

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [29]:
# extraction is very similar to list extraction
my_array[1]

2

In [30]:
# slicing: positions 6, 7 and 8 (the stop position 9 is excluded)
my_array[6:9]

array([7, 8, 9])

In [31]:
# With NumPy arrays, you can broadcast a single value across a larger set of values.
# This is not possible using lists.
my_array[0:5] = 100
my_array

array([100, 100, 100, 100, 100,   6,   7,   8,   9,  10])

In [32]:
# the same values as a plain Python list, for comparison
my_list = list(range(1, 11))
my_list

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [33]:
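# my_list[0:2] = 100  # uncomment to try it: a list raises a TypeError here, because a slice can only be assigned an iterable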

In [34]:
# with a list, you have to spell out every value you want to change
my_list[0:2] = [100, 100]
my_list

[100, 100, 3, 4, 5, 6, 7, 8, 9, 10]

### Extracting with conditional selection


In [35]:
my_array

array([100, 100, 100, 100, 100,   6,   7,   8,   9,  10])

In [36]:
# boolean mask: True where the element is at least 10
condition = my_array >= 10

In [37]:
# indexing with the mask keeps only the elements where it is True
my_array[condition]

array([100, 100, 100, 100, 100,  10])

In [38]:
# another example: the integers 0 to 99
my_array = np.arange(100)
my_array

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [39]:
# the condition can also be written directly inside the brackets
my_array[my_array > 50]

array([51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

## 3. Numpy operations

In [40]:
# two plain Python lists: + joins them end to end
x, y = [2, 4], [3, 5]
x + y

[2, 4, 3, 5]

Lists: `+` concatenates the two lists.

NumPy arrays: `+` performs the arithmetic operation element by element.


In [41]:
# Recap from the previous class on Python lists:
x = [2, 3]  # x is a list
y = [4, 5]  # y is another list

k = x + y  # for lists, + concatenates
j = x * 3  # for a list, * 3 concatenates the same list three times

print(k)
print(j)

# q = x / 3  # / (division): you cannot divide a list
# i = x - y  # - (subtraction): also not supported for Python lists
# Key take-away: don't think of Python lists as vectors.

[2, 3, 4, 5]
[2, 3, 2, 3, 2, 3]


In [42]:
# Now the same numbers as NumPy arrays:
x = [2, 3]  # x is a list
y = [4, 5]  # y is another list

# convert each list into an array
x_array, y_array = np.array(x), np.array(y)

In [43]:
x_array + y_array   # think of the two arrays as two vectors
# recall how vector addition works: the elements are added one by one

array([6, 8])

In [44]:
# multiplying by a number scales every element (unlike a list, nothing is repeated)
(x_array + y_array)*3

array([18, 24])

In [45]:
# subtraction is done element by element
x_array - y_array 

array([-2, -2])

In [46]:
# * multiplies element by element: [2*4, 3*5]
x_array * y_array 

array([ 8, 15])

In [47]:
# division is done element by element too: [2/4, 3/5]
x_array / y_array 

array([0.5, 0.6])

### Easy access to basic stats


In [48]:
np.random.seed(123)
x = np.random.randn(100)  # generate 100 random numbers from the standard normal distribution
x

array([-1.0856306 ,  0.99734545,  0.2829785 , -1.50629471, -0.57860025,
        1.65143654, -2.42667924, -0.42891263,  1.26593626, -0.8667404 ,
       -0.67888615, -0.09470897,  1.49138963, -0.638902  , -0.44398196,
       -0.43435128,  2.20593008,  2.18678609,  1.0040539 ,  0.3861864 ,
        0.73736858,  1.49073203, -0.93583387,  1.17582904, -1.25388067,
       -0.6377515 ,  0.9071052 , -1.4286807 , -0.14006872, -0.8617549 ,
       -0.25561937, -2.79858911, -1.7715331 , -0.69987723,  0.92746243,
       -0.17363568,  0.00284592,  0.68822271, -0.87953634,  0.28362732,
       -0.80536652, -1.72766949, -0.39089979,  0.57380586,  0.33858905,
       -0.01183049,  2.39236527,  0.41291216,  0.97873601,  2.23814334,
       -1.29408532, -1.03878821,  1.74371223, -0.79806274,  0.02968323,
        1.06931597,  0.89070639,  1.75488618,  1.49564414,  1.06939267,
       -0.77270871,  0.79486267,  0.31427199, -1.32626546,  1.41729905,
        0.80723653,  0.04549008, -0.23309206, -1.19830114,  0.19

In [49]:
# What should the mean of x be?
# To see the list of built-in methods of a NumPy array x, type:
# x.   (then press Tab)

In [50]:
# basic statistics are available as methods on the array itself
for label, stat in (
    ("mean: ", x.mean()),
    ("std: ", x.std()),
    ("max:", x.max()),
    ("min:", x.min()),
    ("sum:", x.sum()),
):
    print(label, stat)

mean:  0.027109073490359778
std:  1.128240470477961
max: 2.392365265937726
min: -2.7985891054607244
sum: 2.7109073490359776


In [51]:
# The more observations you draw, the closer the sample mean and std get to those of the
# true distribution (the standard normal distribution).
# Each pass uses the same seed; only the first std line is followed by an extra "\n".
for n_obs, gap in ((100, ("\n",)), (1000000, ())):
    np.random.seed(123)
    x = np.random.randn(n_obs)
    print(f"Below are the mean and std for {n_obs} observations")
    print("mean: ", x.mean())
    print("std: ", x.std(), *gap)

Below are the mean and std for 100 observations
mean:  0.027109073490359778
std:  1.128240470477961 

Below are the mean and std for 1000000 observations
mean:  0.0006295013073554934
std:  1.0002585536020998


# Pandas

Topics to be covered:

1. Series
2. DataFrame

## 1. Series

**Series are very similar to NumPy arrays.** The difference is that a Series can have axis labels, meaning it can be indexed by a label, instead of just a number location. We can convert a list or a numpy array to a Series.

In [52]:
#import both numpy and pandas library
import numpy as np
import pandas as pd

In [53]:
# a list of strings to turn into a Series
my_list = ["NY", "CA", "London", "HK"]
my_list

['NY', 'CA', 'London', 'HK']

In [54]:
# create a pandas Series from the list
my_series = pd.Series(my_list)  # mind the capital S in pd.Series
my_series

0        NY
1        CA
2    London
3        HK
dtype: object

In [55]:
# without custom labels the index is 0, 1, 2, ... so 0 selects the first element
my_series[0]  

'NY'

In [56]:
# attach one custom label to each element through the index argument
labels = ['a', 'b', 'c', 'd']
my_series_withlabels = pd.Series(my_list, index=labels)
my_series_withlabels

a        NY
b        CA
c    London
d        HK
dtype: object

In [57]:
# Select by position with .iloc. On a Series whose index holds labels ('a', 'b', ...),
# plain [0] only works through an integer-position fallback that pandas has deprecated.
my_series_withlabels.iloc[0]

'NY'

In [58]:
# unlike a NumPy array, a Series can also be indexed by its label
my_series_withlabels['a']  

'NY'

## 2. DataFrame

Although Series are useful, DataFrame is what we normally use pandas for.

In [59]:
# A DataFrame is two-dimensional, size-mutable, potentially heterogeneous tabular data
# (consisting of multiple columns/Series).

df = pd.DataFrame(np.random.rand(10,4))  # generate random numbers: 10 rows, 4 columns
df

Unnamed: 0,0,1,2,3
0,0.425735,0.698077,0.569767,0.986204
1,0.79134,0.492444,0.402151,0.333946
2,0.298938,0.140941,0.639572,0.957413
3,0.532439,0.112969,0.488943,0.248365
4,0.623985,0.903104,0.401284,0.808012
5,0.511983,0.289461,0.592615,0.275621
6,0.667187,0.658118,0.165401,0.583045
7,0.432402,0.848647,0.361279,0.879314
8,0.988927,0.012057,0.962098,0.052259
9,0.34181,0.357579,0.912219,0.086617


In [60]:
# We can name each column: here each one gets the name of a listed Chinese company
df = pd.DataFrame(
    np.random.rand(10, 4),
    columns=['Tencent', 'Alibaba', 'Baidu', 'JD'],
)
df


Unnamed: 0,Tencent,Alibaba,Baidu,JD
0,0.708777,0.889998,0.953656,0.457647
1,0.614,0.060706,0.377732,0.903219
2,0.86496,0.769361,0.608371,0.342222
3,0.108061,0.585095,0.194292,0.344749
4,0.803792,0.003262,0.859627,0.816037
5,0.673859,0.324808,0.246341,0.299904
6,0.561033,0.171065,0.205809,0.585177
7,0.363142,0.979281,0.401054,0.755531
8,0.137862,0.62728,0.235453,0.435365
9,0.341459,0.061194,0.636531,0.225624


In [61]:
# we can pass a date index: build 10 consecutive daily dates starting on 2018-01-01
dates = pd.date_range('20180101', periods=10)
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10'],
              dtype='datetime64[ns]', freq='D')

In [62]:
# Optional: uncomment to build 10 weekly dates instead of daily ones (freq="W")
# dates = pd.date_range(start='20180101', periods=10, freq="W")
# dates

In [63]:
# same layout as before, now with the dates as the row index
df = pd.DataFrame(
    np.random.rand(10, 4),
    columns=['Tencent', 'Alibaba', 'Baidu', 'JD'],
    index=dates,
)
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.338399,0.155333,0.564068,0.470597
2018-01-02,0.265059,0.354282,0.909707,0.534934
2018-01-03,0.1319,0.1576,0.283244,0.723055
2018-01-04,0.988508,0.736285,0.368495,0.805311
2018-01-05,0.302819,0.770362,0.320613,0.827408
2018-01-06,0.457602,0.132751,0.666649,0.54642
2018-01-07,0.153805,0.380403,0.402076,0.203379
2018-01-08,0.080599,0.51581,0.868683,0.869231
2018-01-09,0.097541,0.013034,0.995757,0.469287
2018-01-10,0.219531,0.034979,0.356078,0.481622


In [64]:
# set the seed first if you want to observe the same results as mine
np.random.seed(100)
df = pd.DataFrame(
    np.random.rand(10, 4),
    columns=['Tencent', 'Alibaba', 'Baidu', 'JD'],
    index=dates,
)
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


## Access data

In [65]:
# head() and tail() let you look at the beginning or the end of df
df.head()  # shows the first 5 rows
# df.head(2)  # pass a number to choose how many rows to show

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074


In [66]:
# shows the last 5 rows
df.tail()

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [67]:
# you can print the columns' names
df.columns

Index(['Tencent', 'Alibaba', 'Baidu', 'JD'], dtype='object')

In [68]:
# you can access the index
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10'],
              dtype='datetime64[ns]', freq='D')

In [69]:
# you can have a description of the data (summary statistics of each column)
df.describe()

Unnamed: 0,Tencent,Alibaba,Baidu,JD
count,10.0,10.0,10.0,10.0
mean,0.417069,0.302535,0.537171,0.579218
std,0.3482,0.283118,0.345629,0.295192
min,0.004719,0.015255,0.005689,0.209202
25%,0.146383,0.111675,0.270903,0.289583
50%,0.308516,0.225155,0.634796,0.590353
75%,0.732598,0.379666,0.817293,0.840045
max,0.980921,0.94003,0.891322,0.978624


In [70]:
# df.T transposes the frame (dates become columns), so describe() now summarises
# each date across the four companies
df.T.describe()

Unnamed: 0,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08,2018-01-09,2018-01-10
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.522767,0.405722,0.453081,0.373007,0.518481,0.631374,0.201589,0.503391,0.353495,0.627078
std,0.240485,0.403442,0.349529,0.406418,0.343726,0.292697,0.153808,0.33808,0.387843,0.415845
min,0.278369,0.004719,0.136707,0.108377,0.171941,0.336112,0.005689,0.015255,0.036476,0.059942
25%,0.387981,0.092357,0.191078,0.16609,0.248541,0.407806,0.13298,0.452946,0.08798,0.447662
50%,0.483961,0.396159,0.392148,0.202513,0.542878,0.624677,0.213918,0.601324,0.243546,0.733724
75%,0.618748,0.709525,0.65415,0.409429,0.812819,0.848244,0.282528,0.651769,0.50906,0.91314
max,0.844776,0.825853,0.891322,0.978624,0.816225,0.94003,0.372832,0.795663,0.890412,0.980921


### Indexing and extraction

In [71]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [72]:
# you can access a specific column; single brackets give back one column with its name
df['Tencent']
# type(df['Tencent'])  # uncomment to check the type of the result

2018-01-01    0.543405
2018-01-02    0.004719
2018-01-03    0.136707
2018-01-04    0.185328
2018-01-05    0.811683
2018-01-06    0.431704
2018-01-07    0.175410
2018-01-08    0.795663
2018-01-09    0.105148
2018-01-10    0.980921
Freq: D, Name: Tencent, dtype: float64

In [73]:
# double brackets (a list of column names) give back a table with just that column
df[['Tencent']]
# type(df[['Tencent']])  # uncomment to check the type of the result

Unnamed: 0,Tencent
2018-01-01,0.543405
2018-01-02,0.004719
2018-01-03,0.136707
2018-01-04,0.185328
2018-01-05,0.811683
2018-01-06,0.431704
2018-01-07,0.17541
2018-01-08,0.795663
2018-01-09,0.105148
2018-01-10,0.980921


In [74]:
df[['Tencent']].describe()

Unnamed: 0,Tencent
count,10.0
mean,0.417069
std,0.3482
min,0.004719
25%,0.146383
50%,0.308516
75%,0.732598
max,0.980921


In [75]:
# pass a list of names to select several columns at once
df[['Tencent', 'Baidu']]

Unnamed: 0,Tencent,Baidu
2018-01-01,0.543405,0.424518
2018-01-02,0.004719,0.670749
2018-01-03,0.136707,0.891322
2018-01-04,0.185328,0.219697
2018-01-05,0.811683,0.816225
2018-01-06,0.431704,0.817649
2018-01-07,0.17541,0.005689
2018-01-08,0.795663,0.598843
2018-01-09,0.105148,0.036476
2018-01-10,0.980921,0.890546


In [76]:
# create a new column as the row-by-row sum of two existing columns
df['sum_Tencent&Baidu'] = df['Tencent'] + df['Baidu']
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD,sum_Tencent&Baidu
2018-01-01,0.543405,0.278369,0.424518,0.844776,0.967923
2018-01-02,0.004719,0.121569,0.670749,0.825853,0.675468
2018-01-03,0.136707,0.575093,0.891322,0.209202,1.028029
2018-01-04,0.185328,0.108377,0.219697,0.978624,0.405026
2018-01-05,0.811683,0.171941,0.816225,0.274074,1.627908
2018-01-06,0.431704,0.94003,0.817649,0.336112,1.249354
2018-01-07,0.17541,0.372832,0.005689,0.252426,0.181099
2018-01-08,0.795663,0.015255,0.598843,0.603805,1.394506
2018-01-09,0.105148,0.381943,0.036476,0.890412,0.141624
2018-01-10,0.980921,0.059942,0.890546,0.576901,1.871467


In [77]:
# axis=0 drops a row by its index label; this only displays the result, df is not modified
df.drop('2018-01-01', axis=0)

Unnamed: 0,Tencent,Alibaba,Baidu,JD,sum_Tencent&Baidu
2018-01-02,0.004719,0.121569,0.670749,0.825853,0.675468
2018-01-03,0.136707,0.575093,0.891322,0.209202,1.028029
2018-01-04,0.185328,0.108377,0.219697,0.978624,0.405026
2018-01-05,0.811683,0.171941,0.816225,0.274074,1.627908
2018-01-06,0.431704,0.94003,0.817649,0.336112,1.249354
2018-01-07,0.17541,0.372832,0.005689,0.252426,0.181099
2018-01-08,0.795663,0.015255,0.598843,0.603805,1.394506
2018-01-09,0.105148,0.381943,0.036476,0.890412,0.141624
2018-01-10,0.980921,0.059942,0.890546,0.576901,1.871467


In [78]:
# axis=1 drops a column by its name; this only displays the result, df is not modified
df.drop('sum_Tencent&Baidu', axis=1)

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [79]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD,sum_Tencent&Baidu
2018-01-01,0.543405,0.278369,0.424518,0.844776,0.967923
2018-01-02,0.004719,0.121569,0.670749,0.825853,0.675468
2018-01-03,0.136707,0.575093,0.891322,0.209202,1.028029
2018-01-04,0.185328,0.108377,0.219697,0.978624,0.405026
2018-01-05,0.811683,0.171941,0.816225,0.274074,1.627908
2018-01-06,0.431704,0.94003,0.817649,0.336112,1.249354
2018-01-07,0.17541,0.372832,0.005689,0.252426,0.181099
2018-01-08,0.795663,0.015255,0.598843,0.603805,1.394506
2018-01-09,0.105148,0.381943,0.036476,0.890412,0.141624
2018-01-10,0.980921,0.059942,0.890546,0.576901,1.871467


In [80]:
# inplace=True removes the column from df itself instead of returning a modified copy
df.drop('sum_Tencent&Baidu',axis=1,inplace=True)    
# or alternatively reassign the result: df = df.drop(columns='sum_Tencent&Baidu')

In [81]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


### Conditional extraction
This is very similar to conditional extraction on NumPy arrays.

In [82]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [83]:
# comparing the whole frame gives a True/False table of the same shape
df>0.5

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,True,False,False,True
2018-01-02,False,False,True,True
2018-01-03,False,True,True,False
2018-01-04,False,False,False,True
2018-01-05,True,False,True,False
2018-01-06,False,True,True,False
2018-01-07,False,False,False,False
2018-01-08,True,False,True,True
2018-01-09,False,False,False,True
2018-01-10,True,False,True,True


In [84]:
# masking the whole frame keeps its shape: values that fail the test are shown as missing
df[df>0.5]

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,,,0.844776
2018-01-02,,,0.670749,0.825853
2018-01-03,,0.575093,0.891322,
2018-01-04,,,,0.978624
2018-01-05,0.811683,,0.816225,
2018-01-06,,0.94003,0.817649,
2018-01-07,,,,
2018-01-08,0.795663,,0.598843,0.603805
2018-01-09,,,,0.890412
2018-01-10,0.980921,,0.890546,0.576901


In [85]:
# a condition on one column filters rows: only rows where Tencent > 0.5 are kept
df[df['Tencent']>0.5]

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [86]:
# combine two conditions with & (and); each condition is wrapped in its own parentheses
df[(df['Tencent']>0.5) & (df['Baidu'] < 0.5)]

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776


### Generate Indicator Variable
We want to generate a binary variable that equals 1 if Tencent is higher than 0.5, and 0 if Tencent is equal to or lower than 0.5.

In [87]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD
2018-01-01,0.543405,0.278369,0.424518,0.844776
2018-01-02,0.004719,0.121569,0.670749,0.825853
2018-01-03,0.136707,0.575093,0.891322,0.209202
2018-01-04,0.185328,0.108377,0.219697,0.978624
2018-01-05,0.811683,0.171941,0.816225,0.274074
2018-01-06,0.431704,0.94003,0.817649,0.336112
2018-01-07,0.17541,0.372832,0.005689,0.252426
2018-01-08,0.795663,0.015255,0.598843,0.603805
2018-01-09,0.105148,0.381943,0.036476,0.890412
2018-01-10,0.980921,0.059942,0.890546,0.576901


In [88]:
# step 1: create the indicator column with 0 in every row
df['Tencent_higher_than_half'] = 0 
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD,Tencent_higher_than_half
2018-01-01,0.543405,0.278369,0.424518,0.844776,0
2018-01-02,0.004719,0.121569,0.670749,0.825853,0
2018-01-03,0.136707,0.575093,0.891322,0.209202,0
2018-01-04,0.185328,0.108377,0.219697,0.978624,0
2018-01-05,0.811683,0.171941,0.816225,0.274074,0
2018-01-06,0.431704,0.94003,0.817649,0.336112,0
2018-01-07,0.17541,0.372832,0.005689,0.252426,0
2018-01-08,0.795663,0.015255,0.598843,0.603805,0
2018-01-09,0.105148,0.381943,0.036476,0.890412,0
2018-01-10,0.980921,0.059942,0.890546,0.576901,0


In [89]:
# step 2: boolean Series, True on the dates where Tencent is above 0.5
condition = df['Tencent'] > 0.5
condition

2018-01-01     True
2018-01-02    False
2018-01-03    False
2018-01-04    False
2018-01-05     True
2018-01-06    False
2018-01-07    False
2018-01-08     True
2018-01-09    False
2018-01-10     True
Freq: D, Name: Tencent, dtype: bool

In [90]:
# df.loc[condition, "variable name"] = alpha selects, in the column "variable name",
# the rows where the condition is True and assigns the new value alpha to those rows only.

df.loc[condition, 'Tencent_higher_than_half'] = 1 

In [91]:
df

Unnamed: 0,Tencent,Alibaba,Baidu,JD,Tencent_higher_than_half
2018-01-01,0.543405,0.278369,0.424518,0.844776,1
2018-01-02,0.004719,0.121569,0.670749,0.825853,0
2018-01-03,0.136707,0.575093,0.891322,0.209202,0
2018-01-04,0.185328,0.108377,0.219697,0.978624,0
2018-01-05,0.811683,0.171941,0.816225,0.274074,1
2018-01-06,0.431704,0.94003,0.817649,0.336112,0
2018-01-07,0.17541,0.372832,0.005689,0.252426,0
2018-01-08,0.795663,0.015255,0.598843,0.603805,1
2018-01-09,0.105148,0.381943,0.036476,0.890412,0
2018-01-10,0.980921,0.059942,0.890546,0.576901,1
