# Dropping Entries

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 4x4 demo frame holding 0..15 row-wise, with state names as row labels
# and spelled-out numbers as column labels.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [4]:
 # drop() hands back a new frame without the listed row labels.
 data.drop(index=['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [5]:
# Same operation along the other axis: remove a column by label.
data.drop(columns=['two'])

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [13]:
# A boolean list passed to iloc acts as a positional mask: here it keeps
# columns 0, 1 and 3 and leaves out the third column.
data.iloc[:,[True, True, False, True]]

Unnamed: 0,one,two,four
Ohio,0,1,3
Colorado,4,5,7
Utah,8,9,11
New York,12,13,15


In [16]:
# Integer lists select by position, in the order given:
# rows 3 then 2, columns 3 then 1.
data.iloc[[3,2],[3,1]]

Unnamed: 0,four,two
New York,15,13
Utah,11,9


# Sorting

In [17]:
 # 2x4 frame whose row and column labels are deliberately out of order,
 # so the sort_index / sort_values calls have something to reorder.
 frame = pd.DataFrame(
     np.arange(8).reshape(2, 4),
     index=['three', 'one'],
     columns=['d', 'a', 'b', 'c'],
 )

In [18]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [20]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [23]:
frame.sort_index(axis='columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [25]:
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [36]:
frame.sort_values(by='c')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [37]:
# Small two-column frame; no index is given, so rows get the default 0..3 labels.
frame1 = pd.DataFrame({
    'b': [4, 7, -3, 2],
    'a': [0, 1, 0, 1],
})

In [38]:
frame1

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


# Axis Indexes with Duplicate Labels

In [45]:
# 4x3 frame of standard-normal draws whose row labels repeat ('a' and 'b' twice each).
# The generator is seeded so a fresh Restart & Run All rebuilds the same frame;
# only the duplicated index labels matter for this section, not the values.
df = pd.DataFrame(np.random.default_rng(0).standard_normal((4, 3)),
                  index=['a', 'a', 'b', 'b'])

In [41]:
df

Unnamed: 0,0,1,2
a,2.020057,2.149966,0.51012
a,1.185806,0.157061,1.000257
b,-0.142131,-0.527118,-1.186606
b,0.675529,0.16305,1.13979


In [44]:
df.index.is_unique

False

# Summarizing and Computing Descriptive Statistics

In [51]:
 # 5x3 float frame with missing values scattered through columns 'one' and 'two';
 # column 'three' is fully populated.
 df = pd.DataFrame(
     [
         [1.4,    np.nan, 2.3],
         [7.1,    -4.5,   5],
         [np.nan, np.nan, 4.5],
         [0.75,   -1.3,   1.2],
         [1.2,    np.nan, 2.2],
     ],
     index=['a', 'b', 'c', 'd', 'e'],
     columns=['one', 'two', 'three'],
 )

In [52]:
df

Unnamed: 0,one,two,three
a,1.4,,2.3
b,7.1,-4.5,5.0
c,,,4.5
d,0.75,-1.3,1.2
e,1.2,,2.2


In [53]:
df.sum()

one      10.45
two      -5.80
three    15.20
dtype: float64

In [54]:
df.sum(axis=1)

a    3.70
b    7.60
c    4.50
d    0.65
e    3.40
dtype: float64

In [55]:
df.mean(axis='columns', skipna=False)

a         NaN
b    2.533333
c         NaN
d    0.216667
e         NaN
dtype: float64

In [56]:
df.mean(axis='columns', skipna=True)

a    1.850000
b    2.533333
c    4.500000
d    0.216667
e    1.700000
dtype: float64

In [58]:
df.cumsum()

Unnamed: 0,one,two,three
a,1.4,,2.3
b,8.5,-4.5,7.3
c,,,11.8
d,9.25,-5.8,13.0
e,10.45,,15.2


In [59]:
df.cumprod()

Unnamed: 0,one,two,three
a,1.4,,2.3
b,9.94,-4.5,11.5
c,,,51.75
d,7.455,5.85,62.1
e,8.946,,136.62


In [66]:
df.describe()

Unnamed: 0,one,two,three
count,4.0,2.0,5.0
mean,2.6125,-2.9,3.04
std,3.00399,2.262742,1.628803
min,0.75,-4.5,1.2
25%,1.0875,-3.7,2.2
50%,1.3,-2.9,2.3
75%,2.825,-2.1,4.5
max,7.1,-1.3,5.0


In [65]:
# The describe() summary again, rounded to two decimals for a more compact table.
x = df.describe().round(2)
x

Unnamed: 0,one,two,three
count,4.0,2.0,5.0
mean,2.61,-2.9,3.04
std,3.0,2.26,1.63
min,0.75,-4.5,1.2
25%,1.09,-3.7,2.2
50%,1.3,-2.9,2.3
75%,2.82,-2.1,4.5
max,7.1,-1.3,5.0


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     4 non-null      float64
 1   two     2 non-null      float64
 2   three   5 non-null      float64
dtypes: float64(3)
memory usage: 320.0+ bytes


In [68]:
# Fully populated 5x3 float frame (no missing values), built column by column.
df1 = pd.DataFrame(
    {
        'one':   [1.4, 7.1, 2, 0.75, 1.2],
        'two':   [3, -4.5, 3.5, -1.3, 5.7],
        'three': [2.3, 5, 4.5, 1.2, 2.2],
    },
    index=['a', 'b', 'c', 'd', 'e'],
)

In [69]:
df1

Unnamed: 0,one,two,three
a,1.4,3.0,2.3
b,7.1,-4.5,5.0
c,2.0,3.5,4.5
d,0.75,-1.3,1.2
e,1.2,5.7,2.2


In [70]:
df1.count()

one      5
two      5
three    5
dtype: int64

In [71]:
df1.describe()

Unnamed: 0,one,two,three
count,5.0,5.0,5.0
mean,2.49,1.28,3.04
std,2.615913,4.107554,1.628803
min,0.75,-4.5,1.2
25%,1.2,-1.3,2.2
50%,1.4,3.0,2.3
75%,2.0,3.5,4.5
max,7.1,5.7,5.0


In [75]:
df1.min()

one      0.75
two     -4.50
three    1.20
dtype: float64

In [76]:
df1.max()

one      7.1
two      5.7
three    5.0
dtype: float64

In [77]:
# DataFrame does not define argmin, so this call raises AttributeError.
# The error is caught and printed so the demonstration stays visible without
# aborting a Restart & Run All; idxmin()/idxmax() are the DataFrame-level
# way to locate per-column extremes.
try:
    df1.argmin()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'DataFrame' object has no attribute 'argmin'

In [78]:
df1.idxmin()

one      d
two      b
three    d
dtype: object

In [79]:
df1.idxmax()

one      b
two      e
three    b
dtype: object

In [80]:
df1.quantile()

one      1.4
two      3.0
three    2.3
Name: 0.5, dtype: float64

In [81]:
df1.sum()

one      12.45
two       6.40
three    15.20
dtype: float64

In [82]:
df1.mean()

one      2.49
two      1.28
three    3.04
dtype: float64

In [83]:
df1.median()

one      1.4
two      3.0
three    2.3
dtype: float64

In [84]:
df1.mode()

Unnamed: 0,one,two,three
0,0.75,-4.5,1.2
1,1.2,-1.3,2.2
2,1.4,3.0,2.3
3,2.0,3.5,4.5
4,7.1,5.7,5.0


In [85]:
# Mean absolute deviation of each column around its own mean.
# DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so the
# statistic is spelled out explicitly; it yields the same per-column values.
(df1 - df1.mean()).abs().mean()

one      1.844
two      3.344
three    1.368
dtype: float64

In [86]:
df1.prod()

one       17.8920
two      350.1225
three    136.6200
dtype: float64

In [87]:
df1.std()

one      2.615913
two      4.107554
three    1.628803
dtype: float64

In [88]:
df1.skew()

one      2.074985
two     -0.648687
three    0.329213
dtype: float64

In [89]:
df1.kurt()

one      4.414314
two     -1.083342
three   -2.394750
dtype: float64

In [92]:
df1.cumsum()

Unnamed: 0,one,two,three
a,1.4,3.0,2.3
b,8.5,-1.5,7.3
c,10.5,2.0,11.8
d,11.25,0.7,13.0
e,12.45,6.4,15.2


In [93]:
df1.cummin()

Unnamed: 0,one,two,three
a,1.4,3.0,2.3
b,1.4,-4.5,2.3
c,1.4,-4.5,2.3
d,0.75,-4.5,1.2
e,0.75,-4.5,1.2


In [94]:
df1.cummax()

Unnamed: 0,one,two,three
a,1.4,3.0,2.3
b,7.1,3.0,5.0
c,7.1,3.5,5.0
d,7.1,3.5,5.0
e,7.1,5.7,5.0


In [95]:
df1.cumprod()

Unnamed: 0,one,two,three
a,1.4,3.0,2.3
b,9.94,-13.5,11.5
c,19.88,-47.25,51.75
d,14.91,61.425,62.1
e,17.892,350.1225,136.62


In [96]:
df1.diff()

Unnamed: 0,one,two,three
a,,,
b,5.7,-7.5,2.7
c,-5.1,8.0,-0.5
d,-1.25,-4.8,-3.3
e,0.45,7.0,1.0


In [97]:
df1.pct_change()

Unnamed: 0,one,two,three
a,,,
b,4.071429,-2.5,1.173913
c,-0.71831,-1.777778,-0.1
d,-0.625,-1.371429,-0.733333
e,0.6,-5.384615,0.833333


In [98]:
import pandas_datareader.data as web

In [107]:
# Download daily quotes for each ticker, keyed by ticker symbol.
# The date window is pinned to the range visible in the recorded output; without
# start/end the reader picks a window relative to the day it is run, so prices,
# returns, and the correlation/covariance figures would change on every re-run.
# NOTE(review): the Yahoo endpoint behind get_data_yahoo has been unreliable —
# confirm it still responds before depending on a fresh download.
all_data = {ticker: web.get_data_yahoo(ticker, start='2016-01-26', end='2021-01-25')
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [109]:
all_data

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2016-01-26   25.219999   24.517500   24.982500   24.997499  300308000.0   
 2016-01-27   24.157499   23.334999   24.010000   23.355000  533478800.0   
 2016-01-28   23.629999   23.097500   23.447500   23.522499  222715200.0   
 2016-01-29   24.334999   23.587500   23.697500   24.334999  257666000.0   
 2016-02-01   24.177500   23.850000   24.117500   24.107500  163774000.0   
 ...                ...         ...         ...         ...          ...   
 2021-01-19  128.710007  126.940002  127.779999  127.830002   90757300.0   
 2021-01-20  132.490005  128.550003  128.660004  132.029999  104319500.0   
 2021-01-21  139.669998  133.589996  133.800003  136.869995  120529500.0   
 2021-01-22  139.850006  135.020004  136.279999  139.070007  113907200.0   
 2021-01-25  145.080002  142.939804  143.070007  143.960007   52406368.0   
 
  

In [111]:
# Gather each ticker's 'Adj Close' series into one frame, one column per ticker.
# The loop variable is called `quotes` so it does not read like the `data`
# frame built at the top of the notebook.
price = pd.DataFrame(
    {symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()}
)
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-26,23.159468,98.612000,47.606789,713.039978
2016-01-27,21.637739,97.300804,46.739880,699.989990
2016-01-28,21.792925,98.314354,47.506405,730.960022
2016-01-29,22.545681,100.381676,50.271385,742.950012
2016-02-01,22.334911,100.413864,49.924614,752.000000
...,...,...,...,...
2021-01-19,127.830002,129.020004,216.440002,1790.859985
2021-01-20,132.029999,130.080002,224.339996,1886.900024
2021-01-21,136.869995,131.649994,224.970001,1891.250000
2021-01-22,139.070007,118.610001,225.949997,1901.050049


In [112]:
# Gather each ticker's 'Volume' series into one frame, one column per ticker.
volume = pd.DataFrame(
    {symbol: quotes['Volume'] for symbol, quotes in all_data.items()}
)

In [113]:
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-26,300308000.0,4617800.0,28900800.0,1331700
2016-01-27,533478800.0,5026400.0,36775200.0,2194200
2016-01-28,222715200.0,3942500.0,62513800.0,2676400
2016-01-29,257666000.0,8248100.0,83611700.0,3474300
2016-02-01,163774000.0,3574900.0,44208500.0,5139200
...,...,...,...,...
2021-01-19,90757300.0,5398000.0,30480900.0,1734600
2021-01-20,104319500.0,5598700.0,37777300.0,2490300
2021-01-21,120529500.0,12819200.0,30749600.0,2063900
2021-01-22,113907200.0,38031500.0,30124900.0,1272100


In [115]:
returns = price.pct_change()
returns

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-26,,,,
2016-01-27,-0.065707,-0.013297,-0.018210,-0.018302
2016-01-28,0.007172,0.010417,0.016400,0.044244
2016-01-29,0.034541,0.021028,0.058202,0.016403
2016-02-01,-0.009349,0.000321,-0.006898,0.012181
...,...,...,...,...
2021-01-19,0.005427,0.004907,0.017823,0.031489
2021-01-20,0.032856,0.008216,0.036500,0.053628
2021-01-21,0.036658,0.012069,0.002808,0.002305
2021-01-22,0.016074,-0.099050,0.004356,0.005182


In [116]:
returns['MSFT'].corr(returns['IBM'])

0.5410166061046487

In [117]:
returns['MSFT'].cov(returns['IBM'])

0.0001553008959670692