# Pandas

In [11]:
import pandas as pd
import numpy as np

In [12]:
from numpy.random import randn

# Fix the legacy global NumPy seed so the random frame below is reproducible.
SEED = 101
np.random.seed(SEED)

In [13]:
# 5x4 frame of standard-normal draws: rows labelled A-E, columns labelled W-Z.
row_labels = list('ABCDE')
column_labels = list('WXYZ')
df = pd.DataFrame(randn(5, 4), index=row_labels, columns=column_labels)

In [14]:
# Show the whole frame (5 rows x 4 columns).
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [15]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


In [16]:
# Print the index range, each column's non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       5 non-null      float64
 1   X       5 non-null      float64
 2   Y       5 non-null      float64
 3   Z       5 non-null      float64
dtypes: float64(4)
memory usage: 200.0+ bytes


# Selection and Indexing

In [17]:
# Bracket indexing with a single column label returns that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [18]:
# Removing a column: drop() hands back a new frame without Z.
df.drop(columns='Z')

Unnamed: 0,W,X,Y
A,2.70685,0.628133,0.907969
B,0.651118,-0.319318,-0.848077
C,-2.018168,0.740122,0.528813
D,0.188695,-0.758872,-0.933237
E,0.190794,1.978757,2.605967


In [9]:
# drop() is not in place unless you ask for it: df still has column Z here.
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [11]:
# Remove column Z from df itself: with inplace=True the frame is mutated and
# the call returns None.
# errors='ignore' keeps this cell re-runnable; without it a second run raises
# KeyError because Z has already been removed.
df.drop(columns='Z', inplace=True, errors='ignore')

In [12]:
# Column Z is gone now that the drop was applied in place.
df

Unnamed: 0,W,X,Y
A,0.302665,1.693723,-1.706086
B,-0.134841,0.390528,0.166905
C,0.807706,0.07296,0.638787
D,-0.497104,-0.75407,-0.943406
E,-0.116773,1.901755,0.238127


In [13]:
# Selecting a row by its index label
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Name: A, dtype: float64

In [14]:
# Selecting a row by integer position (position 2 is the row labelled C)
df.iloc[2]

W    0.807706
X    0.072960
Y    0.638787
Name: C, dtype: float64

In [20]:
# Build a small sales table from one list per column.
companies = ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB']
people = ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah']
sales = [200, 120, 340, 124, 243, 350]
data = pd.DataFrame({'Company': companies, 'Person': people, 'Sales': sales})

In [21]:
# Display the sales table.
data

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


## Basic Functions

In [9]:

# Small 2x3 integer frame used only to demonstrate the basic inspection helpers.
# It has its own name so that `df` (the W/X/Y frame that the CSV cells write out)
# is not overwritten when the notebook is run top to bottom.
matrix_df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

print("\nDataFrame:")
print(matrix_df)

# shape is a (rows, columns) tuple
print("\nShape:",matrix_df.shape)

# the length of the index is the number of rows
print("\nNumber of indices:",len(matrix_df.index))

# count() reports the non-null entries of the column labelled 0
print("\nNumber of elements in 1st column of dataframe:",matrix_df[0].count())

# columns.values exposes the column labels as a NumPy array
print("\nColumn Names:",matrix_df.columns.values)


DataFrame:
   0  1  2
0  1  2  3
1  4  5  6

Shape: (2, 3)

Number of indices: 2

Number of elements in 1st column of dataframe: 2

Column Names: [0 1 2]


# Writing a DataFrame to a CSV file

In [28]:
# Persist the frame as tab-separated text (note the file is still named .csv).
# index=False leaves the row labels out of the file, so they are not restored
# when the file is read back.
df.to_csv(
    'myDataFrame.csv',
    index=False,
    sep='\t',
)

In [29]:
# pandas ships a whole family of pd.read_* readers; skim the list so you know what exists.
# Here: read the tab-separated file back into a new frame and display it.
new_df = pd.read_csv('myDataFrame.csv', sep='\t')
new_df

Unnamed: 0,W,X,Y
0,0.302665,1.693723,-1.706086
1,-0.134841,0.390528,0.166905
2,0.807706,0.07296,0.638787
3,-0.497104,-0.75407,-0.943406
4,-0.116773,1.901755,0.238127
