## Pandas Tutorial
Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [1]:
## First step is to import pandas

import pandas as pd
import numpy as np

In [14]:
## Playing with DataFrame: a 5x4 frame of the integers 0..19 with labelled rows and columns

df = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'],
    columns=['Column1', 'Column2', 'Column3', 'Column4'],
)

In [15]:
# Preview the top of the frame; all five rows appear because the frame has only five
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [16]:
## Accessing the elements: .loc selects by label, here the single row labelled 'Row1'

df.loc['Row1']

Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [17]:
## Check the type: a single row selected with .loc comes back as a Series

type(df.loc['Row1'])

pandas.core.series.Series

In [18]:
df.iloc[:,:] # positional slicing: every row and every column

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [19]:
## Take every row, and all columns from position 1 (Column2) onwards
df.iloc[:,1:]

Unnamed: 0,Column2,Column3,Column4
Row1,1,2,3
Row2,5,6,7
Row3,9,10,11
Row4,13,14,15
Row5,17,18,19


In [20]:
# Convert the selected columns of the DataFrame into a NumPy array.
# DataFrame.to_numpy() is the accessor the pandas documentation recommends over .values.
df.iloc[:, 1:].to_numpy()

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [22]:
df['Column1'].value_counts() # number of occurrences of each distinct value in Column1

0     1
4     1
8     1
12    1
16    1
Name: Column1, dtype: int64

In [24]:
df['Column1'].unique() # array of the distinct values in Column1

array([ 0,  4,  8, 12, 16])

In [25]:
# Structural summary: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row1 to Row5
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Column1  5 non-null      int32
 1   Column2  5 non-null      int32
 2   Column3  5 non-null      int32
 3   Column4  5 non-null      int32
dtypes: int32(4)
memory usage: 292.0+ bytes


In [26]:
# Summary statistics per column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,Column1,Column2,Column3,Column4
count,5.0,5.0,5.0,5.0
mean,8.0,9.0,10.0,11.0
std,6.324555,6.324555,6.324555,6.324555
min,0.0,1.0,2.0,3.0
25%,4.0,5.0,6.0,7.0
50%,8.0,9.0,10.0,11.0
75%,12.0,13.0,14.0,15.0
max,16.0,17.0,18.0,19.0


## CSV

In [27]:
from io import StringIO, BytesIO

In [28]:
# Small in-memory CSV: one header line followed by three records (no trailing newline)
data = (
    'col1,col2,col3\n'
    'x,y,1\n'
    'a,b,2\n'
    'c,d,3'
)

In [29]:
# The CSV text is a plain str, not a file
type(data)

str

In [30]:
# StringIO wraps the string in a file-like object that read_csv can parse
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [33]:
## Read only specific columns: usecols accepts a callable that is evaluated against
## each column name; the columns for which it returns True are kept
df = pd.read_csv(
    StringIO(data),
    usecols=lambda name: name.upper() in ['COL1', 'COL3'],
)
df

Unnamed: 0,col1,col3
0,x,1
1,a,2
2,c,3


In [32]:
# Write the current frame to Test.csv (relative path, so it lands in the working directory)
df.to_csv('Test.csv')

In [34]:
## Specifying column data types

# Note: the last record has only three fields, so it carries no value for column d
data = (
    'a,b,c,d\n'
    '1,2,3,4\n'
    '5,6,7,8\n'
    '9,10,11'
)

In [35]:
# print renders the \n escapes as real line breaks, showing the CSV text line by line
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [36]:
# dtype=object: values are kept as Python objects (strings here) rather than converted to numeric dtypes
df=pd.read_csv(StringIO(data),dtype=object)

In [37]:
# Display the parsed frame; the short last record leaves column d empty in row 2
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [38]:
# Row label 1, column 'a' in a single .loc lookup instead of chained df['a'][1] indexing.
# The value is the string '5' because the frame was read with dtype=object.
df.loc[1, 'a']

'5'

In [40]:
# Per-column dtypes are passed as a {column: dtype} dictionary.
# 'Int64' (capital I) is the pandas nullable integer dtype; column d is not listed.
df = pd.read_csv(
    StringIO(data),
    dtype={'b': int, 'c': float, 'a': 'Int64'},
)

In [41]:
# Display the re-parsed frame; column c now shows as floats (3.0, 7.0, 11.0)
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4.0
1,5,6,7.0,8.0
2,9,10,11.0,


In [42]:
# Row label 1, column 'a' in a single .loc lookup instead of chained df['a'][1] indexing.
# With column a read as Int64 the value is the integer 5, not a string.
df.loc[1, 'a']

5

In [43]:
## Check the resulting dtype of every column (a, b and c follow the dtype dictionary; d was not listed)
df.dtypes

a      Int64
b      int32
c    float64
d    float64
dtype: object

In [71]:
## Index columns
# The header names four columns and every record has four fields, so read_csv
# does not pick an index column itself and a default row index is used
data = (
    'index,a,b,c\n'
    '4,apple,bat,5.7\n'
    '8,orange,cow,10'
)

In [57]:
pd.read_csv(StringIO(data)) # no index_col given: rows get the default 0, 1, ... index and 'index' stays a regular column

Unnamed: 0,index,a,b,c
0,4,apple,bat,5.7
1,8,orange,cow,10.0


In [58]:
pd.read_csv(StringIO(data),index_col=0) # use column 0 ('index') as the row index

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [72]:
pd.read_csv(StringIO(data),index_col=2) # use column 2 ('b') as the row index

Unnamed: 0_level_0,index,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bat,4,apple,5.7
cow,8,orange,10.0


In [115]:
 data = ('a,b,c\n'
           '4,apple,bat, \n'
            '8,orange,cow, ') # see the extra commas in the row2 and row 3 so there are 4 columns in total
                              # all rows don't have same no of items so the first element in row 2 and row 3 
                              # are pushed in column 0 to handle this index_col = false will ensure that 
                              # no data is used under column 0 or row 0

In [87]:
pd.read_csv(StringIO(data)) # the header has 3 names (a, b, c) but every record has 4 fields, so the
                            # first field of each record (4 and 8) is used as the row index

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [116]:
# index_col=False: the first field is not used as the index; the header names are
# matched to the leading fields and the rows get the default 0, 1, ... index
pd.read_csv(StringIO(data),index_col=False)

  pd.read_csv(StringIO(data),index_col=False)


Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [117]:
## Combining usecols and index_col
# Each record ends with a trailing comma, i.e. it has one more field than the header has names
data = (
    'a,b,c\n'
    '4,apple,bat,\n'
    '8,orange,cow,'
)

In [118]:
# usecols keeps only columns b and c; index_col=False stops the first field from becoming the index
pd.read_csv(StringIO(data), usecols=['b', 'c'],index_col=False)

Unnamed: 0,b,c
0,apple,bat
1,orange,cow


In [98]:
## Quoting and escape characters. Very useful in NLP.
## The quoted field below contains commas and backslash-escaped double quotes.

data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

In [99]:
# escapechar='\\' makes the parser treat the backslash as an escape character, so the
# escaped quotes around Bob stay inside the quoted field instead of ending it
pd.read_csv(StringIO(data),escapechar='\\')

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [109]:
## URL to CSV

## Reading directly from the URL is left disabled (original note: "blocking the request"):
## url = "https://download.bls.gov/pub/time.series/cu/cu.item"

## df = pd.read_csv(url, sep="\t", engine="python")

## Instead, a local tab-separated file named cu.item is read from the working directory.
## NOTE(review): presumably a manually downloaded copy of the URL above - confirm.
df = pd.read_csv("cu.item", sep="\t")

In [110]:
# Preview the first five rows of the item table
df.head()

Unnamed: 0,item_code,item_name,display_level,selectable,sort_sequence
0,AA0,All items - old base,0,T,2
1,AA0R,Purchasing power of the consumer dollar - old ...,0,T,400
2,SA0,All items,0,T,1
3,SA0E,Energy,1,T,375
4,SA0L1,All items less food,1,T,359
