## This Notebook is used to understand Pandas module to be used alongwith slides of Dr. Naveen Aggarwal

In [2]:
import numpy as np
import pandas as pd

### Creating a Series by passing a list of values, letting pandas create a default integer index

In [3]:
s = pd.Series([1,3,5,np.nan,6,8])
print(s)
dates = pd.date_range('20130101',periods=6)
dates

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### Creating Dataframe using dictionary

In [6]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.710686,-0.107537,0.970582,0.252704
2013-01-02,-0.393422,0.561274,1.23825,2.438959
2013-01-03,-2.475899,-1.57309,1.606383,-1.04022
2013-01-04,0.920399,0.568459,-0.216855,0.847831
2013-01-05,0.103886,0.711909,-0.190667,-0.035993
2013-01-06,0.737788,-1.764198,-0.680107,-0.371813


### Creating simple np array

In [4]:
# A structured array
# Structured arrays allow users to manipulate the data by named fields: 
#in the example below, a structured array of three tuples is created. 
#The first element of each tuple will be called ‘foo’ and will be of type int, 
#while the second element will be named ‘bar’ and will be a float.
my_array = np.ones(3, dtype=([('foo', int), ('bar', float)]))
# Print the structured array
print(my_array['foo'])

print(my_array['bar'])

[1 1 1]
[ 1.  1.  1.]


In [7]:
# A record array
my_array2 = my_array.view(np.recarray)
# Print the record array
print(my_array2.foo)
print(my_array2.bar)

[1 1 1]
[ 1.  1.  1.]


### Creating Dataframe from np array

In [3]:
data = np.array([['','Col1','Col2'],
                 ['Row1',1,2],
                ['Row2',3,4]])
                
print(pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:]))

     Col1 Col2
Row1    1    2
Row2    3    4


### Creating a DataFrame by passing a dict of objects that can be converted to series-like.


In [10]:
df2 = pd.DataFrame({ 'A' : 1.,
   ...:              'B' : pd.Timestamp('20130102'),
   ...:              'C' : pd.Series(1,index=range(4),dtype='float32'),
   ...:              'D' : np.array([3] * 4,dtype='int32'),
   ...:              'E' : 'foo' })
print(df2)
df2.dtypes

     A          B    C  D    E
0  1.0 2013-01-02  1.0  3  foo
1  1.0 2013-01-02  1.0  3  foo
2  1.0 2013-01-02  1.0  3  foo
3  1.0 2013-01-02  1.0  3  foo


A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

### Creating Dataframes from array, dictionary and series

In [8]:
# Take a 2D array as input to your DataFrame 
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print("Data Frame From Array\n", pd.DataFrame(my_2darray))

Data Frame From Array
    0  1  2
0  1  2  3
1  4  5  6


In [9]:
# Take a dictionary as input to your DataFrame 
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
print("Data Frame From Dictionary\n",pd.DataFrame(my_dict))

Data Frame From Dictionary
    1  2  3
0  1  1  2
1  3  2  4


In [10]:
# Take a DataFrame as input to your DataFrame 
my_df = pd.DataFrame(data=[4,5,6,7], index=range(0,4), columns=['A'])
print("Data Frame From another dataframe\n",pd.DataFrame(my_df))

Data Frame From another dataframe
    A
0  4
1  5
2  6
3  7


In [11]:
# Take a Series as input to your DataFrame
my_series = pd.Series({"United Kingdom":"London", "India":"New Delhi"})
print("Data Frame From Series\n",pd.DataFrame(my_series))

Data Frame From Series
                         0
India           New Delhi
United Kingdom     London


### Shape and Length Property

In [5]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

# Use the `shape` property
print(df.shape)

# Or use the `len()` function with the `index` property
print(len(df.index))

(2, 3)
2


In [7]:
list(df.columns.values)

[0, 1, 2]

### Accesing elements of the dataframe

In [14]:
df = pd.DataFrame(data=[[1,2,3],[4,5,6],[7,8,9]], columns=['A', 'B','C'])
print(df)

# Using `iloc[]`
print(df.iloc[0][0])

# Using `loc[]`
print(df.loc[0]['B'])

# Using `at[]`
print(df.at[1,'C'])

# Using `iat[]`
print(df.iat[2,2])

# Using `get_value(index, column)`
print(df.get_value(1, 'A'))

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
1
2
6
9
4


### Adding and Accesing an index to dataframe

In [12]:
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9]]), 
                  index= [2, 'A', 4], 
                  columns=[48, 49, 50])

print(df)

   48  49  50
2   1   2   3
A   4   5   6
4   7   8   9


In [13]:
# Pass `2` to `loc`
print("Accesing you loc\n",df.loc[2])

# Pass `2` to `iloc`
print("Accessing you iloc\n",df.iloc[2])

# Pass `2` to `ix`
print("Accesing you ix\n",df.ix[2])

Accesing you loc
 48    1
49    2
50    3
Name: 2, dtype: int32
Accessing you iloc
 48    7
49    8
50    9
Name: 4, dtype: int32
Accesing you ix
 48    7
49    8
50    9
Name: 4, dtype: int32


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


### Adding a row to data frame

In [14]:
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9]]), 
                  index= [2.5, 12.6, 4.8], 
                  columns=[48, 49, 50])

# There's no index labeled `2`, 
#so you will change the index at position `2`
df.ix[2] = [60, 50, 40]
print(df)

# This will make an index labeled `2` 
#and add the new values
df.loc[2] = [11, 12, 13]
print(df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8   60  50  40
2.0   11  12  13


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


### Adding a Column to data frame

In [36]:
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9]]), 
                  columns=['A', 'B', 'C'])

print(df)

# Use `.index`
df['D'] = df.index

# Print `df`
print("After Adding the Column\n", df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
After Adding the Column
    A  B  C  D
0  1  2  3  0
1  4  5  6  1
2  7  8  9  2


### Another way of adding column

In [34]:
df['E']=[11,12,13]

# Print `df`
print("After Adding another Column\n", df)

df.loc[:, 5] = pd.Series(['5', '6', '7'], index=df.index)
# Print `df`
print("After Adding another Column\n", df)


After Adding another Column
    A  B  C  D   E  5
0  1  2  3  0  11  5
1  4  5  6  1  12  6
2  7  8  9  2  13  7
After Adding another Column
    A  B  C  D   E  5
0  1  2  3  0  11  5
1  4  5  6  1  12  6
2  7  8  9  2  13  7


### Resetting the index

In [3]:
# Check out the weird index of your dataframe
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9]]), 
                  columns=['A', 'B', 'C'])

print(df)

# Use `reset_index()` to reset the values
print(df.reset_index(level=0, drop=True))

# Use `reset_index()` to reset the values
df.reset_index(level=0, inplace=True)
print(df)


   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
   index  A  B  C
0      0  1  2  3
1      1  4  5  6
2      2  7  8  9


### Deleting the Column

In [62]:
df = pd.DataFrame(data=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 
                  index= [2.5, 12.6, 4.8], 
                  columns=['A','B','C'])
                  
print(df)
# Drop the column with label 'A'                  
df.drop('A', axis=1, inplace=True)
print("After Droping the Column \n",df)
# Drop the column at position 2
print("After Dropping another Column")
print(df.drop(df.columns[1], axis=1))

      A  B  C
2.5   1  2  3
12.6  4  5  6
4.8   7  8  9
After Droping the Column 
       B  C
2.5   2  3
12.6  5  6
4.8   8  9
After Dropping another Column
      B
2.5   2
12.6  5
4.8   8


### Deleting the Row

In [74]:
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9], 
                                 [7, 8, 10]]), 
                  index= [2.5, 12.6, 4.8, 4.8], 
                  columns=[48, 49, 50])
                  

# Check out your DataFrame `df`
print(df)
print("\n")
# Drop the duplicates in `df`
df=pd.DataFrame(data=df.drop_duplicates([48], keep='last'))
print(df)

print("\nDrop the index at position 1")
print(df.drop(df.index[1]))


      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8    7   8  10


      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8  10

Drop the index at position 1
     48  49  50
2.5   1   2   3
4.8   7   8  10


### Renaming the Index or Coulmns of Dataframe

In [83]:
df = pd.DataFrame(data=np.array([[1, 2, 3], 
                                 [4, 5, 6], 
                                 [7, 8, 9], 
                                 [7, 8, 10]]), 
                  index= [2.5, 12.6, 4.8, 4.8], 
                  columns=[48, 49, 50])

print(df)
# Define the new names of your columns
newcols = {
    48: 'A', 
    49: 'B', 
    50: 'C'
}

# Use `rename()` to rename your columns
df.rename(columns=newcols, inplace=True)

print("After Renaming the Columns\n", df)

# Rename your index
df=pd.DataFrame(data=df.rename(index={2.5: 'a'}))

print("After Renaming the Index\n", df)

      48  49  50
2.5    1   2   3
12.6   4   5   6
4.8    7   8   9
4.8    7   8  10
After Renaming the Columns
       A  B   C
2.5   1  2   3
12.6  4  5   6
4.8   7  8   9
4.8   7  8  10
After Renaming the Index
       A  B   C
a     1  2   3
12.6  4  5   6
4.8   7  8   9
4.8   7  8  10


### Reading from CSV Files

In [15]:
vehicle=pd.read_csv("D:\onedrive\Python\dataset\Vehicle.csv")
print(vehicle)

     compactness  circularity  distance circularity  radius ratio  \
0             95           48                    83           178   
1             91           41                    84           141   
2            104           50                   106           209   
3             93           41                    82           159   
4             85           44                    70           205   
5            107           57                   106           172   
6             97           43                    73           173   
7             90           43                    66           157   
8             86           34                    62           140   
9             93           44                    98           197   
10            86           36                    70           143   
11            90           34                    66           136   
12            88           46                    74           171   
13            89           42     

### Displaying the top and bottom rows of the data

In [17]:
vehicle.head()

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis aspect ratio,max.length aspect ratio,scatter ratio,elongatedness,pr.axis rectangularity,max.length rectangularity,scaled variance along major axis,scaled variance along minor axis,scaled radius of gyration,skewness about major axis,skewness about minor axis,kurtosis about minor axis,kurtosis about major axis,hollows ratio,vehicle
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [14]:
vehicle.tail(4)

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis aspect ratio,max.length aspect ratio,scatter ratio,elongatedness,pr.axis rectangularity,max.length rectangularity,scaled variance along major axis,scaled variance along minor axis,scaled radius of gyration,skewness about major axis,skewness about minor axis,kurtosis about minor axis,kurtosis about major axis,hollows ratio,vehicle
842,89,46,84,163,66,11,159,43,20,159,173,368,176,72,1,20,186,197,van
843,106,54,101,222,67,12,222,30,25,173,228,721,200,70,3,4,187,201,saab
844,86,36,78,146,58,7,135,50,18,124,155,270,148,66,0,25,190,195,saab
845,85,36,66,123,55,5,120,56,17,128,140,212,131,73,1,18,186,190,van


### Displaying underlying numpy data

In [22]:
vehicle.columns

Index(['compactness', 'circularity', 'distance circularity', 'radius ratio',
       'pr.axis aspect ratio', 'max.length aspect ratio', 'scatter ratio',
       'elongatedness', 'pr.axis rectangularity', 'max.length rectangularity',
       'scaled variance along major axis', 'scaled variance along minor axis',
       'scaled radius of gyration', 'skewness about major axis',
       'skewness about minor axis', 'kurtosis about minor axis',
       'kurtosis about major axis', 'hollows ratio', 'vehicle'],
      dtype='object')

In [23]:
vehicle.values

array([[95, 48, 83, ..., 187, 197, 'van'],
       [91, 41, 84, ..., 189, 199, 'van'],
       [104, 50, 106, ..., 188, 196, 'saab'],
       ..., 
       [106, 54, 101, ..., 187, 201, 'saab'],
       [86, 36, 78, ..., 190, 195, 'saab'],
       [85, 36, 66, ..., 186, 190, 'van']], dtype=object)

### Describing the statistics of the data

In [26]:
vehicle.describe()

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis aspect ratio,max.length aspect ratio,scatter ratio,elongatedness,pr.axis rectangularity,max.length rectangularity,scaled variance along major axis,scaled variance along minor axis,scaled radius of gyration,skewness about major axis,skewness about minor axis,kurtosis about minor axis,kurtosis about major axis,hollows ratio
count,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0,846.0
mean,93.678487,44.861702,82.088652,168.940898,61.693853,8.567376,168.839243,40.933806,20.582742,147.998818,188.625296,439.911348,174.70331,72.462175,6.377069,12.599291,188.932624,195.632388
std,8.234474,6.169866,15.771533,33.472183,7.888251,4.601217,33.244978,7.81156,2.592138,14.515652,31.394837,176.692614,32.54649,7.486974,4.918353,8.93124,6.163949,7.438797
min,73.0,33.0,40.0,104.0,47.0,2.0,112.0,26.0,17.0,118.0,130.0,184.0,109.0,59.0,0.0,0.0,176.0,181.0
25%,87.0,40.0,70.0,141.0,57.0,7.0,146.25,33.0,19.0,137.0,167.0,318.25,149.0,67.0,2.0,5.0,184.0,190.25
50%,93.0,44.0,80.0,167.0,61.0,8.0,157.0,43.0,20.0,146.0,178.5,364.0,173.0,71.5,6.0,11.0,188.0,197.0
75%,100.0,49.0,98.0,195.0,65.0,10.0,198.0,46.0,23.0,159.0,217.0,587.0,198.0,75.0,9.0,19.0,193.0,201.0
max,119.0,59.0,112.0,333.0,138.0,55.0,265.0,61.0,29.0,188.0,320.0,1018.0,268.0,135.0,22.0,41.0,206.0,211.0


### Basic Operations on data

In [32]:
df.T # Transpose of data

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.710686,-0.393422,-2.475899,0.920399,0.103886,0.737788
B,-0.107537,0.561274,-1.57309,0.568459,0.711909,-1.764198
C,0.970582,1.23825,1.606383,-0.216855,-0.190667,-0.680107
D,0.252704,2.438959,-1.04022,0.847831,-0.035993,-0.371813


In [34]:
df.sort_index(axis=1, ascending=False)  #Sorting of Columns

Unnamed: 0,D,C,B,A
2013-01-01,0.252704,0.970582,-0.107537,1.710686
2013-01-02,2.438959,1.23825,0.561274,-0.393422
2013-01-03,-1.04022,1.606383,-1.57309,-2.475899
2013-01-04,0.847831,-0.216855,0.568459,0.920399
2013-01-05,-0.035993,-0.190667,0.711909,0.103886
2013-01-06,-0.371813,-0.680107,-1.764198,0.737788


In [35]:
df.sort_values(by='B') #Sorting of rows by Column

Unnamed: 0,A,B,C,D
2013-01-06,0.737788,-1.764198,-0.680107,-0.371813
2013-01-03,-2.475899,-1.57309,1.606383,-1.04022
2013-01-01,1.710686,-0.107537,0.970582,0.252704
2013-01-02,-0.393422,0.561274,1.23825,2.438959
2013-01-04,0.920399,0.568459,-0.216855,0.847831
2013-01-05,0.103886,0.711909,-0.190667,-0.035993


### Selecting the Data

#Selecting a single column, which yields a Series, equivalent to df.A
print(df['A'])
print("\n Same As Above\n",df.A)


### Selecting via [], which slices the rows.

In [47]:
vehicle[0:3]

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis aspect ratio,max.length aspect ratio,scatter ratio,elongatedness,pr.axis rectangularity,max.length rectangularity,scaled variance along major axis,scaled variance along minor axis,scaled radius of gyration,skewness about major axis,skewness about minor axis,kurtosis about minor axis,kurtosis about major axis,hollows ratio,vehicle
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab


In [48]:
vehicle[10:15]

Unnamed: 0,compactness,circularity,distance circularity,radius ratio,pr.axis aspect ratio,max.length aspect ratio,scatter ratio,elongatedness,pr.axis rectangularity,max.length rectangularity,scaled variance along major axis,scaled variance along minor axis,scaled radius of gyration,skewness about major axis,skewness about minor axis,kurtosis about minor axis,kurtosis about major axis,hollows ratio,vehicle
10,86,36,70,143,61,9,133,50,18,130,153,266,127,66,2,10,194,202,van
11,90,34,66,136,55,6,123,54,17,118,148,224,118,65,5,26,196,202,saab
12,88,46,74,171,68,6,152,43,19,148,180,349,192,71,5,11,189,195,bus
13,89,42,85,144,58,10,152,44,19,144,173,345,161,72,8,13,187,197,van
14,94,49,79,203,71,5,174,37,21,154,196,465,206,71,6,2,197,199,bus


### Selection By Label

In [49]:
#For Getting Cross Section using Labels
df.loc[dates[0]]

A    1.710686
B   -0.107537
C    0.970582
D    0.252704
Name: 2013-01-01 00:00:00, dtype: float64

In [51]:
#For Selecting multiple Columns
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,1.710686,-0.107537
2013-01-02,-0.393422,0.561274
2013-01-03,-2.475899,-1.57309
2013-01-04,0.920399,0.568459
2013-01-05,0.103886,0.711909
2013-01-06,0.737788,-1.764198


In [52]:
# Showing label slicing, both endpoints are included
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.393422,0.561274
2013-01-03,-2.475899,-1.57309
2013-01-04,0.920399,0.568459


In [53]:
#Reduction in Dimension of Returned Object
df.loc['20130102',['A','B']]

A   -0.393422
B    0.561274
Name: 2013-01-02 00:00:00, dtype: float64

In [54]:
#For Getting Scalar Values, There are two methods
print(df.loc[dates[0],'A'])
print(df.at[dates[0],'A'])

1.71068592894
1.71068592894


### Selection By Position

In [58]:
#Select via the position of the passed integers
df.iloc[3]

A    0.920399
B    0.568459
C   -0.216855
D    0.847831
Name: 2013-01-04 00:00:00, dtype: float64

In [59]:
#By integer slices, acting similar to numpy/python
vehicle.iloc[3:5,0:2]

Unnamed: 0,compactness,circularity
3,93,41
4,85,44


In [60]:
#By lists of integer position locations, similar to the numpy/python style
vehicle.iloc[[1,2,4],[0,2]]

Unnamed: 0,compactness,distance circularity
1,91,84
2,104,106
4,85,70


In [63]:
#For slicing rows explicitly
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,-0.393422,0.561274,1.23825,2.438959
2013-01-03,-2.475899,-1.57309,1.606383,-1.04022


In [64]:
#For slicing cols explicitly
df.iloc[:,1:4]

Unnamed: 0,B,C,D
2013-01-01,-0.107537,0.970582,0.252704
2013-01-02,0.561274,1.23825,2.438959
2013-01-03,-1.57309,1.606383,-1.04022
2013-01-04,0.568459,-0.216855,0.847831
2013-01-05,0.711909,-0.190667,-0.035993
2013-01-06,-1.764198,-0.680107,-0.371813


### Conditional Selection

In [65]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.710686,-0.107537,0.970582,0.252704
2013-01-04,0.920399,0.568459,-0.216855,0.847831
2013-01-05,0.103886,0.711909,-0.190667,-0.035993
2013-01-06,0.737788,-1.764198,-0.680107,-0.371813


In [66]:
#Selecting values from a DataFrame where a boolean condition is met.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.710686,,0.970582,0.252704
2013-01-02,,0.561274,1.23825,2.438959
2013-01-03,,,1.606383,
2013-01-04,0.920399,0.568459,,0.847831
2013-01-05,0.103886,0.711909,,
2013-01-06,0.737788,,,


In [71]:
# Using the isin method for filtering
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
print(df2)
print(df2[df2['E'].isin(['two','four'])])

                   A         B         C         D      E
2013-01-01  1.710686 -0.107537  0.970582  0.252704    one
2013-01-02 -0.393422  0.561274  1.238250  2.438959    one
2013-01-03 -2.475899 -1.573090  1.606383 -1.040220    two
2013-01-04  0.920399  0.568459 -0.216855  0.847831  three
2013-01-05  0.103886  0.711909 -0.190667 -0.035993   four
2013-01-06  0.737788 -1.764198 -0.680107 -0.371813  three
                   A         B         C         D     E
2013-01-03 -2.475899 -1.573090  1.606383 -1.040220   two
2013-01-05  0.103886  0.711909 -0.190667 -0.035993  four
