# Pandas in Python

Pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series. It is free software released under the three-clause BSD license.

In [1]:
import numpy as np
import pandas as pd

### Creating DataFrames

#### From Lists

In [8]:
# Each inner list is one row. No labels are supplied, so pandas numbers
# both the rows and the columns from 0.
df = pd.DataFrame(data=[
    ['Bob', 55],
    ['Vien', 52],
])

In [20]:
# Confirm that the constructor returned a pandas DataFrame object.
type(df)

pandas.core.frame.DataFrame

In [9]:
# print() shows the plain-text form of the frame; rows and columns carry the
# default integer labels because none were supplied when it was built.
print(df)

      0   1
0   Bob  55
1  Vien  52


#### From NumPy arrays

In [13]:
# NumPy (`np`) is imported together with pandas in the notebook's first cell.
# Build a 10x10 grid holding 0..99 in row-major order, then wrap it in a
# DataFrame; rows and columns get default integer labels 0..9.
arr = np.arange(100).reshape(10, 10)
df = pd.DataFrame(arr)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [15]:
# Relative path: Book1.csv is looked up in the notebook's working directory.
# With no extra arguments, the first line of the file supplies the column names.
df = pd.read_csv('Book1.csv')

In [16]:
# Leaving the frame as the cell's last expression displays it as a table.
df

Unnamed: 0,Revenue,profit
0,3000,6000
1,4000,10000
2,5000,15000


## Indexing in DataFrame

In [17]:
# A single column label inside [] selects that column as a Series.
df['Revenue']

0    3000
1    4000
2    5000
Name: Revenue, dtype: int64

In [18]:
# A list of labels inside [] selects several columns and yields a DataFrame.
df[['Revenue','profit']]

Unnamed: 0,Revenue,profit
0,3000,6000
1,4000,10000
2,5000,15000


### Methods and Attributes in DataFrame

Max and Min

In [19]:
# Largest value in the Revenue column.
df['Revenue'].max()

5000

In [22]:
# Smallest value in the Revenue column.
df['Revenue'].min()

3000

Describing the DataFrame

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,Revenue,profit
count,3.0,3.0
mean,4000.0,10333.333333
std,1000.0,4509.249753
min,3000.0,6000.0
25%,3500.0,8000.0
50%,4000.0,10000.0
75%,4500.0,12500.0
max,5000.0,15000.0


In [24]:
# .columns holds the column labels as a pandas Index.
print(df.columns)

Index(['Revenue', 'profit'], dtype='object')


In [25]:
# .shape is a (rows, columns) tuple.
df.shape

(3, 2)

In [26]:
# .size is the total number of cells, i.e. rows * columns.
df.size

6

In [27]:
# .values exposes the frame's data as a NumPy array, without the row/column labels.
df.values

array([[ 3000,  6000],
       [ 4000, 10000],
       [ 5000, 15000]], dtype=int64)

#### Slicing a DataFrame using a comparison operator

In [15]:
# df['Revenue']>3000 builds a boolean Series; indexing the frame with it
# keeps only the rows where the condition is True.
df[df['Revenue']>3000]

Unnamed: 0,Revenue,profit
1,4000,10000
2,5000,15000


### Converting DataFrame to NumPy ndarray

In [16]:
# DataFrame.as_matrix() was deprecated (it emits a FutureWarning) and removed
# in pandas 1.0; to_numpy() is the supported way to get the data as an ndarray.
df.to_numpy()

  """Entry point for launching an IPython kernel.


array([[ 3000,  6000],
       [ 4000, 10000],
       [ 5000, 15000]], dtype=int64)

## Importing a CSV file in a customized format

In [32]:
# Default read: the first line of iris.csv is taken as the column header.
iris_data = pd.read_csv('iris.csv')

In [33]:
# The file's first line is not a real header, so the column names shown here
# (150, 4, setosa, ...) are not meaningful labels.
# NOTE(review): presumably that line holds row count, feature count and class
# names — verify against the file.
print(iris_data.head())

   150    4  setosa  versicolor  virginica
0  5.1  3.5     1.4         0.2          0
1  4.9  3.0     1.4         0.2          0
2  4.7  3.2     1.3         0.2          0
3  4.6  3.1     1.5         0.2          0
4  5.0  3.6     1.4         0.2          0


In [29]:
# Re-read the file with explicit labels:
#   skiprows=1 drops the file's first line instead of using it as a header,
#   usecols keeps only the first four columns,
#   names supplies the labels for those four columns.
# NOTE(review): 'Spepal-Width' and 'Petal-Widht' look like misspellings of
# 'Sepal-Width' / 'Petal-Width'; kept as-is because later cells may refer to
# these labels — confirm before renaming.
iris_data = pd.read_csv(
    'iris.csv',
    names=['Sepal-Length', 'Spepal-Width', 'Petal-Length', 'Petal-Widht'],
    usecols=[0, 1, 2, 3],
    skiprows=1,
)

In [31]:
# Preview the first five rows to check the custom column names.
print(iris_data.head())

   Sepal-Length  Spepal-Width  Petal-Length  Petal-Widht
0           5.1           3.5           1.4          0.2
1           4.9           3.0           1.4          0.2
2           4.7           3.2           1.3          0.2
3           4.6           3.1           1.5          0.2
4           5.0           3.6           1.4          0.2
