## Putting Some Pandas In Your Python

Install pandas with: <code>pip3 install pandas</code>

<img src = '1.jpg'>

In [4]:
import pandas as pd
import numpy as np

# Small demo frame with three columns of four entries each.
# The third entry of col-3 is deliberately missing (np.nan).
df = pd.DataFrame(
    data={
        'col-1': ['Item-1', 'Item-2', 'Item-3', 'Item-4'],
        'col-2': ['Gold', 'Bronze', 'Gold', 'Silver'],
        'col-3': [1, 2, np.nan, 4],
    },
)

In [5]:
# Print the frame built above; the np.nan placed in col-3 shows up as a missing value.
print(df)

Unnamed: 0,col-1,col-2,col-3
0,Item-1,Gold,1.0
1,Item-2,Bronze,2.0
2,Item-3,Gold,
3,Item-4,Silver,4.0


## Creating a DataFrame from Dictionary

In [6]:
# A dictionary of equal-length lists maps directly onto a frame:
# each key becomes a column name and the list items become the rows.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df = pd.DataFrame(data=data)
print(df)

Unnamed: 0,Name,Age
0,Tom,28
1,Jack,34
2,Steve,29
3,Ricky,42


In [9]:
# Build a frame with explicit row labels: the `index` argument supplies
# one label per row instead of the default integer positions.

data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df = pd.DataFrame(data=data, index=['I-1', 'I-2', 'I-3', 'I-4'])
print(df)

Unnamed: 0,Name,Age
I-1,Tom,28
I-2,Jack,34
I-3,Steve,29
I-4,Ricky,42


## DataFrame Basic Functionality

In [10]:
import pandas as pd
import numpy as np

# Create a dictionary of Series: each key becomes a column of the frame.
# The mapping is deliberately NOT named `dict` -- that would shadow the
# builtin `dict` type for every cell executed afterwards.
people = {
    'Name': pd.Series(['Tom', 'Jack', 'Steve', 'Ricky', 'Vin', 'James', 'Smith']),
    'Age': pd.Series([25, 26, 25, 35, 23, 33, 31]),
    # The integer 5 is stored alongside floats, so the column is float64.
    'Rating': pd.Series([4.23, 4.1, 3.4, 5, 2.9, 4.7, 3.1]),
}

df = pd.DataFrame(people)
print(df)

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,Jack,26,4.1
2,Steve,25,3.4
3,Ricky,35,5.0
4,Vin,23,2.9
5,James,33,4.7
6,Smith,31,3.1


In [11]:
# T -> transpose of the DataFrame: the row labels (0..6) become the column
# headers and the original columns (Name, Age, Rating) become the rows.

print(df.T)

           0     1      2      3    4      5      6
Name     Tom  Jack  Steve  Ricky  Vin  James  Smith
Age       25    26     25     35   23     33     31
Rating  4.23   4.1    3.4      5  2.9    4.7    3.1


In [13]:
# axes -> list with the row-axis labels first, followed by the column-axis labels

print(df.axes)

[RangeIndex(start=0, stop=7, step=1), Index(['Name', 'Age', 'Rating'], dtype='object')]


In [14]:
# dtypes -> the data type of each column, one entry per column label

print(df.dtypes)

Name       object
Age         int64
Rating    float64
dtype: object


In [15]:
# shape -> tuple giving the dimensionality as (number of rows, number of columns)

print(df.shape)

(7, 3)


In [16]:
# values -> the underlying data as a NumPy ndarray, one inner array per row,
# without the index or the column labels

print(df.values)

[['Tom' 25 4.23]
 ['Jack' 26 4.1]
 ['Steve' 25 3.4]
 ['Ricky' 35 5.0]
 ['Vin' 23 2.9]
 ['James' 33 4.7]
 ['Smith' 31 3.1]]


In [18]:
# head(n) -> returns the first n rows; called with no argument, n defaults to 5

print(df.head())

# Separator line so the two printed frames are easy to tell apart
print('*'*50)

print(df.head(2))

    Name  Age  Rating
0    Tom   25    4.23
1   Jack   26    4.10
2  Steve   25    3.40
3  Ricky   35    5.00
4    Vin   23    2.90
**************************************************
   Name  Age  Rating
0   Tom   25    4.23
1  Jack   26    4.10


In [19]:
# tail(n) -> returns the last n rows; called with no argument, n defaults to 5

print(df.tail())

# Separator line so the two printed frames are easy to tell apart
print('*'*50)

print(df.tail(2))

    Name  Age  Rating
2  Steve   25     3.4
3  Ricky   35     5.0
4    Vin   23     2.9
5  James   33     4.7
6  Smith   31     3.1
**************************************************
    Name  Age  Rating
5  James   33     4.7
6  Smith   31     3.1


## Statistics

In [20]:
# sum() -> sum of the values along the requested axis; the default axis=0
# adds down each column. For the string column Name, "adding" means the
# names are concatenated into a single string.

print(df.sum())

Name      TomJackSteveRickyVinJamesSmith
Age                                  198
Rating                             27.43
dtype: object


In [24]:
# axis=1 -> row-wise sum (one total per row).
# numeric_only=True keeps the text column Name out of the addition, so each
# total is Age + Rating. Without it, pandas 2.0 and later raise TypeError when
# a string is added to a number; older versions silently dropped the column.

print(df.sum(axis=1, numeric_only=True))

0    29.23
1    30.10
2    28.40
3    40.00
4    25.90
5    37.70
6    34.10
dtype: float64


In [25]:
# mean() -> arithmetic mean of each column.
# numeric_only=True limits the calculation to Age and Rating; since pandas 2.0
# the text column Name is no longer skipped automatically and would raise TypeError.

print(df.mean(numeric_only=True))

Age       28.285714
Rating     3.918571
dtype: float64


In [26]:
# std() -> standard deviation of each column.
# numeric_only=True limits the calculation to Age and Rating; since pandas 2.0
# the text column Name is no longer skipped automatically and would raise TypeError.

print(df.std(numeric_only=True))

Age       4.644505
Rating    0.804828
dtype: float64


In [27]:
# describe() -> summary statistics per column (count, mean, std, min,
# quartiles, max). With no arguments only the numeric columns of this frame
# (Age, Rating) are summarised.

print(df.describe())

             Age    Rating
count   7.000000  7.000000
mean   28.285714  3.918571
std     4.644505  0.804828
min    23.000000  2.900000
25%    25.000000  3.250000
50%    26.000000  4.100000
75%    32.000000  4.465000
max    35.000000  5.000000


In [30]:
# `include` chooses which column types are summarised: a list such as
# ['object'] or ['number'], or the string 'all'.
# ['object'] restricts the summary to the string column Name
# (count, unique, top, freq).

print(df.describe(include=['object']))

         Name
count       7
unique      7
top     Smith
freq        1


In [33]:
# include=['number'] -> summarise only the numeric columns (Age, Rating)
print(df.describe(include=['number']))

             Age    Rating
count   7.000000  7.000000
mean   28.285714  3.918571
std     4.644505  0.804828
min    23.000000  2.900000
25%    25.000000  3.250000
50%    26.000000  4.100000
75%    32.000000  4.465000
max    35.000000  5.000000


In [32]:
# include='all' summarises every column. Pass 'all' as a plain string,
# not wrapped in a list. Statistics that do not apply to a column show as NaN.

print(df.describe(include='all'))

         Name        Age    Rating
count       7   7.000000  7.000000
unique      7        NaN       NaN
top     Smith        NaN       NaN
freq        1        NaN       NaN
mean      NaN  28.285714  3.918571
std       NaN   4.644505  0.804828
min       NaN  23.000000  2.900000
25%       NaN  25.000000  3.250000
50%       NaN  26.000000  4.100000
75%       NaN  32.000000  4.465000
max       NaN  35.000000  5.000000


## Working with .csv

In [2]:
import pandas as pd

# Load 'Iris.csv' (a relative path, so it is resolved against the current
# working directory) and preview the first rows.
# NOTE: this rebinds `df`, replacing the Name/Age/Rating frame used above.
df = pd.read_csv('Iris.csv')

print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [3]:
# Last rows of the Iris frame. As the final expression of the cell, the
# result is shown by the notebook itself rather than through print().
df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [6]:
# (number of rows, number of columns) of the Iris frame
print(df.shape)

(150, 6)


In [8]:
# Column labels of the Iris frame
print(df.columns)

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [10]:
# Mean of each numeric column. numeric_only=True is required because the
# frame also holds the text column Species, which pandas 2.0 and later no
# longer skip automatically (the call would raise TypeError).
print(df.mean(numeric_only=True))

Id               75.500000
SepalLengthCm     5.843333
SepalWidthCm      3.054000
PetalLengthCm     3.758667
PetalWidthCm      1.198667
dtype: float64


In [11]:
# Standard deviation of each numeric column. numeric_only=True is required
# because the frame also holds the text column Species, which pandas 2.0 and
# later no longer skip automatically (the call would raise TypeError).
print(df.std(numeric_only=True))

Id               43.445368
SepalLengthCm     0.828066
SepalWidthCm      0.433594
PetalLengthCm     1.764420
PetalWidthCm      0.763161
dtype: float64


In [9]:
# Summary statistics for the numeric columns of the Iris frame; the text
# column Species is not part of the default summary.
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5
