In [132]:
from sklearn import datasets
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

### Reading Data from Source

In [165]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [78]:
# Independent variables: every row, but only the first two feature columns.
# 2-D slicing syntax is [start_row:end_row, start_col:end_col].
x = iris.data[:, 0:2]
# Target values: a full 1-D slice, [start_pos:end_pos].
Y = iris.target[0:]

In [167]:
# Read the iris measurements from 'iris.csv' (path is relative to the working directory).
df = pd.read_csv('iris.csv')

### Understanding Data

In [43]:
# (number of rows, number of columns)
df.shape

(150, 5)

In [45]:
# First 2 rows
df.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa


In [66]:
# Last 3 rows
df.tail(3)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [47]:
# Column labels, returned as a pandas Index
df.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [48]:
df.columns.tolist()                       # Column labels as a plain Python list

['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']

In [63]:
# Data type of each column
df.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

In [84]:
# Only the last expression of a cell is displayed; the first two lines are evaluated and discarded.
df['petal.width']                         # Returns a Series
df['petal.width'].values                  # Returns the underlying array
df['petal.width'].values[67]              # Returns the value at position 67 of that array

1.0

In [88]:
df.iloc[148:150]                          # Rows at positions 148 and 149 - the result is a DataFrame

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [91]:
df.loc[149]                              # Row whose index label is 149 - the result is a Series, not a DataFrame
                                         # Label-based indexing - also the accessor to use when the index holds strings/labels

sepal.length          5.9
sepal.width             3
petal.length          5.1
petal.width           1.8
variety         Virginica
Name: 149, dtype: object

In [89]:
df.iloc[149]                             # Position-based indexing - the "i" stands for integer position

sepal.length          5.9
sepal.width             3
petal.length          5.1
petal.width           1.8
variety         Virginica
Name: 149, dtype: object

In [94]:
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0, where it raises AttributeError.
# Use `.loc` for label-based lookup (as here) or `.iloc` for position-based lookup instead.
df.loc[149]

sepal.length          5.9
sepal.width             3
petal.length          5.1
petal.width           1.8
variety         Virginica
Name: 149, dtype: object

In [96]:
# Create a new column in the DataFrame; the scalar 0 is assigned to every row
df['child'] = 0
df.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,child
0,5.1,3.5,1.4,0.2,Setosa,0
1,4.9,3.0,1.4,0.2,Setosa,0


In [99]:
df['variety'].value_counts()                    # Number of rows per distinct value, most frequent first

Versicolor    50
Setosa        50
Virginica     50
Name: variety, dtype: int64

In [103]:
df['sepal.length'].value_counts().head(2)       # The 2 most frequent sepal lengths (not the 2 largest)

5.0    10
6.3     9
Name: sepal.length, dtype: int64

In [112]:
# df['child'] = int('nan')            # Won't work - 'nan' (Not a Number) cannot be converted to int
# NaN is a float value, so assigning it fills the 'child' column with float NULLs.
df['child'] = np.nan

### Counting NULL Values

In [113]:
df.count()                            # Number of non-NULL values in each column

sepal.length    150
sepal.width     150
petal.length    150
petal.width     150
variety         150
child             0
dtype: int64

In [121]:
df.isnull().sum(axis=0)               # Number of NULL values in each column

sepal.length      0
sepal.width       0
petal.length      0
petal.width       0
variety           0
child           150
dtype: int64

### Summarising Data

In [138]:
# Print mean, median, mode, minimum and maximum of sepal length, one result per print call.
for stat_name in ('mean', 'median', 'mode', 'min', 'max'):
    print(getattr(df['sepal.length'], stat_name)())

5.843333333333335
5.8
0    5.0
dtype: float64
4.3
7.9


In [127]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,child
count,150.0,150.0,150.0,150.0,0.0
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,
max,7.9,4.4,6.9,2.5,


### Type Conversion

In [134]:
my_list = list(df['sepal.length'].values)           # .values returns an array; list() turns it into a Python list

In [135]:
np.array(my_list)                                   # Convert the list back to an array

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

### Zipping

In [139]:
# Pair each name with the age found at the same position.
names = ['pankaj', 'suruchi']
age = [29, 28]
ex = [(person, years) for person, years in zip(names, age)]
ex

[('pankaj', 29), ('suruchi', 28)]

### Sorting

In [144]:
# Sort the rows by sepal length, largest first.
# The result is stored in `df_sorted` rather than `sorted`, so the built-in sorted()
# function is not shadowed for the rest of the session.
df_sorted = df.sort_values(['sepal.length'], ascending=False)
df_sorted.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,child
131,7.9,3.8,6.4,2.0,Virginica,
135,7.7,3.0,6.1,2.3,Virginica,


In [143]:
# Sort by several columns: sepal length first, ties broken by sepal width, both descending.
# The result is stored in `df_sorted` rather than `sorted`, so the built-in sorted()
# function is not shadowed for the rest of the session.
df_sorted = df.sort_values(['sepal.length', 'sepal.width'], ascending=False)
df_sorted.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,child
131,7.9,3.8,6.4,2.0,Virginica,
117,7.7,3.8,6.7,2.2,Virginica,


### Subsetting

#### Subsetting by Rows

In [150]:
# Keep only the rows whose sepal length is below 5 (boolean-mask row selection)
df_sub = df.loc[df['sepal.length'].lt(5)]
df_sub.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety,child
1,4.9,3.0,1.4,0.2,Setosa,
2,4.7,3.2,1.3,0.2,Setosa,


In [157]:
# Non-NULL count per column among the rows whose sepal width equals 3
df.loc[df['sepal.width'].eq(3)].count()

sepal.length    26
sepal.width     26
petal.length    26
petal.width     26
variety         26
child            0
dtype: int64

#### Subsetting by Column

In [185]:
# Selecting one column with a single pair of brackets returns a Series
df1 = df['variety']
df1.head(2)

0    Setosa
1    Setosa
Name: variety, dtype: object

In [179]:
# Selecting with a list of column names returns a DataFrame
df2 = df.loc[:, ['variety', 'petal.length']]
df2.head(2)

Unnamed: 0,variety,petal.length
0,Setosa,1.4
1,Setosa,1.4


In [186]:
df1.drop_duplicates(keep='first')         # Distinct values, keeping the first occurrence of each

0          Setosa
50     Versicolor
100     Virginica
Name: variety, dtype: object

In [187]:
# reset_index() moves the old index into a regular column named 'index'
df2 = df1.reset_index()
df2.head(3)

Unnamed: 0,index,variety
0,0,Setosa
1,1,Setosa
2,2,Setosa


In [189]:
# drop = True discards the old index instead of keeping it as a column
df2 = df1.reset_index(drop = True)
df2.head(2)

0    Setosa
1    Setosa
Name: variety, dtype: object

In [1]:
# set_index() returns a new DataFrame and leaves df untouched, so the result has to be
# captured in order to see the new index.
# The column is called 'sepal.length' until the rename cell has run and 'sepal_length'
# afterwards, so use whichever name is present.
# NOTE: df must already be loaded (run the read_csv cell first), otherwise this raises NameError.
index_column = 'sepal_length' if 'sepal_length' in df.columns else 'sepal.length'
df_indexed = df.set_index(index_column)
df_indexed.head(2)

NameError: name 'df' is not defined

#### Renaming Columns

In [194]:
# Rename 'sepal.length' to 'sepal_length', modifying df itself (inplace)
column_renames = {'sepal.length': 'sepal_length'}
df.rename(columns=column_renames, inplace=True)
df.head(2)

Unnamed: 0,sepal_length,petal.length,variety
0,5.1,1.4,Setosa
1,4.9,1.4,Setosa


#### Dropping Columns

In [168]:
# Without the inplace option drop() returns a new DataFrame, so the result must be assigned back.
# axis='columns' (same as axis = 1) removes columns; axis='index' (0) would remove rows by index label.
df = df.drop(labels=['petal.width'], axis='columns')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,variety
0,5.1,3.5,1.4,Setosa
1,4.9,3.0,1.4,Setosa
2,4.7,3.2,1.3,Setosa
3,4.6,3.1,1.5,Setosa
4,5.0,3.6,1.4,Setosa


In [170]:
# With inplace = True the change is applied to the same DataFrame, so no assignment is needed.
df.drop(labels=['sepal.width'], axis='columns', inplace=True)
df.head()

Unnamed: 0,sepal.length,petal.length,variety
0,5.1,1.4,Setosa
1,4.9,1.4,Setosa
2,4.7,1.3,Setosa
3,4.6,1.5,Setosa
4,5.0,1.4,Setosa


In [202]:
df.T                                      # Transpose: rows become columns and columns become rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
sepal_length,5.1,4.9,4.7,4.6,5,5.4,4.6,5,4.4,4.9,...,6.7,6.9,5.8,6.8,6.7,6.7,6.3,6.5,6.2,5.9
petal.length,1.4,1.4,1.3,1.5,1.4,1.7,1.4,1.5,1.4,1.5,...,5.6,5.1,5.1,5.9,5.7,5.2,5,5.2,5.4,5.1
variety,Setosa,Setosa,Setosa,Setosa,Setosa,Setosa,Setosa,Setosa,Setosa,Setosa,...,Virginica,Virginica,Virginica,Virginica,Virginica,Virginica,Virginica,Virginica,Virginica,Virginica


### Writing to Files from DataFrames

In [190]:
df.to_csv('my_data.csv', index = False)                 # index = False leaves the row index out of the file (default is index = True)