# Pandas Library
> Generally used to import, export, and handle large datasets. <br> Install with: <i>pip install pandas</i>

1. What are a DataFrame and a Series? Creating our own DataFrame and Series.
2. Reading some real datasets — CSV, Excel, and text data.
3. Getting the number of rows and number of columns.
4. Viewing the data — head and tail.
5. Accessing the data from a DataFrame — loc and iloc.
6. Accessing data based on conditions.
7. Getting the summary and datatype of a column.
8. Writing to a CSV.
9. <b>Finding the missing values in a dataset and how to deal with them.</b>

In [1]:
import pandas as pd

In [3]:
# Build a 2x2 DataFrame row by row: each inner list is one row, and the
# row/column labels are supplied explicitly.
df = pd.DataFrame(
    [[24, 36], [34, 67]],
    index=['row1', 'row2'],
    columns=['A', 'B'],
)
df

Unnamed: 0,A,B
row1,24,36
row2,34,67


In [6]:
# A Series is a single labelled column: the mapping keys become the index
# labels and the values become the data.
myseries = pd.Series({'r1': 24, 'r2': 34, 'r3': 54}, name="Ser")
myseries

r1    24
r2    34
r3    54
Name: Ser, dtype: int64

In [7]:
# Load the Iris CSV from the working directory and preview the first rows.
data1 = pd.read_csv(filepath_or_buffer='IRIS.csv')
data1.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# (number of rows, number of columns) of the Iris frame.
data1.shape

(150, 5)

In [11]:
# Show the last seven rows of the Iris frame.
data1.tail(n=7)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
143,6.8,3.2,5.9,2.3,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [12]:
# Load the Excel workbook from the working directory and preview the first rows.
data2 = pd.read_excel(io="Book2.xlsx")
data2.head(n=5)

Unnamed: 0,dt_iso,temp_min,temp_max,temp_base
0,1989-01-01 00:00:00,22.5,24.36,26
1,1989-01-01 01:00:00,22.4,24.35,26
2,1989-01-01 02:00:00,23.31,25.0,26
3,1989-01-01 03:00:00,25.0,27.75,26
4,1989-01-01 04:00:00,25.0,28.67,26


In [13]:
# Preview only the first two rows.
data2.head(n=2)

Unnamed: 0,dt_iso,temp_min,temp_max,temp_base
0,1989-01-01 00:00:00,22.5,24.36,26
1,1989-01-01 01:00:00,22.4,24.35,26


In [15]:
# (number of rows, number of columns) of the Excel-sourced frame.
data2.shape

(271728, 4)

In [16]:
# With no argument, tail() shows the last five rows.
data2.tail()

Unnamed: 0,dt_iso,temp_min,temp_max,temp_base
271723,2019-12-31 19:00:00,25.0,27.4,26
271724,2019-12-31 20:00:00,24.5,26.21,26
271725,2019-12-31 21:00:00,24.4,26.19,26
271726,2019-12-31 22:00:00,24.0,26.1,26
271727,2019-12-31 23:00:00,24.75,25.96,26


In [19]:
# Tab-separated text file with no header row: header=None makes pandas
# number the columns 0, 1, 2 instead of consuming the first data row.
data3 = pd.read_csv(
    "LinearRegData.txt",
    delimiter='\t',
    header=None,
)
data3.head(n=5)

Unnamed: 0,0,1,2
0,1.0,0.067732,3.176513
1,1.0,0.42781,3.816464
2,1.0,0.995731,4.550095
3,1.0,0.738336,4.256571
4,1.0,0.981083,4.560815


In [20]:
# Replace the auto-generated integer column labels with readable names.
data3.columns = pd.Index(['col1', 'col2', 'col3'])
data3.head(n=5)

Unnamed: 0,col1,col2,col3
0,1.0,0.067732,3.176513
1,1.0,0.42781,3.816464
2,1.0,0.995731,4.550095
3,1.0,0.738336,4.256571
4,1.0,0.981083,4.560815


In [21]:
# (number of rows, number of columns) of the tab-separated dataset.
data3.shape

(200, 3)

In [19]:
# Load the Iris CSV under the name `data` for the indexing examples.
data = pd.read_csv(filepath_or_buffer='IRIS.csv')
data.shape

(150, 5)

In [6]:
# Species of the row labelled 0. A single .loc[row, column] lookup replaces
# the chained data['species'][0], which indexes twice and is the pattern that
# triggers chained-assignment problems when used for writes.
data.loc[0, 'species']

'Iris-setosa'

In [9]:
# iloc is position-based: rows at positions 0 and 3, columns at positions 2 and 3.
data.iloc[[0,3],[2,3]]

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
3,1.5,0.2


In [11]:
# loc is label-based: rows labelled 0 and 3, columns selected by name.
data.loc[[0,3],['petal_length','species']]

Unnamed: 0,petal_length,species
0,1.4,Iris-setosa
3,1.5,Iris-setosa


In [20]:
# Keep only Iris-setosa rows whose sepal length exceeds 5; the two boolean
# masks are combined element-wise with &.
newdata = data.loc[
    (data.species == 'Iris-setosa') & (data['sepal_length'] > 5)
]
newdata.shape

(22, 5)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for one column.
newdata['sepal_length'].describe()

count    22.000000
mean      5.313636
std       0.223171
min       5.100000
25%       5.100000
50%       5.250000
75%       5.400000
max       5.800000
Name: sepal_length, dtype: float64

In [18]:
# Data type of a single column.
newdata['sepal_width'].dtype

dtype('float64')

In [21]:
# Write the frame to CSV. index=False stops pandas from writing the default
# 0..n-1 row index as an extra unnamed first column, which would otherwise
# come back as "Unnamed: 0" when the file is read again.
data.to_csv('output.csv', index=False)

# Finding and Handling Missing Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic CSV from the working directory and report its dimensions.
data = pd.read_csv(filepath_or_buffer='titanic.csv')
data.shape

(891, 12)

In [3]:
# Preview the first five rows of the Titanic frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Count missing values per column (isna marks each missing cell as True;
# summing the booleans gives the count).
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Percentage of missing values per column: missing count scaled by the row count.
data.isna().sum() * 100 / len(data)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [7]:
# Cabin is missing in most rows, so drop the column instead of imputing it.
# Rebinding `data` (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because 'Cabin' is already gone.
data = data.drop(columns=['Cabin'], errors='ignore')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [8]:
# Re-check the per-column missing counts after removing Cabin.
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [9]:
# Summary statistics for Age; `count` only covers the non-missing entries.
data['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [12]:
# Impute missing ages with the column mean (Series.mean skips NaN by default).
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data.Age` operates on an intermediate Series, which emits a FutureWarning
# in pandas 2.2 and leaves `data` unchanged under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [13]:
# Confirm Age no longer has missing values.
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [14]:
# Show the rows where Embarked is missing. isna() already returns a boolean
# mask, so comparing it with `== True` is redundant.
data[data['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [16]:
# Forward-fill: each missing Embarked value takes the value of the row above it.
# .ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated since pandas 2.1. The result is assigned back to the column
# because an inplace call on `data.Embarked` acts on an intermediate Series
# and does not update `data` under Copy-on-Write.
data['Embarked'] = data['Embarked'].ffill()

In [17]:
# Final check: every column should now report zero missing values.
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64