# Pandas

In [8]:
import numpy as np
import pandas as pd

In [9]:
#create series from  array
a = np.array(['a','b','c','d'])
print(a)

['a' 'b' 'c' 'd']


In [10]:
s = pd.Series(a)          #initializing a series from a given array
print(s)    #output is index and then value

0    a
1    b
2    c
3    d
dtype: object


In [11]:
pd.Series(a,index=[10,20,30,40])         #changing the index of elements in the series

10    a
20    b
30    c
40    d
dtype: object

In [12]:
b = np.array(['a','b','c','d','e','f'])         #initializing an array
print(b)

['a' 'b' 'c' 'd' 'e' 'f']


In [13]:
p = pd.Series(b)        #converting an array to a string
print(p)

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


In [14]:
pd.Series(b,index=[10,20,30,40])       #error as the number of values does not match the number of indexes 

ValueError: Length of values (6) does not match length of index (4)

In [15]:
#create a series from dictionary
a = {'a':1,'b':2,'c':3,'d':4}
a

{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [16]:
c = pd.Series(a)            #a dictionary 'a' converted to a series
c

a    1
b    2
c    3
d    4
dtype: int64

In [17]:
pd.Series(c,index=[10,20,30,40])       #here index is the fixed key from dictionary, thus at new indexes the value are not transferred

10   NaN
20   NaN
30   NaN
40   NaN
dtype: float64

In [18]:
c

a    1
b    2
c    3
d    4
dtype: int64

In [19]:
c['b']      #display the element at index position 'b'

2

In [20]:
c[1:3]     #slicing with index number method

b    2
c    3
dtype: int64

In [21]:
c[['b','c']]     #slicing with index name method

b    2
c    3
dtype: int64

# Dataframe

In [22]:
a = {'Name':['Sita','Gita','John','Sofy'], 'Roll':[23,45,63,12],
    'Course':['AIML','Python','Java','Data Analytics']}             #'a' is a dictionary, whose values are in the form of list 
print(a)

{'Name': ['Sita', 'Gita', 'John', 'Sofy'], 'Roll': [23, 45, 63, 12], 'Course': ['AIML', 'Python', 'Java', 'Data Analytics']}


In [23]:
d = pd.DataFrame(a)       #the dictionary 'a' is converted to a dataframe 'd'
d

Unnamed: 0,Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python
2,John,63,Java
3,Sofy,12,Data Analytics


In [24]:
d.head()         #by default will return 5 rows from top of the dataframe

Unnamed: 0,Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python
2,John,63,Java
3,Sofy,12,Data Analytics


In [25]:
d.head(2)       #2 rows from top of the dataframe

Unnamed: 0,Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python


In [26]:
d.tail()     #by default 5 rows from bottom of the dataframe

Unnamed: 0,Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python
2,John,63,Java
3,Sofy,12,Data Analytics


In [27]:
d.tail(1)       #1 row from bottom of the dataframe

Unnamed: 0,Name,Roll,Course
3,Sofy,12,Data Analytics


In [28]:
#rename column
d = d.rename(columns={'Name':'Stu_Name'})    #renames the column 'name' of the dataframe as 'Stu_Name'
d

Unnamed: 0,Stu_Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python
2,John,63,Java
3,Sofy,12,Data Analytics


In [29]:
d[0] = d.replace('Sita','Ram',inplace=True)            #replace the value of the dataframe from 'Sita' to 'Ram'
d

Unnamed: 0,Stu_Name,Roll,Course,0
0,Ram,23,AIML,
1,Gita,45,Python,
2,John,63,Java,
3,Sofy,12,Data Analytics,


In [30]:
import pandas as pd

a = {'Name':['Sita','Gita','John','Sofy'], 'Roll':[23,45,63,12],
    'Course':['AIML','Python','Java','Data Analytics']}

d = pd.DataFrame(a)            #getting data frame from a given dictionary

d


Unnamed: 0,Name,Roll,Course
0,Sita,23,AIML
1,Gita,45,Python
2,John,63,Java
3,Sofy,12,Data Analytics


In [31]:
d.set_index('Name',inplace=True)     #now there will not be any index like 0,1,2.... but the index will now be set as the column 'Name'


In [32]:
d

Unnamed: 0_level_0,Roll,Course
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sita,23,AIML
Gita,45,Python
John,63,Java
Sofy,12,Data Analytics


In [33]:
d1=d             #copying a dataframe to another dataframe with the same name

In [34]:
d1

Unnamed: 0_level_0,Roll,Course
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sita,23,AIML
Gita,45,Python
John,63,Java
Sofy,12,Data Analytics


# Statistical operations in DataFrame


In [35]:
b = pd.DataFrame({'Days':[1,2,3,4],'Visitors':[1000,500,680,1350],'Place':['Agra','Manali','Kashmir','Puri']})  
b        #initializing a dataframe

Unnamed: 0,Days,Visitors,Place
0,1,1000,Agra
1,2,500,Manali
2,3,680,Kashmir
3,4,1350,Puri


In [36]:
b.describe()           #describes about the numerical values in a given dataset

Unnamed: 0,Days,Visitors
count,4.0,4.0
mean,2.5,882.5
std,1.290994,374.020944
min,1.0,500.0
25%,1.75,635.0
50%,2.5,840.0
75%,3.25,1087.5
max,4.0,1350.0


In [37]:
b.describe(include='all')      #describes both the numeric type and string type data

Unnamed: 0,Days,Visitors,Place
count,4.0,4.0,4
unique,,,4
top,,,Agra
freq,,,1
mean,2.5,882.5,
std,1.290994,374.020944,
min,1.0,500.0,
25%,1.75,635.0,
50%,2.5,840.0,
75%,3.25,1087.5,


In [38]:
b.describe(include='O')       #describes only the object type data

Unnamed: 0,Place
count,4
unique,4
top,Agra
freq,1


In [39]:
b.isnull()   #checks if missing values are present or not

Unnamed: 0,Days,Visitors,Place
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [40]:
b.isnull().sum()     #checks how many null values are present in each column

Days        0
Visitors    0
Place       0
dtype: int64

# IRIS dataset

In [41]:
data = pd.read_csv(r"C:\Users\CTTC1\Downloads\iris (2).data")  
data           #for reading a dataset we use the different variations of the 'read_' command and then the 'r' keyword inside tells to read

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [42]:
data.head()            #from the top 5 rows

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [43]:
data.tail()          #from the bottom 5 rows

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica


In [44]:
data = pd.read_csv(r"C:\Users\CTTC1\Downloads\iris (2).data", header=None)   #don't make the first row as header
data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [45]:
data.columns = ['sepal_length','sepal_width','petal_length','petal_width','flower']    #rename columns with the attributes of the dataset
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,flower
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [46]:
data.isnull().sum()    #total number of missing values in eaxh column

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
flower          0
dtype: int64

In [47]:
data.dtypes           #data type of each column

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
flower           object
dtype: object

In [48]:
data['sepal_length'].unique()    #unique values in the 'sepal_length' column

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5,
       4.5, 5.3, 7. , 6.4, 6.9, 6.5, 6.3, 6.6, 5.9, 6. , 6.1, 5.6, 6.7,
       6.2, 6.8, 7.1, 7.6, 7.3, 7.2, 7.7, 7.4, 7.9])

In [49]:
data['flower'].value_counts()    #counts the number of entries in a given column

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: flower, dtype: int64

In [50]:
data.flower.unique()   #unique values in the flower column

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [51]:
data.shape     #returns the shape of the dataset

(150, 5)

In [52]:
data.size    #total number of elements in the data set

750