In [1]:
import pandas as pd

##### Series 

In [3]:
s = pd.Series([45,12,32,76,38,29,19,42])

In [4]:
s

0    45
1    12
2    32
3    76
4    38
5    29
6    19
7    42
dtype: int64

In [5]:
type(s)

pandas.core.series.Series

In [7]:
s.dtype

dtype('int64')

In [8]:
import numpy as np

In [9]:
s = pd.Series([45,12,32,76,38,29,19,42], dtype = np.int8)
s

0    45
1    12
2    32
3    76
4    38
5    29
6    19
7    42
dtype: int8

In [10]:
s[4]

38

In [11]:
s[4] = 49

In [12]:
s

0    45
1    12
2    32
3    76
4    49
5    29
6    19
7    42
dtype: int8

In [15]:
s = pd.Series([45, 12, 32.67, 76, 38, 29.18, 19, 42])

In [16]:
s

0    45.00
1    12.00
2    32.67
3    76.00
4    38.00
5    29.18
6    19.00
7    42.00
dtype: float64

In [17]:
s.min()

12.0

In [18]:
s.max()

76.0

In [19]:
s.mean()

36.73125

In [20]:
s.std()

19.417677834900854

In [21]:
s.sum()

293.85

##### DataFrame

In [26]:
s1 = pd.Series([1,2,3,4,5])
s2 = pd.Series(['riya','anil','prem','rajat','karan'])
s3 = pd.Series([56.89, 67.34, 72.76, 62.47, 81.45])

In [27]:
df = pd.DataFrame({
    'roll': s1,
    'name': s2,
    'marks': s3
})

In [28]:
df

Unnamed: 0,roll,name,marks
0,1,riya,56.89
1,2,anil,67.34
2,3,prem,72.76
3,4,rajat,62.47
4,5,karan,81.45


In [29]:
type(df)

pandas.core.frame.DataFrame

In [30]:
df.shape

(5, 3)

In [31]:
df.columns

Index(['roll', 'name', 'marks'], dtype='object')

In [32]:
df.dtypes

roll       int64
name      object
marks    float64
dtype: object

##### Working with dataset

In [35]:
# Location: https://mitu.co.in/dataset
# Download student-info.csv and save in current working directory

In [36]:
# How to find current working directory?
import os
os.getcwd()

'/home/mitu/programs/ml/ds-sep25'

In [37]:
# import the data
df = pd.read_csv("student-info.csv")

In [38]:
df

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16
3,4,Gauri Patil,FY,65.8,16
4,5,Rohan Sawant,FY,88.2,17
5,6,Pooja More,FY,72.9,16
6,7,Shreyas Shinde,FY,95.0,16
7,8,Sneha Jadhav,FY,58.6,15
8,9,Mahesh Pawar,FY,76.4,16
9,10,Priya Chavan,FY,89.7,15


In [39]:
# rows and columns
df.shape

(30, 5)

In [40]:
# names of columns
df.columns

Index(['roll', 'name', 'class', 'marks', 'age'], dtype='object')

In [41]:
df.dtypes

roll       int64
name      object
class     object
marks    float64
age        int64
dtype: object

In [42]:
type(df)

pandas.core.frame.DataFrame

##### Data Slicing

In [46]:
# iloc needs row sequence and column sequence
df.iloc[5:13,1:4]

Unnamed: 0,name,class,marks
5,Pooja More,FY,72.9
6,Shreyas Shinde,FY,95.0
7,Sneha Jadhav,FY,58.6
8,Mahesh Pawar,FY,76.4
9,Priya Chavan,FY,89.7
10,Amruta Kadam,SY,91.2
11,Sandeep Mane,SY,68.5
12,Neha Gaikwad,SY,79.9


In [47]:
x = df.iloc[5:13,1:4]
x

Unnamed: 0,name,class,marks
5,Pooja More,FY,72.9
6,Shreyas Shinde,FY,95.0
7,Sneha Jadhav,FY,58.6
8,Mahesh Pawar,FY,76.4
9,Priya Chavan,FY,89.7
10,Amruta Kadam,SY,91.2
11,Sandeep Mane,SY,68.5
12,Neha Gaikwad,SY,79.9


In [48]:
df.iloc[:13,1:4]

Unnamed: 0,name,class,marks
0,Aditya Kulkarni,FY,85.5
1,Swapnil Joshi,FY,92.1
2,Aniket Deshpande,FY,78.3
3,Gauri Patil,FY,65.8
4,Rohan Sawant,FY,88.2
5,Pooja More,FY,72.9
6,Shreyas Shinde,FY,95.0
7,Sneha Jadhav,FY,58.6
8,Mahesh Pawar,FY,76.4
9,Priya Chavan,FY,89.7


In [49]:
df.iloc[6:13,1:]

Unnamed: 0,name,class,marks,age
6,Shreyas Shinde,FY,95.0,16
7,Sneha Jadhav,FY,58.6,15
8,Mahesh Pawar,FY,76.4,16
9,Priya Chavan,FY,89.7,15
10,Amruta Kadam,SY,91.2,17
11,Sandeep Mane,SY,68.5,17
12,Neha Gaikwad,SY,79.9,17


In [50]:
df.iloc[22:,1:]

Unnamed: 0,name,class,marks,age
22,Sonali Kamble,TY,96.2,17
23,Harshal Shelar,TY,71.5,19
24,Prachi Deshmukh,TY,48.8,19
25,Milind Patil,TY,84.3,18
26,Rohini Sawant,TY,59.7,18
27,Yogesh Kulkarni,TY,93.0,18
28,Supriya Waghmare,TY,67.2,19
29,Girish Joshi,TY,75.6,19


In [52]:
df.iloc[[2,6,12,15,25],[1,3]]

Unnamed: 0,name,marks
2,Aniket Deshpande,78.3
6,Shreyas Shinde,95.0
12,Neha Gaikwad,79.9
15,Prasad Desai,61.3
25,Milind Patil,84.3


In [53]:
df.loc[:,'name']

0      Aditya Kulkarni
1        Swapnil Joshi
2     Aniket Deshpande
3          Gauri Patil
4         Rohan Sawant
5           Pooja More
6       Shreyas Shinde
7         Sneha Jadhav
8         Mahesh Pawar
9         Priya Chavan
10        Amruta Kadam
11        Sandeep Mane
12        Neha Gaikwad
13         Sachin Kale
14        Manisha Rane
15        Prasad Desai
16        Aditi Phadke
17        Sameer Tambe
18         Reshma Apte
19       Kiran Godbole
20      Archana Ingale
21       Vivek Bhosale
22       Sonali Kamble
23      Harshal Shelar
24     Prachi Deshmukh
25        Milind Patil
26       Rohini Sawant
27     Yogesh Kulkarni
28    Supriya Waghmare
29        Girish Joshi
Name: name, dtype: object

In [54]:
df.loc[:,['name','marks']]

Unnamed: 0,name,marks
0,Aditya Kulkarni,85.5
1,Swapnil Joshi,92.1
2,Aniket Deshpande,78.3
3,Gauri Patil,65.8
4,Rohan Sawant,88.2
5,Pooja More,72.9
6,Shreyas Shinde,95.0
7,Sneha Jadhav,58.6
8,Mahesh Pawar,76.4
9,Priya Chavan,89.7


In [55]:
df['name']

0      Aditya Kulkarni
1        Swapnil Joshi
2     Aniket Deshpande
3          Gauri Patil
4         Rohan Sawant
5           Pooja More
6       Shreyas Shinde
7         Sneha Jadhav
8         Mahesh Pawar
9         Priya Chavan
10        Amruta Kadam
11        Sandeep Mane
12        Neha Gaikwad
13         Sachin Kale
14        Manisha Rane
15        Prasad Desai
16        Aditi Phadke
17        Sameer Tambe
18         Reshma Apte
19       Kiran Godbole
20      Archana Ingale
21       Vivek Bhosale
22       Sonali Kamble
23      Harshal Shelar
24     Prachi Deshmukh
25        Milind Patil
26       Rohini Sawant
27     Yogesh Kulkarni
28    Supriya Waghmare
29        Girish Joshi
Name: name, dtype: object

In [56]:
df[['name','marks']]

Unnamed: 0,name,marks
0,Aditya Kulkarni,85.5
1,Swapnil Joshi,92.1
2,Aniket Deshpande,78.3
3,Gauri Patil,65.8
4,Rohan Sawant,88.2
5,Pooja More,72.9
6,Shreyas Shinde,95.0
7,Sneha Jadhav,58.6
8,Mahesh Pawar,76.4
9,Priya Chavan,89.7


In [57]:
df.head()

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16
3,4,Gauri Patil,FY,65.8,16
4,5,Rohan Sawant,FY,88.2,17


In [58]:
df.head(3)

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16


In [59]:
df.tail()

Unnamed: 0,roll,name,class,marks,age
25,26,Milind Patil,TY,84.3,18
26,27,Rohini Sawant,TY,59.7,18
27,28,Yogesh Kulkarni,TY,93.0,18
28,29,Supriya Waghmare,TY,67.2,19
29,30,Girish Joshi,TY,75.6,19


In [60]:
df.tail(3)

Unnamed: 0,roll,name,class,marks,age
27,28,Yogesh Kulkarni,TY,93.0,18
28,29,Supriya Waghmare,TY,67.2,19
29,30,Girish Joshi,TY,75.6,19


In [61]:
df.drop(5)

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16
3,4,Gauri Patil,FY,65.8,16
4,5,Rohan Sawant,FY,88.2,17
6,7,Shreyas Shinde,FY,95.0,16
7,8,Sneha Jadhav,FY,58.6,15
8,9,Mahesh Pawar,FY,76.4,16
9,10,Priya Chavan,FY,89.7,15
10,11,Amruta Kadam,SY,91.2,17


In [62]:
df.drop([2,6,12,23])

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
3,4,Gauri Patil,FY,65.8,16
4,5,Rohan Sawant,FY,88.2,17
5,6,Pooja More,FY,72.9,16
7,8,Sneha Jadhav,FY,58.6,15
8,9,Mahesh Pawar,FY,76.4,16
9,10,Priya Chavan,FY,89.7,15
10,11,Amruta Kadam,SY,91.2,17
11,12,Sandeep Mane,SY,68.5,17


In [63]:
df.drop('class', axis = 1)

Unnamed: 0,roll,name,marks,age
0,1,Aditya Kulkarni,85.5,15
1,2,Swapnil Joshi,92.1,16
2,3,Aniket Deshpande,78.3,16
3,4,Gauri Patil,65.8,16
4,5,Rohan Sawant,88.2,17
5,6,Pooja More,72.9,16
6,7,Shreyas Shinde,95.0,16
7,8,Sneha Jadhav,58.6,15
8,9,Mahesh Pawar,76.4,16
9,10,Priya Chavan,89.7,15


In [64]:
df.drop(['class','age'], axis = 1)

Unnamed: 0,roll,name,marks
0,1,Aditya Kulkarni,85.5
1,2,Swapnil Joshi,92.1
2,3,Aniket Deshpande,78.3
3,4,Gauri Patil,65.8
4,5,Rohan Sawant,88.2
5,6,Pooja More,72.9
6,7,Shreyas Shinde,95.0
7,8,Sneha Jadhav,58.6
8,9,Mahesh Pawar,76.4
9,10,Priya Chavan,89.7


##### Descriptive Operations on data

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   roll    30 non-null     int64  
 1   name    30 non-null     object 
 2   class   30 non-null     object 
 3   marks   30 non-null     float64
 4   age     30 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.3+ KB


In [67]:
df.describe()

Unnamed: 0,roll,marks,age
count,30.0,30.0,30.0
mean,15.5,77.443333,17.066667
std,8.803408,13.080828,1.25762
min,1.0,48.8,15.0
25%,8.25,67.525,16.0
50%,15.5,78.05,17.0
75%,22.75,89.325,18.0
max,30.0,96.2,19.0


In [68]:
df['marks'].count()

30

In [69]:
df['marks'].max()

96.2

In [70]:
df['marks'].min()

48.8

In [71]:
df['marks'].sum()

2323.2999999999997

In [72]:
df['marks'].median()

78.05

In [73]:
df['marks'].mean()

77.44333333333333

In [74]:
df.max()

roll                  30
name     Yogesh Kulkarni
class                 TY
marks               96.2
age                   19
dtype: object

In [76]:
df.mean(numeric_only=True)

roll     15.500000
marks    77.443333
age      17.066667
dtype: float64

##### Custom Column operations

In [78]:
df['roll'] + 100

0     101
1     102
2     103
3     104
4     105
5     106
6     107
7     108
8     109
9     110
10    111
11    112
12    113
13    114
14    115
15    116
16    117
17    118
18    119
19    120
20    121
21    122
22    123
23    124
24    125
25    126
26    127
27    128
28    129
29    130
Name: roll, dtype: int64

In [80]:
tf = df.iloc[:5,]

In [81]:
tf

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16
3,4,Gauri Patil,FY,65.8,16
4,5,Rohan Sawant,FY,88.2,17


In [82]:
tf['cgpa'] = tf['marks'] / 10 + 0.27

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf['cgpa'] = tf['marks'] / 10 + 0.27


In [83]:
tf

Unnamed: 0,roll,name,class,marks,age,cgpa
0,1,Aditya Kulkarni,FY,85.5,15,8.82
1,2,Swapnil Joshi,FY,92.1,16,9.48
2,3,Aniket Deshpande,FY,78.3,16,8.1
3,4,Gauri Patil,FY,65.8,16,6.85
4,5,Rohan Sawant,FY,88.2,17,9.09


In [84]:
x = ['M','M','M','F','M']

In [85]:
tf['gender'] = x
tf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf['gender'] = x


Unnamed: 0,roll,name,class,marks,age,cgpa,gender
0,1,Aditya Kulkarni,FY,85.5,15,8.82,M
1,2,Swapnil Joshi,FY,92.1,16,9.48,M
2,3,Aniket Deshpande,FY,78.3,16,8.1,M
3,4,Gauri Patil,FY,65.8,16,6.85,F
4,5,Rohan Sawant,FY,88.2,17,9.09,M


In [87]:
# Download -> student-info1.csv and student-info2.csv
# Link: https://mitu.co.in/dataset

In [91]:
df1 = pd.read_csv('student-info1.csv', names=['roll','name','class','marks','age'])
df2 = pd.read_csv('student-info2.csv')

In [92]:
df1

Unnamed: 0,roll,name,class,marks,age
0,31,Ajit Gawde,FY,68.2,15
1,32,Kavita Shelar,FY,81.9,16
2,33,Sunil Bhosale,SY,74.5,18
3,34,Sarika Kamble,SY,88.0,17
4,35,Nilesh Desai,TY,53.4,18
5,36,Usha Phadke,TY,91.7,19
6,37,Jayant Rane,TY,76.1,18


In [93]:
newdf = pd.concat([df, df1])

In [95]:
newdf.shape

(37, 5)

In [96]:
df2

Unnamed: 0,roll,gender,city
0,1,Male,Pune
1,2,Male,Mumbai
2,3,Male,Pune
3,4,Female,Nagpur
4,5,Male,Sangli
5,6,Female,Mumbai
6,7,Male,Nashik
7,8,Female,Pune
8,9,Male,Mumbai
9,10,Female,Pune


In [97]:
newdf = df.merge(df2, on='roll')

In [98]:
newdf

Unnamed: 0,roll,name,class,marks,age,gender,city
0,1,Aditya Kulkarni,FY,85.5,15,Male,Pune
1,2,Swapnil Joshi,FY,92.1,16,Male,Mumbai
2,3,Aniket Deshpande,FY,78.3,16,Male,Pune
3,4,Gauri Patil,FY,65.8,16,Female,Nagpur
4,5,Rohan Sawant,FY,88.2,17,Male,Sangli
5,6,Pooja More,FY,72.9,16,Female,Mumbai
6,7,Shreyas Shinde,FY,95.0,16,Male,Nashik
7,8,Sneha Jadhav,FY,58.6,15,Female,Pune
8,9,Mahesh Pawar,FY,76.4,16,Male,Mumbai
9,10,Priya Chavan,FY,89.7,15,Female,Pune


##### Grouping and aggregation

In [100]:
df['class'].value_counts()

class
FY    10
SY    10
TY    10
Name: count, dtype: int64

In [101]:
newdf['gender'].value_counts()

gender
Male      16
Female    14
Name: count, dtype: int64

In [102]:
newdf['city'].value_counts()

city
Pune        12
Mumbai       7
Nagpur       4
Nashik       3
Sangli       1
Kolhapur     1
Satara       1
Amravati     1
Name: count, dtype: int64

In [103]:
gr = df.groupby('class')

In [104]:
gr.max()

Unnamed: 0_level_0,roll,name,marks,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FY,10,Swapnil Joshi,95.0,17
SY,20,Sandeep Mane,94.6,18
TY,30,Yogesh Kulkarni,96.2,19


In [106]:
gr.mean(numeric_only=True)

Unnamed: 0_level_0,roll,marks,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FY,5.5,80.25,15.8
SY,15.5,80.25,17.1
TY,25.5,71.83,18.3


##### Boolean filtering

In [110]:
df['marks'] > 75

0      True
1      True
2      True
3     False
4      True
5     False
6      True
7     False
8      True
9      True
10     True
11    False
12     True
13     True
14     True
15    False
16     True
17     True
18     True
19    False
20    False
21    False
22     True
23    False
24    False
25     True
26    False
27     True
28    False
29     True
Name: marks, dtype: bool

In [111]:
x = df[df['marks'] > 75]

In [112]:
x

Unnamed: 0,roll,name,class,marks,age
0,1,Aditya Kulkarni,FY,85.5,15
1,2,Swapnil Joshi,FY,92.1,16
2,3,Aniket Deshpande,FY,78.3,16
4,5,Rohan Sawant,FY,88.2,17
6,7,Shreyas Shinde,FY,95.0,16
8,9,Mahesh Pawar,FY,76.4,16
9,10,Priya Chavan,FY,89.7,15
10,11,Amruta Kadam,SY,91.2,17
12,13,Neha Gaikwad,SY,79.9,17
13,14,Sachin Kale,SY,83.1,18


In [116]:
x = df[(df['marks'] > 75) & (df['class'] == 'SY')]

In [117]:
x

Unnamed: 0,roll,name,class,marks,age
10,11,Amruta Kadam,SY,91.2,17
12,13,Neha Gaikwad,SY,79.9,17
13,14,Sachin Kale,SY,83.1,18
14,15,Manisha Rane,SY,94.6,18
16,17,Aditi Phadke,SY,77.8,16
17,18,Sameer Tambe,SY,82.0,18
18,19,Reshma Apte,SY,90.4,17


##### Save / export the dataframe

In [119]:
x.to_csv('output.csv', index=False)