In [4]:
import pandas as pd

## Create DataFrame (DF)

In [5]:
# Build a DataFrame from a dict that maps each column name to its list of values.
data = {
    'Name': ['Anthony', 'Maria'],
    'Age': [30, 28],
}
df1 = pd.DataFrame(data=data)
print(df1)
print('\n')

# Build a DataFrame from row records (one inner list per row); the column
# names cannot be inferred from plain lists, so they are given explicitly.
data = [
    ['Tom', 20],
    ['Jack', 30],
    ['Meera', 25],
]
df2 = pd.DataFrame(data=data, columns=['Name', 'Age'])
print(df2)

      Name  Age
0  Anthony   30
1    Maria   28


    Name  Age
0    Tom   20
1   Jack   30
2  Meera   25


In [6]:
# Write df2 to a semicolon-separated file; index=False keeps the row labels out of the file.
df2.to_csv(path_or_buf='students.csv', sep=';', index=False)

In [7]:
# Load the file back into a DataFrame; sep must match the separator used when exporting.
df3 = pd.read_csv(filepath_or_buffer='students.csv', sep=';')
df3

Unnamed: 0,Name,Age
0,Tom,20
1,Jack,30
2,Meera,25


## Concatenating DataFrames and appending data to a DataFrame

In [8]:
# Stack df1 on top of df2. Each frame keeps its own row labels, so 0 and 1 appear twice in the result.
df3 = pd.concat(objs=[df1, df2], axis=0)
df3

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
0,Tom,20
1,Jack,30
2,Meera,25


In [9]:
# Renumber the rows 0..n-1; drop=True discards the old labels instead of keeping them as a new column.
df3 = df3.reset_index(drop = True)
df3

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25


In [10]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack one frame under another.
# ignore_index=True renumbers the rows 0..n-1, so no separate reset_index call is needed.
df3 = pd.concat([df1, df2], ignore_index=True)
df3

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25


In [11]:
# DataFrame.append no longer exists in pandas 2.0+, so wrap the new record in a
# one-row DataFrame and concatenate it. ignore_index=True gives the new row the next integer label.
# Note: re-running this cell adds the row again.
new_row = pd.DataFrame([{'Name': 'Max', 'Age': 18}])
df3 = pd.concat([df3, new_row], ignore_index=True)
df3

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25
5,Max,18


## Accessing columns and rows

In [12]:
df3.head()  # first rows of the frame; head() returns 5 rows when no count is given

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25


In [13]:
df3.tail()  # last rows of the frame; like head(), returns 5 rows when no count is given

Unnamed: 0,Name,Age
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25
5,Max,18


In [14]:
# Bracket access with a column label returns that column as a Series.
df3['Age']

0    30
1    28
2    20
3    30
4    25
5    18
Name: Age, dtype: int64

In [15]:
# Attribute-style access returns the same Series as df3['Age']. It only works when the column
# name is a valid Python identifier that does not clash with an existing DataFrame attribute.
df3.Age

0    30
1    28
2    20
3    30
4    25
5    18
Name: Age, dtype: int64

In [16]:
# Select the 'Age' column, then the element whose row label is 0.
df3['Age'][0]

30

In [17]:
# Boolean mask: keep only the rows whose Age is strictly greater than 22.
df3[df3['Age'] > 22]

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
3,Jack,30
4,Meera,25


In [18]:
# Element-wise conditions are combined with | (or) and & (and), never the `or` / `and` keywords.
is_30_or_older = df3['Age'] >= 30
is_max = df3['Name'] == 'Max'
df3[is_30_or_older | is_max]

Unnamed: 0,Name,Age
0,Anthony,30
3,Jack,30
5,Max,18


## Adding and dropping columns

In [19]:
# Add a 'Matricule' column filled with the scalar 0 (broadcast to every row).
df3 = df3.assign(Matricule=0)
df3

Unnamed: 0,Name,Age,Matricule
0,Anthony,30,0
1,Maria,28,0
2,Tom,20,0
3,Jack,30,0
4,Meera,25,0
5,Max,18,0


In [20]:
# Remove the 'Matricule' column; drop returns a new frame, which is bound back to df3.
df3 = df3.drop(columns=['Matricule'])
df3

Unnamed: 0,Name,Age
0,Anthony,30
1,Maria,28
2,Tom,20
3,Jack,30
4,Meera,25
5,Max,18


In [21]:
# Derive 'Matricule' from 'Age' with a vectorised multiplication (no loop needed).
df3 = df3.assign(Matricule=df3['Age'] * 10000)
df3

Unnamed: 0,Name,Age,Matricule
0,Anthony,30,300000
1,Maria,28,280000
2,Tom,20,200000
3,Jack,30,300000
4,Meera,25,250000
5,Max,18,180000


## loc, iloc (ix was removed in pandas 1.0 and is not covered)

### - iloc

In [22]:
# iloc selects by integer position. Only the last expression of a cell is rendered,
# so just the final iloc call below produces the displayed output.
# Rows:
df3.iloc[0] # first row of data frame
df3.iloc[1] # second row of data frame
df3.iloc[-1] # last row of data frame
# Columns:
df3.iloc[:,0] # first column of data frame
df3.iloc[:,1] # second column of data frame
df3.iloc[:,-1] # last column of data frame
# Mix:
df3.iloc[0:2] # first two rows of data frame (the end of the slice is exclusive)
df3.iloc[:, 0:2] # first two columns of data frame with all rows
df3.iloc[[0,3], [1]] # 1st and 4th rows, 2nd column only

Unnamed: 0,Age
0,30
3,30


### - loc

In [23]:
# Use the 'Name' column as row labels so rows can be looked up by name with loc.
# set_index returns a new frame; df3 itself is left unchanged.
df4 = df3.set_index(keys='Name')
df4

Unnamed: 0_level_0,Age,Matricule
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Anthony,30,300000
Maria,28,280000
Tom,20,200000
Jack,30,300000
Meera,25,250000
Max,18,180000


In [24]:
# loc selects by index label: the row labelled 'Max' comes back as a Series.
df4.loc['Max']

Age              18
Matricule    180000
Name: Max, dtype: int64

In [25]:
# Lists of row labels and column labels select a sub-DataFrame; rows come back in the order requested.
df4.loc[['Max', 'Tom'], ['Age']]

Unnamed: 0_level_0,Age
Name,Unnamed: 1_level_1
Max,18
Tom,20


## Describe DF and basic statistics

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns; 'Name' is left out.
df3.describe()

Unnamed: 0,Age,Matricule
count,6.0,6.0
mean,25.166667,251666.666667
std,5.154286,51542.862422
min,18.0,180000.0
25%,21.25,212500.0
50%,26.5,265000.0
75%,29.5,295000.0
max,30.0,300000.0


In [27]:
# Total of the 'Age' column.
df3['Age'].sum()

151

In [28]:
# Number of occurrences of each distinct age, most frequent first.
df3['Age'].value_counts()

30    2
28    1
20    1
25    1
18    1
Name: Age, dtype: int64

### - Check for missing values

In [29]:
# Element-wise mask: True wherever a value is missing.
df3.isnull()

Unnamed: 0,Name,Age,Matricule
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False


In [30]:
# Collapse the mask per column: True if the column holds at least one missing value.
df3.isnull().any()

Name         False
Age          False
Matricule    False
dtype: bool

In [31]:
# Count the missing values per column (each True counts as 1 when summed).
df3.isnull().sum()

Name         0
Age          0
Matricule    0
dtype: int64

## Apply

In [32]:
def double(x):
    """Return ``2 * x``: numbers are doubled, sequences such as ``str`` are repeated twice."""
    doubled = 2 * x
    return doubled
 
# Call double() on every value of the 'Age' column; the result is a new Series.
df3['Age'].apply(double)

0    60
1    56
2    40
3    60
4    50
5    36
Name: Age, dtype: int64

In [33]:
# Upper-case every name by applying an inline function to each value of the column.
df3['Name'].apply(lambda name: name.upper())

0    ANTHONY
1      MARIA
2        TOM
3       JACK
4      MEERA
5        MAX
Name: Name, dtype: object

In [34]:
def _age_then_name(row):
    """Concatenate a row's Age (converted to text) with its Name."""
    return str(row['Age']) + row['Name']


# axis=1 hands each row to the function as a Series, one call per row.
new_matricule = df3.apply(_age_then_name, axis=1)
new_matricule

0    30Anthony
1      28Maria
2        20Tom
3       30Jack
4      25Meera
5        18Max
dtype: object

In [35]:
# Overwrite 'Matricule' with the Series computed above; values are aligned on the row index.
df3 = df3.assign(Matricule=new_matricule)
df3

Unnamed: 0,Name,Age,Matricule
0,Anthony,30,30Anthony
1,Maria,28,28Maria
2,Tom,20,20Tom
3,Jack,30,30Jack
4,Meera,25,25Meera
5,Max,18,18Max


## Iterrows
/!\ To be avoided whenever possible

Iteration in pandas is an anti-pattern and is something you should only do when you have exhausted every other option.

Cf: https://stackoverflow.com/a/55557758


In [36]:
# iterrows() yields (index, row) pairs, where each row is a Series.
# Never modify what you are iterating over: it is not guaranteed to work in all cases.
# Depending on the data types, the iterator returns a copy and not a view,
# so writing to it may have no effect.

# Because each row is returned as a single Series, dtypes are not preserved across the row.
# Use itertuples() when the original dtypes matter.

for index, row in df3.iterrows():
    print('index : {}'.format(index))
    print(row['Name'] + '\n')

index : 0
Anthony

index : 1
Maria

index : 2
Tom

index : 3
Jack

index : 4
Meera

index : 5
Max

