In [1]:
import pandas as pd

# Build a small demo DataFrame: one row per person, one column per attribute
data = dict(
    Nom=['Jean', 'Paul', 'Pierre', 'Luc', 'Francois'],
    Age=[24, 28, 22, 19, 30],
    Ville=['Paris', 'Lyon', 'Marseille', 'Lille', 'Toulouse'],
)
df = pd.DataFrame.from_dict(data)


In [2]:
# Display the DataFrame built in the previous cell
df

Unnamed: 0,Nom,Age,Ville
0,Jean,24,Paris
1,Paul,28,Lyon
2,Pierre,22,Marseille
3,Luc,19,Lille
4,Francois,30,Toulouse


In [3]:
# Select a single column by label; despite its name, df1 is a Series, not a DataFrame
df1 = df['Nom']
df1

0        Jean
1        Paul
2      Pierre
3         Luc
4    Francois
Name: Nom, dtype: object

In [4]:
# Select the first row by position; the result is a Series indexed by column name
df2 = df.iloc[0]
df2

Nom       Jean
Age         24
Ville    Paris
Name: 0, dtype: object

In [5]:
# Boolean-mask filtering: keep only the rows whose Age is strictly greater than 25
df3 = df.loc[df['Age'].gt(25)]
df3

Unnamed: 0,Nom,Age,Ville
1,Paul,28,Lyon
4,Francois,30,Toulouse


In [6]:
# Mean Age per Ville; the result is a Series indexed by Ville
df4 = df.groupby('Ville')['Age'].mean()
df4

Ville
Lille        19.0
Lyon         28.0
Marseille    22.0
Paris        24.0
Toulouse     30.0
Name: Age, dtype: float64

In [7]:
# Show which pandas version produced the outputs in this notebook
print(pd.__version__)

2.0.3


## What is a Series?
#### A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.

In [8]:
# Build a Series from a plain Python list; pandas supplies the default 0..n-1 index
mylist = [1, 3, 4, 6]
series = pd.Series(data=mylist)
series

0    1
1    3
2    4
3    6
dtype: int64

### Create Labels

In [9]:
# Same values as before, but with explicit index labels instead of 0..n-1
series = pd.Series(mylist, index=list("xywz"))
series

x    1
y    3
w    4
z    6
dtype: int64

In [10]:
# Look up a single value by its index label
series["x"]

1

### Create a simple Pandas Series from a dictionary of temperatures

In [11]:
# Build a Series from a dict: the keys become the index labels, the values the data
series = pd.Series(dict(day1=45, day2=40, day3=10))
series

day1    45
day2    40
day3    10
dtype: int64

## Pandas DataFrames
#### Datasets in Pandas are usually multi-dimensional tables, called DataFrames. A Series is like a column; a DataFrame is the whole table.

In [12]:
# Build the weather DataFrame used in the following cells: one row per weekday
data = dict(
    weekday=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    temperature=[34, 4, 5, 99, 45, 67, 89],
    wind=["slow", "fast", "slow", "fast", "slow", "fast", "slow"],
    weather=["sunny", "cloudy", "cloudy", "sunny", "cold", "cold", "sunny"],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,weekday,temperature,wind,weather
0,Monday,34,slow,sunny
1,Tuesday,4,fast,cloudy
2,Wednesday,5,slow,cloudy
3,Thursday,99,fast,sunny
4,Friday,45,slow,cold
5,Saturday,67,fast,cold
6,Sunday,89,slow,sunny


In [13]:
# shape is a tuple: (number of rows, number of columns)
df.shape

(7, 4)

## DataFrame Rows

In [14]:
# head() without an argument shows the first 5 rows of df
df.head()

Unnamed: 0,weekday,temperature,wind,weather
0,Monday,34,slow,sunny
1,Tuesday,4,fast,cloudy
2,Wednesday,5,slow,cloudy
3,Thursday,99,fast,sunny
4,Friday,45,slow,cold


In [15]:
# tail() without an argument shows the last 5 rows of df
df.tail()

Unnamed: 0,weekday,temperature,wind,weather
2,Wednesday,5,slow,cloudy
3,Thursday,99,fast,sunny
4,Friday,45,slow,cold
5,Saturday,67,fast,cold
6,Sunday,89,slow,sunny


In [16]:
# Pass a number to head() to choose how many rows to show (here the first 3)
df.head(3)

Unnamed: 0,weekday,temperature,wind,weather
0,Monday,34,slow,sunny
1,Tuesday,4,fast,cloudy
2,Wednesday,5,slow,cloudy


## DataFrame Columns

In [17]:
# Column labels of the DataFrame
df.columns

Index(['weekday', 'temperature', 'wind', 'weather'], dtype='object')

In [18]:
# Select one column by label; the result is a Series
df["wind"]

0    slow
1    fast
2    slow
3    fast
4    slow
5    fast
6    slow
Name: wind, dtype: object

In [19]:
# Attribute-style access to a column (same result as df["temperature"])
df.temperature

0    34
1     4
2     5
3    99
4    45
5    67
6    89
Name: temperature, dtype: int64

In [20]:
# To view more than one column, put the column names in a list and write df[list].
# Here: the temperature and weekday columns, returned in the order listed.
df[["temperature", "weekday"]]

Unnamed: 0,temperature,weekday
0,34,Monday
1,4,Tuesday
2,5,Wednesday
3,99,Thursday
4,45,Friday
5,67,Saturday
6,89,Sunday


In [21]:
# Sort the rows by temperature in descending order (ascending=False: highest first)
df.sort_values('temperature',ascending=False)

Unnamed: 0,weekday,temperature,wind,weather
3,Thursday,99,fast,sunny
6,Sunday,89,slow,sunny
5,Saturday,67,fast,cold
4,Friday,45,slow,cold
0,Monday,34,slow,sunny
2,Wednesday,5,slow,cloudy
1,Tuesday,4,fast,cloudy


In [22]:
# Data type of each column
df.dtypes

weekday        object
temperature     int64
wind           object
weather        object
dtype: object

## DataFrame Operations

### Create a new column

In [23]:
# Create a Series and add it to the DataFrame as a new column.
city = pd.Series(["Dschang", "Yaounde", "Douala", "Bafoussam", "Ebolowa", "Ayos", "Melong"])

# Column assignment matches the Series to df by index label, not by position.
df["cities"] = city
df

Unnamed: 0,weekday,temperature,wind,weather,cities
0,Monday,34,slow,sunny,Dschang
1,Tuesday,4,fast,cloudy,Yaounde
2,Wednesday,5,slow,cloudy,Douala
3,Thursday,99,fast,sunny,Bafoussam
4,Friday,45,slow,cold,Ebolowa
5,Saturday,67,fast,cold,Ayos
6,Sunday,89,slow,sunny,Melong


### mean,max,min,median,std

In [24]:
# Summary statistics of the temperature column, printed one per line in this order:
# mean, max, min, standard deviation, median
print(
    *(getattr(df['temperature'], stat)() for stat in ('mean', 'max', 'min', 'std', 'median')),
    sep="\n",
)

49.0
99
4
37.91657509146451
45.0


## Filter operations

In [25]:
# Weekdays on which the temperature exceeded 40.
# A single .loc[row_mask, column] call replaces the chained df[mask][column] lookup.
df.loc[df['temperature'] > 40, 'weekday']

3    Thursday
4      Friday
5    Saturday
6      Sunday
Name: weekday, dtype: object

In [26]:
# Temperature recorded on Monday (row mask and column selected in one .loc call)
df.loc[df['weekday'] == 'Monday', 'temperature']

0    34
Name: temperature, dtype: int64

In [27]:
# Weekdays on which the temperature was above the mean temperature
df.loc[df['temperature'] > df['temperature'].mean(), 'weekday']

3    Thursday
5    Saturday
6      Sunday
Name: weekday, dtype: object

In [30]:
# Wind and temperature for the rows whose temperature is above 20
df.loc[df['temperature'] > 20, ['wind', 'temperature']]

Unnamed: 0,wind,temperature
0,slow,34
3,fast,99
4,slow,45
5,fast,67
6,slow,89


## Negative filtering condition

In [32]:
# ~ inverts the boolean mask: keep the rows whose temperature is NOT above 20
df[~(df.loc[:,'temperature']>20)]

Unnamed: 0,weekday,temperature,wind,weather,cities
1,Tuesday,4,fast,cloudy,Yaounde
2,Wednesday,5,slow,cloudy,Douala


In [42]:
# NOTE(review): this cell repeats the previous cell verbatim and could be removed.
# ~ inverts the boolean mask: keep the rows whose temperature is NOT above 20
df[~(df.loc[:,'temperature']>20)]

Unnamed: 0,weekday,temperature,wind,weather,cities
1,Tuesday,4,fast,cloudy,Yaounde
2,Wednesday,5,slow,cloudy,Douala


In [44]:
# Wind and city for the rows whose temperature is above 20
df.loc[df['temperature'] > 20, ['wind', 'cities']]

Unnamed: 0,wind,cities
0,slow,Dschang
3,fast,Bafoussam
4,slow,Ebolowa
5,fast,Ayos
6,slow,Melong


In [45]:
# Rows whose temperature lies between 20 and 50 (between() includes both bounds by default)
df[df['temperature'].between(20,50)]

Unnamed: 0,weekday,temperature,wind,weather,cities
0,Monday,34,slow,sunny,Dschang
4,Friday,45,slow,cold,Ebolowa


## Drop Column

In [28]:
# Suppose the weekday column is irrelevant to us and we want to drop it.
# To drop a column, write dataframe.drop('column_name', axis=1). axis=1 is the
# column axis; it tells pandas that the label names a column rather than a row.
# Passing inplace=True would commit the change to the DataFrame itself. Without it
# (as below), drop() returns a new DataFrame and df is left unchanged, so the
# result must be assigned to a variable if it is needed later.

df.drop('weekday', axis=1)

Unnamed: 0,temperature,wind,weather,cities
0,34,slow,sunny,Dschang
1,4,fast,cloudy,Yaounde
2,5,slow,cloudy,Douala
3,99,fast,sunny,Bafoussam
4,45,slow,cold,Ebolowa
5,67,fast,cold,Ayos
6,89,slow,sunny,Melong


## Add Rows  to DataFrame

In [29]:
# A second table with temperature, wind, weather and cities columns (no weekday column).
# Note: this rebinds df2, which an earlier cell used for a single selected row.
data2 = dict(
    temperature=[34, 4, 5, 99, 45, 67, 89],
    wind=["slow", "fast", "slow", "fast", "slow", "fast", "slow"],
    weather=["sunny", "cloudy", "cloudy", "sunny", "cold", "cold", "sunny"],
    cities=["Kribi", "Garoua", "Fouban", "Kolfata", "Ebolowa", "Bua", "Bamenda"],
)

df2 = pd.DataFrame(data=data2)
df2

Unnamed: 0,temperature,wind,weather,cities
0,34,slow,sunny,Kribi
1,4,fast,cloudy,Garoua
2,5,slow,cloudy,Fouban
3,99,fast,sunny,Kolfata
4,45,slow,cold,Ebolowa
5,67,fast,cold,Bua
6,89,slow,sunny,Bamenda


In [30]:
# Append the rows of df2 below those of df (row-wise concatenation, not a key-based merge).
# df2 has no 'weekday' column, so that column is missing for the appended rows.
# The original index labels are kept, so the labels 0-6 appear twice in the result.
# Because df is reassigned, re-running this cell appends df2 again.
df = pd.concat([df,df2])
df

Unnamed: 0,weekday,temperature,wind,weather,cities
0,Monday,34,slow,sunny,Dschang
1,Tuesday,4,fast,cloudy,Yaounde
2,Wednesday,5,slow,cloudy,Douala
3,Thursday,99,fast,sunny,Bafoussam
4,Friday,45,slow,cold,Ebolowa
5,Saturday,67,fast,cold,Ayos
6,Sunday,89,slow,sunny,Melong
0,,34,slow,sunny,Kribi
1,,4,fast,cloudy,Garoua
2,,5,slow,cloudy,Fouban


## Pandas Index

In [31]:
# Row labels of df; after the concatenation above the labels 0-6 are repeated
print(df.index)

Index([0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int64')


In [37]:
# Use the cities column as the row index, then show the resulting index.
# Reassigning the result (rather than inplace=True) keeps the state change explicit.
# Re-running this cell fails because 'cities' is then no longer a column.
df = df.set_index("cities")
df.index

Index(['Dschang', 'Yaounde', 'Douala', 'Bafoussam', 'Ebolowa', 'Ayos',
       'Melong', 'Kribi', 'Garoua', 'Fouban', 'Kolfata', 'Ebolowa', 'Bua',
       'Bamenda'],
      dtype='object', name='cities')

In [38]:
# Display df, now indexed by city name
df

Unnamed: 0_level_0,weekday,temperature,wind,weather
cities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dschang,Monday,34,slow,sunny
Yaounde,Tuesday,4,fast,cloudy
Douala,Wednesday,5,slow,cloudy
Bafoussam,Thursday,99,fast,sunny
Ebolowa,Friday,45,slow,cold
Ayos,Saturday,67,fast,cold
Melong,Sunday,89,slow,sunny
Kribi,,34,slow,sunny
Garoua,,4,fast,cloudy
Fouban,,5,slow,cloudy


In [39]:
# Move the 'cities' index back into a regular column and restore the default 0..n-1 index.
# Reassigning the result (rather than inplace=True) keeps the state change explicit.
df = df.reset_index()

In [40]:
# Display df after the index reset: 'cities' is a column again
df

Unnamed: 0,cities,weekday,temperature,wind,weather
0,Dschang,Monday,34,slow,sunny
1,Yaounde,Tuesday,4,fast,cloudy
2,Douala,Wednesday,5,slow,cloudy
3,Bafoussam,Thursday,99,fast,sunny
4,Ebolowa,Friday,45,slow,cold
5,Ayos,Saturday,67,fast,cold
6,Melong,Sunday,89,slow,sunny
7,Kribi,,34,slow,sunny
8,Garoua,,4,fast,cloudy
9,Fouban,,5,slow,cloudy


## Apply functions in Pandas

In [41]:
# Use apply() when you want to run a custom function on every value of a column.
# Example: a function that sorts a temperature into the categories low, moderate and high.

def convert_to_classes(x):
    """Classify a temperature reading into a coarse category.

    Parameters
    ----------
    x : int, float or None
        Temperature value to classify.

    Returns
    -------
    str or None
        'low' if x is less than 36, 'moderate' if x is between 36 and 40
        (both included), 'high' if x is more than 40. Returns None when x is
        missing (None or NaN) so a missing reading is not reported as 'high'.
    """
    # NaN fails every comparison, so without this guard it would fall through
    # to the final branch and be labelled 'high'.
    if pd.isna(x):
        return None
    if x < 36:
        return 'low'
    if x <= 40:
        return 'moderate'
    return 'high'

In [42]:
# Classify every temperature with convert_to_classes and store the labels in a new column
df['temperature_category'] = df['temperature'].map(convert_to_classes)
df

Unnamed: 0,cities,weekday,temperature,wind,weather,temperature_category
0,Dschang,Monday,34,slow,sunny,low
1,Yaounde,Tuesday,4,fast,cloudy,low
2,Douala,Wednesday,5,slow,cloudy,low
3,Bafoussam,Thursday,99,fast,sunny,high
4,Ebolowa,Friday,45,slow,cold,high
5,Ayos,Saturday,67,fast,cold,high
6,Melong,Sunday,89,slow,sunny,high
7,Kribi,,34,slow,sunny,low
8,Garoua,,4,fast,cloudy,low
9,Fouban,,5,slow,cloudy,low


In [43]:
# Rename a single column with a {old_name: new_name} mapping; other columns are untouched.
# Reassigning the result (rather than inplace=True) keeps the state change explicit.

df = df.rename(columns={'cities': "Cities"})
df

Unnamed: 0,Cities,weekday,temperature,wind,weather,temperature_category
0,Dschang,Monday,34,slow,sunny,low
1,Yaounde,Tuesday,4,fast,cloudy,low
2,Douala,Wednesday,5,slow,cloudy,low
3,Bafoussam,Thursday,99,fast,sunny,high
4,Ebolowa,Friday,45,slow,cold,high
5,Ayos,Saturday,67,fast,cold,high
6,Melong,Sunday,89,slow,sunny,high
7,Kribi,,34,slow,sunny,low
8,Garoua,,4,fast,cloudy,low
9,Fouban,,5,slow,cloudy,low


In [44]:
# Replace all column names at once. The list is matched to the columns by position,
# so it needs one name per existing column, in the current column order.
df.columns = ['Citie','Weekday','Temperature','Wind','Weather','Temperature_category']
df

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
0,Dschang,Monday,34,slow,sunny,low
1,Yaounde,Tuesday,4,fast,cloudy,low
2,Douala,Wednesday,5,slow,cloudy,low
3,Bafoussam,Thursday,99,fast,sunny,high
4,Ebolowa,Friday,45,slow,cold,high
5,Ayos,Saturday,67,fast,cold,high
6,Melong,Sunday,89,slow,sunny,high
7,Kribi,,34,slow,sunny,low
8,Garoua,,4,fast,cloudy,low
9,Fouban,,5,slow,cloudy,low


## Select Rows in the DataFrame

In [46]:
# loc selects by index label: here the row labelled 0, returned as a Series
df.loc[0]

Citie                   Dschang
Weekday                  Monday
Temperature                  34
Wind                       slow
Weather                   sunny
Temperature_category        low
Name: 0, dtype: object

In [49]:
# Single cell: row label 0, column 'Citie'
df.loc[0,'Citie']

'Dschang'

In [50]:
# A list of labels selects several rows at once
df.loc[[2,4,5]]

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
2,Douala,Wednesday,5,slow,cloudy,low
4,Ebolowa,Friday,45,slow,cold,high
5,Ayos,Saturday,67,fast,cold,high


In [51]:
# Label slices include both endpoints: rows labelled 0 through 3
df.loc[0:3]

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
0,Dschang,Monday,34,slow,sunny,low
1,Yaounde,Tuesday,4,fast,cloudy,low
2,Douala,Wednesday,5,slow,cloudy,low
3,Bafoussam,Thursday,99,fast,sunny,high


In [52]:
# Rows labelled 1 and 3, restricted to the 'Citie' and 'Wind' columns
df.loc[[1,3],["Citie","Wind"]]

Unnamed: 0,Citie,Wind
1,Yaounde,fast
3,Bafoussam,fast


In [53]:
# Rows labelled 0 through 4 (both included), restricted to the 'Citie' and 'Wind' columns
df.loc[0:4,["Citie","Wind"]]

Unnamed: 0,Citie,Wind
0,Dschang,slow
1,Yaounde,fast
2,Douala,slow
3,Bafoussam,fast
4,Ebolowa,slow


In [54]:
# ':' keeps every row; only the listed columns are returned, in the order given
df.loc[:,["Citie","Wind","Temperature"]]

Unnamed: 0,Citie,Wind,Temperature
0,Dschang,slow,34
1,Yaounde,fast,4
2,Douala,slow,5
3,Bafoussam,fast,99
4,Ebolowa,slow,45
5,Ayos,fast,67
6,Melong,slow,89
7,Kribi,slow,34
8,Garoua,fast,4
9,Fouban,slow,5


In [55]:
# iloc selects by integer position rather than by label: here the first row
df.iloc[0]

Citie                   Dschang
Weekday                  Monday
Temperature                  34
Wind                       slow
Weather                   sunny
Temperature_category        low
Name: 0, dtype: object

In [56]:
# Single cell by position: first row, second column (positions are 0-based)
df.iloc[0,1]

'Monday'

In [57]:
 df.iloc[0:4]  # rows at positions 0-3: unlike loc, the end of an iloc slice is excluded

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
0,Dschang,Monday,34,slow,sunny,low
1,Yaounde,Tuesday,4,fast,cloudy,low
2,Douala,Wednesday,5,slow,cloudy,low
3,Bafoussam,Thursday,99,fast,sunny,high


In [59]:
# Rows at positions 1 and 3, restricted to the first two columns (positions 0 and 1)
df.iloc[[1,3],0:2]

Unnamed: 0,Citie,Weekday
1,Yaounde,Tuesday
3,Bafoussam,Thursday


In [62]:

# Return one random row; the row differs from run to run unless random_state is passed
df.sample()

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
8,Garoua,,4,fast,cloudy,low


In [61]:
# Return 3 random rows; the selection differs from run to run unless random_state is passed
df.sample(n=3)

Unnamed: 0,Citie,Weekday,Temperature,Wind,Weather,Temperature_category
6,Melong,Sunday,89,slow,sunny,high
7,Kribi,,34,slow,sunny,low
11,Ebolowa,,45,slow,cold,high


In [63]:
# Convert two columns to a dictionary mapping each Citie to its Temperature.
# A city that appears in more than one row ends up as a single key holding its last value.
dict(zip(df['Citie'],df['Temperature']))

{'Dschang': 34,
 'Yaounde': 4,
 'Douala': 5,
 'Bafoussam': 99,
 'Ebolowa': 45,
 'Ayos': 67,
 'Melong': 89,
 'Kribi': 34,
 'Garoua': 4,
 'Fouban': 5,
 'Kolfata': 99,
 'Bua': 67,
 'Bamenda': 89}