In [94]:
import pandas as pd

# Pandas Series
A Pandas Series is like a column in a table.

It is a one-dimensional array holding data of any type.

In [95]:
# Create a simple Pandas Series from a list:
a = [1, 7, 2]

In [96]:
Series1 = pd.Series(a)

In [97]:
print(Series1)

0    1
1    7
2    2
dtype: int64


# Labels
If nothing else is specified, the values are labeled with their index number. First value has index 0, second value has index 1 etc.

In [98]:
print(Series1[0])

1


# Create Labels
With the index argument, you can name your own labels.

In [99]:
Series2 = pd.Series(a, index = ["X", "Y", "Z"])

In [100]:
print(Series2)

X    1
Y    7
Z    2
dtype: int64


In [101]:
# When you have created labels, you can access an item by referring to the label.
print(Series2["X"])

1


# Key/Value Objects as Series
You can also use a key/value object, like a dictionary, when creating a Series.

In [102]:
calories = {"day1" : 420, "day2" : 380, "day3" : 390}

Series3 = pd.Series(calories)

In [103]:
print(Series3)

day1    420
day2    380
day3    390
dtype: int64


To select only some of the items in the dictionary, use the index argument and specify only the items you want to include in the Series.

In [104]:
Series4 = pd.Series(calories, index = ["day1", "day2"])

In [105]:
print(Series4)

day1    420
day2    380
dtype: int64


# DataFrames
Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

Series is like a column, a DataFrame is the whole table.

In [106]:
data = {
    "calories" : [420, 380, 390],
    "duration" : [50, 40, 45]
}

Series5 = pd.DataFrame(data)

In [107]:
print(Series5)

   calories  duration
0       420        50
1       380        40
2       390        45


# Locate Row
Pandas use the loc attribute to return one or more specified row(s)

In [108]:
print(Series5.loc[0])

calories    420
duration     50
Name: 0, dtype: int64


In [109]:
print(Series5.loc[[0, 1]])

   calories  duration
0       420        50
1       380        40


# Named Indexes
With the index argument, you can name your own indexes.

In [61]:
# Add a list of names to give each row a name:
Series6 = pd.DataFrame(data, index = ["day1", "day2", "day3"])

In [62]:
print(Series6)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


# Locate Named Indexes
Use the named index in the loc attribute to return the specified row(s).

In [63]:
# refer to the named index:
print(Series6.loc["day2"])

calories    380
duration     40
Name: day2, dtype: int64


# Load Files Into a DataFrame
If your data sets are stored in a file, Pandas can load them into a DataFrame.

In [110]:
# Load a comma separated file (CSV file) into a DataFrame:
data = pd.read_csv('data.csv')

In [111]:
# use to_string() to print the entire DataFrame.
print(data.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [66]:
# Print the DataFrame without the to_string() method:
print(data)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

# max_rows
In my system the number is 60, which means that if the DataFrame contains more than 60 rows, the print(data) statement will return only the headers and the first and last 5 rows.

In [67]:
print(pd.options.display.max_rows)

999


In [68]:
pd.options.display.max_rows = 999

In [69]:
data1 = pd.read_csv('data.csv')

print(data1) 

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

# Viewing the Data
The head() method returns the headers and a specified number of rows, starting from the top.

The tail() method returns the headers and a specified number of rows, starting from the bottom.

In [70]:
# Get a quick overview by printing the first 10 rows of the DataFrame:
print(data.head(10))

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
5        60    102       127     300.0
6        60    110       136     374.0
7        45    104       134     253.3
8        30    109       133     195.1
9        60     98       124     269.0


In [71]:
# Print the first 5 rows of the DataFrame:
print(data.head())

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0


In [72]:
print(data.tail())

     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


# Info About the Data
The DataFrames object has a method called info(), that gives you more information about the data set.

In [73]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
None


In [114]:
dff = pd.read_csv('df.csv')

In [115]:
print(dff.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# Data Cleaning
Data cleaning means fixing bad data in your data set.

Bad data could be:

Empty cells
Data in wrong format
Wrong data
Duplicates


In [116]:
df1 = pd.read_csv('df.csv')

# Empty Cells

# - Remove Rows
One way to deal with empty cells is to remove rows that contain empty cells.

This is usually OK, since data sets can be very big, and removing a few rows will not have a big impact on the result.

In [117]:
df1.dropna(inplace = True)

In [118]:
print(df1.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# - Replace Empty Values
Another way of dealing with empty cells is to insert a new value instead.

The fillna() method allows us to replace empty cells with a value:

In [119]:
df2 = pd.read_csv('df.csv')

In [120]:
df2.fillna(130, inplace=True)

In [121]:
print(df2.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# - Replace Only For Specified Columns
The example above replaces all empty cells in the whole Data Frame.

In [122]:
df3 = pd.read_csv('df.csv')

In [123]:
df3["Calories"].fillna(130, inplace = True)

In [125]:
print(df3.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# - Replace Using Mean, Median, or Mode
A common way to replace empty cells, is to calculate the mean, median or mode value of the column.

In [126]:
df4 = pd.read_csv('df.csv')

In [127]:
x = df4["Calories"].mean()

In [128]:
df4["Calories"].fillna(x, inplace = True)

In [129]:
print(df4.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130    409.10
1         60  '2020/12/02'    117       145    479.00
2         60  '2020/12/03'    103       135    340.00
3         45  '2020/12/04'    109       175    282.40
4         45   2020/12/05'    117       148    406.00
5         60   2020/12/06'    102       127    300.00
6         60   2020/12/07'    110       136    374.00
7        450   2020/12/08'    104       134    253.30
8         30   2020/12/09'    109       133    195.10
9         60   2020/12/10'     98       124    269.00
10        60   2020/12/11'    103       147    329.30
11        60   2020/12/12'    100       120    250.70
12        60   2020/12/12'    100       120    250.70
13        60   2020/12/13'    106       128    345.30
14        60   2020/12/14'    104       132    379.30
15        60   2020/12/15'     98       123    275.00
16        60   2020/12/16'     98       120    215.20
17        60   2020/12/17'  

In [132]:
df5 = pd.read_csv('df.csv')

In [133]:
y = df5["Calories"].median()

In [134]:
df5["Calories"].fillna(y, inplace = True)

In [135]:
print(df5.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

In [136]:
df6 = pd.read_csv('df.csv')

In [139]:
z = df6["Calories"].mode()[0]

In [140]:
print(z)

300.0


In [141]:
df6["Calories"].fillna(z, inplace = True)

# Data of Wrong Format
Cells with data of wrong format can make it difficult, or even impossible, to analyze data.

To fix it, you have two options: remove the rows, or convert all cells in the columns into the same format.

# - Convert Into a Correct Format

In [142]:
df7 = pd.read_csv('calories.csv')

In our Data Frame, we have two cells with the wrong format. Check out row 22 and 26, the 'Date' column should be a string that represents a date:

In [143]:
print(df7.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

to convert all cells in the 'Date' column into dates.

Pandas has a to_datetime() method for this:

In [151]:
df7['Date  '] = pd.to_datetime(df7['Date  '])

In [152]:
# NaT = (Not a Time), in other words an empty value
print(df7.to_string)

<bound method DataFrame.to_string of     Duration     Date    Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7        450 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
12        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     

In [153]:
# Remove rows with a NULL value in the "Date" column:
df7.dropna(subset=['Date  '], inplace = True)

In [155]:
print(df7.to_string())

    Duration     Date    Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7        450 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
12        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90 

# Pandas - Fixing Wrong Data

# * Wrong Data

"Wrong data" does not have to be "empty cells" or "wrong format", it can just be wrong, like if someone registered "199" instead of "1.99".

If you take a look at our data set, you can see that in row 7, the duration is 450, but for all the other rows the duration is between 30 and 60.

In [157]:
df8 = pd.read_csv('df.csv')
print(df8.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# - Replacing Values
One way to fix wrong values is to replace them with something else.

In [158]:
# Set "Duration" = 45 in row 7:
df8.loc[7, 'Duration'] = 45

In [159]:
print(df8.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7         45   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

In [160]:
df9 = pd.read_csv('df.csv')
print(df9.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

In [161]:
for x in df.index:
  if df9.loc[x, "Duration"] > 120:
    df9.loc[x, "Duration"] = 120

In [162]:
print(df9.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        120   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

# - Removing Rows
Another way of handling wrong data is to remove the rows that contains wrong data.

In [163]:
df10 = pd.read_csv('df.csv')
print(df10.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

In [164]:
# Delete rows where "Duration" is higher than 120:
for x in df10.index:
    if df10.loc[x, "Duration"] > 120:
        df10.drop(x, inplace = True)

In [165]:
print(df10.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'    100       120     300.0
18        45   2020/12/18'  

# Pandas - Removing Duplicates
Duplicate rows are rows that have been registered more than one time.

In [166]:
df11 = pd.read_csv('df.csv')
print(df11.to_string())

    Duration        Date    Pulse  Maxpulse  Calories
0         60   2020/12/01'    110       130     409.1
1         60  '2020/12/02'    117       145     479.0
2         60  '2020/12/03'    103       135     340.0
3         45  '2020/12/04'    109       175     282.4
4         45   2020/12/05'    117       148     406.0
5         60   2020/12/06'    102       127     300.0
6         60   2020/12/07'    110       136     374.0
7        450   2020/12/08'    104       134     253.3
8         30   2020/12/09'    109       133     195.1
9         60   2020/12/10'     98       124     269.0
10        60   2020/12/11'    103       147     329.3
11        60   2020/12/12'    100       120     250.7
12        60   2020/12/12'    100       120     250.7
13        60   2020/12/13'    106       128     345.3
14        60   2020/12/14'    104       132     379.3
15        60   2020/12/15'     98       123     275.0
16        60   2020/12/16'     98       120     215.2
17        60   2020/12/17'  

By taking a look at our test data set, we can assume that row 11 and 12 are duplicates.

To discover duplicates, we can use the duplicated() method.

In [168]:
# Returns True for every row that is a duplicate, othwerwise False:
print(df11.duplicated())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool


# - Removing Duplicates
To remove duplicates, use the drop_duplicates() method.

In [169]:
# Remove all duplicates:
df11.drop_duplicates(inplace = True)

# Pandas - Data Correlations

# - Finding Relationships
A great aspect of the Pandas module is the corr() method.

The corr() method calculates the relationship between each column in your data set.

The corr() method ignores "not numeric" columns.

In [170]:
df7.corr()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.000572,0.042271,-0.118818
Pulse,-0.000572,1.0,0.269,0.510495
Maxpulse,0.042271,0.269,1.0,0.352596
Calories,-0.118818,0.510495,0.352596,1.0


In [171]:
df11.corr()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,0.002627,0.047688,-0.119475
Pulse,0.002627,1.0,0.269672,0.506727
Maxpulse,0.047688,0.269672,1.0,0.344558
Calories,-0.119475,0.506727,0.344558,1.0
