In [65]:
import pandas as pd
import numpy as np

In [66]:
# Build the raw, column-oriented dataset: each key is a column name and each
# list holds that column's 10 values. np.nan marks a missing value (one in
# 'age', one in 'country'). Note that 'age' actually stores birth years.
dataset = {
    'name': [
        'John', 'Mary', 'Edwards', 'Rose', 'James',
        'Emily', 'Ellie', 'Julius', 'Thomas', 'Jesus',
    ],
    'age': [
        1992, 1985, 1997, 1967, 1991,
        1990, 1978, np.nan, 1995, 1992,
    ],
    'gender': [
        'm', 'f', 'm', 'f', 'm',
        'f', 'f', 'm', 'm', 'm',
    ],
    'country': [
        'EUA', 'BRAZIL', 'CHILE', 'EUA', 'EUA',
        'BRAZIL', 'ARGENTINA', 'BRAZIL', 'BRAZIL', np.nan,
    ],
}

In [67]:
# Turn the dict of columns into a pandas DataFrame, then print a summary
# (column names, non-null counts and dtypes) to spot missing values.
df = pd.DataFrame(data=dataset)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
name       10 non-null object
age        9 non-null float64
gender     10 non-null object
country    9 non-null object
dtypes: float64(1), object(3)
memory usage: 400.0+ bytes


In [68]:
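# Note: the age and country columns each have only 9 non-null values.
# The age column is stored as float64 (because of the NaN) and should be converted to an integer type (int32).
# But first, the missing values have to be cleaned out of the dataset.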

In [69]:
# Cleaning and converting data.
# dropna() removes every row that contains at least one NaN (here: the row
# with a missing age and the row with a missing country). The result is
# re-bound to `df` instead of using inplace=True, so the whole step is one
# explicit chain. The original index labels are kept (no reset_index).
# With the NaN rows gone, 'age' no longer needs to be float64. It is cast to
# an explicit int32 so the resulting dtype does not depend on the platform or
# NumPy version (plain `int` may map to a different integer width).
df = df.dropna().astype({'age': 'int32'})
df

Unnamed: 0,name,age,gender,country
0,John,1992,m,EUA
1,Mary,1985,f,BRAZIL
2,Edwards,1997,m,CHILE
3,Rose,1967,f,EUA
4,James,1991,m,EUA
5,Emily,1990,f,BRAZIL
6,Ellie,1978,f,ARGENTINA
8,Thomas,1995,m,BRAZIL


In [70]:
# Now let's analyze the dataset.
# Who was born in 1990 or before? (The 'age' column holds birth years.)
df.loc[df['age'].le(1990)]

Unnamed: 0,name,age,gender,country
1,Mary,1985,f,BRAZIL
3,Rose,1967,f,EUA
5,Emily,1990,f,BRAZIL
6,Ellie,1978,f,ARGENTINA


In [71]:
# Which people are from Brazil? Keep only rows whose country equals 'BRAZIL'.
df.loc[df['country'].eq('BRAZIL')]

Unnamed: 0,name,age,gender,country
1,Mary,1985,f,BRAZIL
5,Emily,1990,f,BRAZIL
8,Thomas,1995,m,BRAZIL


In [72]:
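# Note: you can't use Python's `and`/`or` operators to combine these conditions,
# because each condition is an element-wise (many-to-many) comparison that yields a whole Series,
# and `and`/`or` only handle a single True/False value. Use `&` and `|` instead, wrapping each condition in parentheses.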

In [73]:
# Which people are from Brazil AND were born after 1990?
# The two boolean Series are combined element-wise with `&`.
df.loc[df['country'].eq('BRAZIL') & df['age'].gt(1990)]

Unnamed: 0,name,age,gender,country
8,Thomas,1995,m,BRAZIL


In [75]:
# Which people are from EUA OR were born before 1990?
# The two boolean Series are combined element-wise with `|`.
df.loc[df['country'].eq('EUA') | df['age'].lt(1990)]

Unnamed: 0,name,age,gender,country
0,John,1992,m,EUA
1,Mary,1985,f,BRAZIL
3,Rose,1967,f,EUA
4,James,1991,m,EUA
6,Ellie,1978,f,ARGENTINA


In [79]:
# Export the cleaned dataframe to CSV.
# index=False keeps the DataFrame index from being written as an extra column.
df.to_csv(path_or_buf='02_basic_pandas.csv', index=False)