In [264]:
import numpy as np
import pandas as pd

In [265]:
# Base directory for the data files (a relative path); later cells build
# file paths from it, e.g. f'{DATA_PATH}/raw/steamcharts.csv'.
DATA_PATH = '../data'

# Reading and writing data

Reading CSV data from `../data/raw/steamcharts.csv`

Useful `read_csv` parameters:
- `usecols=[...]` → read only the listed columns
- `dtype={'col1': 'int64', 'col2': 'category'}` → force column data types
- `parse_dates=['col_data']` → automatically convert the listed columns to datetime
- `nrows=1000` → read only the first N rows
- `chunksize=100000` → read the file in chunks (streaming) instead of all at once

In [266]:
# Read the raw Steam Charts CSV into `data`, then print its schema summary
# (column names, non-null counts, dtypes and memory usage).
data = pd.read_csv(
    filepath_or_buffer=f'{DATA_PATH}/raw/steamcharts.csv',
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612265 entries, 0 to 612264
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   month         612265 non-null  object 
 1   avg_players   612265 non-null  float64
 2   gain          612265 non-null  object 
 3   gain_percent  612265 non-null  float64
 4   peak_players  612265 non-null  int64  
 5   name          612265 non-null  object 
 6   steam_appid   612265 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 32.7+ MB


In [267]:
# Keep the first five rows of `data` as a small sample and display it.
head = data.head(n=5)
head

Unnamed: 0,month,avg_players,gain,gain_percent,peak_players,name,steam_appid
0,Sep-25,7805.25,883.12,0.1276,13254,Counter-Strike,10
1,Aug-25,6922.13,-449.35,-0.061,12168,Counter-Strike,10
2,Jul-25,7371.48,-833.5,-0.1016,13951,Counter-Strike,10
3,Jun-25,8204.98,-847.53,-0.0936,15798,Counter-Strike,10
4,May-25,9052.51,-471.31,-0.0495,15333,Counter-Strike,10


Writing data to `../data/processed/steamcharts.csv`.

| Format        | Read function         | Write function    | Notes                                            |
| ------------- | --------------------- | ----------------- | ------------------------------------------------ |
| Excel (.xlsx) | `pd.read_excel()`     | `to_excel()`      | Requires `openpyxl`                              |
| JSON          | `pd.read_json()`      | `to_json()`       | `orient='records'` is convenient for APIs        |
| Feather       | `pd.read_feather()`   | `to_feather()`    | Very fast (binary format, uses Arrow)            |
| Pickle        | `pd.read_pickle()`    | `to_pickle()`     | Stores Python objects directly                   |
| SQL           | `pd.read_sql()`       | `to_sql()`        | Integrates with SQLite, Postgres, etc.           |
| HTML          | `pd.read_html()`      | `to_html()`       | `read_html` extracts tables from web pages       |
| Clipboard     | `pd.read_clipboard()` | `to_clipboard()`  | Very handy for quick tests                       |

In [268]:
# Export examples, one line per output format, all writing the `head` sample
# under f'{DATA_PATH}/processed/'. They are commented out, presumably so that
# running the notebook does not write any files; uncomment a line to try it.
# Optional dependencies: `to_excel` requires `openpyxl`, and
# `to_parquet(engine='fastparquet')` requires the `fastparquet` package.
# head.to_csv(f'{DATA_PATH}/processed/steamcharts.csv', index=False)
# head.to_excel(f'{DATA_PATH}/processed/steamcharts.xlsx', index=False)
# head.to_parquet(f'{DATA_PATH}/processed/steamcharts.parquet', index=False, engine='fastparquet')
# head.to_json(f'{DATA_PATH}/processed/steamcharts_records.json', orient='records', lines=False, index=False)

# Data cleaning

In [269]:
# Toy frame for the dropna() demo: NaNs in columns A and B, column C complete.
df = pd.DataFrame(
    data=dict(
        A=[1, 2, np.nan, 4],
        B=[np.nan, 2, np.nan, 4],
        C=[1, 2, 3, 4],
    )
)

df

Unnamed: 0,A,B,C
0,1.0,,1
1,2.0,2.0,2
2,,,3
3,4.0,4.0,4


`DataFrame.dropna()` removes rows or columns that contain NaN values.

Params:
- `DataFrame.dropna(how='all')` -> remove only the rows in which every value is NaN.
- `DataFrame.dropna(subset=['col1', 'col2'])` -> remove the rows that have a NaN in any of the specified columns.
- `DataFrame.dropna(axis=1)` -> remove the columns that contain any NaN value.

In [270]:
# Defaults spelled out: scan rows (axis=0) and drop any row that has at least one NaN.
df_dropna = df.dropna(axis=0, how='any')
df_dropna

Unnamed: 0,A,B,C
1,2.0,2.0,2
3,4.0,4.0,4


In [271]:
# Toy frame for the how='all' / subset demos: row 0 is entirely NaN,
# row 2 is NaN everywhere except column C.
df = pd.DataFrame(
    data=dict(
        A=[np.nan, 2, np.nan, 4],
        B=[np.nan, 2, np.nan, 4],
        C=[np.nan, 2, 3, 4],
    )
)

df

Unnamed: 0,A,B,C
0,,,
1,2.0,2.0,2.0
2,,,3.0
3,4.0,4.0,4.0


In [272]:
# how='all': a row is dropped only when every one of its values is NaN.
df_dropna_how_all = df.dropna(axis=0, how='all')
df_dropna_how_all

Unnamed: 0,A,B,C
1,2.0,2.0,2.0
2,,,3.0
3,4.0,4.0,4.0


In [273]:
# subset=['C']: only column C is inspected when deciding which rows to drop.
df_dropna_subset = df.dropna(axis=0, subset=['C'])
df_dropna_subset

Unnamed: 0,A,B,C
1,2.0,2.0,2.0
2,,,3.0
3,4.0,4.0,4.0


In [274]:
# Toy frame for the axis=1 demo: columns A and C are complete,
# columns B and D contain NaNs.
df = pd.DataFrame(
    data=dict(
        A=[1, 2, 3, 4],
        B=[10, 20, np.nan, 30],
        C=[100, 200, 300, 400],
        D=[1000, np.nan, np.nan, np.nan],
    )
)

df

Unnamed: 0,A,B,C,D
0,1,10.0,100,1000.0
1,2,20.0,200,
2,3,,300,
3,4,30.0,400,


In [275]:
# axis='columns' (same as axis=1): drop every column that contains a NaN.
df_dropna_axis1 = df.dropna(axis='columns')
df_dropna_axis1

Unnamed: 0,A,C
0,1,100
1,2,200
2,3,300
3,4,400


`fillna()` -> fills NaN values with the value passed as the argument (a constant, or a computed statistic such as the column mean or median).

In [276]:
# Toy frame for the fill demos: a single NaN, in column B at row 2.
df = pd.DataFrame(
    data=dict(
        A=[1, 2, 3, 4],
        B=[5, 6, np.nan, 8],
        C=[9, 10, 11, 12],
        D=[13, 14, 15, 16],
    )
)

df

Unnamed: 0,A,B,C,D
0,1,5.0,9,13
1,2,6.0,10,14
2,3,,11,15
3,4,8.0,12,16


In [277]:
# Replace every NaN in the frame with the constant 0.
df_fillna = df.fillna(value=0)
df_fillna

Unnamed: 0,A,B,C,D
0,1,5.0,9,13
1,2,6.0,10,14
2,3,0.0,11,15
3,4,8.0,12,16


In [278]:
# Build a new frame in which the NaNs of column B are replaced by B's mean
# (the mean skips NaN values); `df` itself is left untouched.
df_fillna_mean = df.assign(
    B=lambda frame: frame['B'].fillna(frame['B'].mean()),
)
df_fillna_mean

Unnamed: 0,A,B,C,D
0,1,5.0,9,13
1,2,6.0,10,14
2,3,6.333333,11,15
3,4,8.0,12,16


`ffill()` -> forward fill: replaces each NaN with the last valid value that precedes it in the same column

In [279]:
# Forward fill down each column: a NaN takes the last valid value above it.
df_fillna_ffill = df.ffill(axis=0)
df_fillna_ffill

Unnamed: 0,A,B,C,D
0,1,5.0,9,13
1,2,6.0,10,14
2,3,6.0,11,15
3,4,8.0,12,16


`bfill()` -> backward fill: replaces each NaN with the next valid value that follows it in the same column

In [280]:
# Backward fill up each column: a NaN takes the next valid value below it.
df_fill_bfill = df.bfill(axis=0)
df_fill_bfill

Unnamed: 0,A,B,C,D
0,1,5.0,9,13
1,2,6.0,10,14
2,3,8.0,11,15
3,4,8.0,12,16


`map()` -> applies a function (or, for a Series, a dict lookup) element by element; `Series.map` works on one column, `DataFrame.map` on every cell of the frame

In [281]:
# Toy frame for the map() demo; `married` deliberately mixes letter case
# ('Yes', 'No', 'yEs') so that it needs normalising.
df = pd.DataFrame(
    data=[
        ('Alice', 'New York', 'Yes'),
        ('Bob', 'Los Angeles', 'No'),
        ('Charlie', 'Chicago', 'yEs'),
    ],
    columns=['name', 'city', 'married'],
)

df

Unnamed: 0,name,city,married
0,Alice,New York,Yes
1,Bob,Los Angeles,No
2,Charlie,Chicago,yEs


In [282]:
# Normalise every text cell: trim surrounding whitespace and lowercase it.
# The isinstance guard leaves NaN and other non-string values untouched;
# without it, .strip() raises AttributeError on the first such cell.
df_map_lambda = df.map(
    lambda value: value.strip().lower() if isinstance(value, str) else value
)
df_map_lambda

Unnamed: 0,name,city,married
0,alice,new york,yes
1,bob,los angeles,no
2,charlie,chicago,yes


In [283]:
# Encode the normalised `married` column as 1/0 through a dict lookup.
# Values missing from the dict would come out as NaN.
df_map_yes_no = df_map_lambda.assign(
    married=lambda frame: frame['married'].map({'yes': 1, 'no': 0}),
)
df_map_yes_no

Unnamed: 0,name,city,married
0,alice,new york,1
1,bob,los angeles,0
2,charlie,chicago,1


`apply()` -> applies a function to each row (`axis=1`) or to each column (`axis=0`, the default)

In [284]:
# Toy frame for the apply() demo: unit price and quantity per order line.
df = pd.DataFrame(
    data=dict(
        price=[100, 105, 100],
        quantity=[1, 2, 3],
    )
)

df

Unnamed: 0,price,quantity
0,100,1
1,105,2
2,100,3


In [285]:
# Row-wise apply: axis='columns' (same as axis=1) hands each row to the lambda,
# which multiplies price by quantity. `assign` returns a new frame, so `df`
# stays unchanged. For real workloads the vectorised form
# df['price'] * df['quantity'] gives the same values without a Python-level loop.
df_apply = df.assign(
    total=lambda frame: frame.apply(
        lambda row: row['price'] * row['quantity'],
        axis='columns',
    ),
)

df_apply

Unnamed: 0,price,quantity,total
0,100,1,100
1,105,2,210
2,100,3,300
