In [82]:
import pandas as pd
import numpy as np

In [39]:
housing = pd.read_csv('housing_ds.csv', usecols=[
    "longitude",
    "latitude",
    "total_rooms",
    "total_bedrooms",
    "ocean_proximity"
]) 
housing.head()

Unnamed: 0,longitude,latitude,total_rooms,total_bedrooms,ocean_proximity
0,-122.23,37.88,880.0,129.0,NEAR BAY
1,-122.22,37.86,7099.0,1106.0,NEAR BAY
2,-122.24,37.85,1467.0,190.0,NEAR BAY
3,-122.25,37.85,1274.0,235.0,NEAR BAY
4,-122.25,37.85,1627.0,280.0,NEAR BAY


In [40]:
housing['total_bedrooms'].fillna(0, inplace=True)
housing.info()

# sem nenhuma otimização, o Dataframe está usando 1.6+ de memória

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   longitude        20640 non-null  float64
 1   latitude         20640 non-null  float64
 2   total_rooms      20640 non-null  float64
 3   total_bedrooms   20640 non-null  float64
 4   ocean_proximity  20640 non-null  object 
dtypes: float64(4), object(1)
memory usage: 806.4+ KB


In [41]:
# convertendo para um tipo de dado que consuma menos memoria
# vamos descobrir os valores máximos e mínimos de cada coluna para saber com qual tipo podemos converter 

def max_min_columns(df):
    for colum in df.columns:
        try:
            print(f"Coluna {colum}: Min {df[colum].min()} --- Max {df[colum].max()}")
        except:
            pass

max_min_columns(housing)  # a partir desses dados podemos converter para tipos mais performáticos 

Coluna longitude: Min -124.35 --- Max -114.31
Coluna latitude: Min 32.54 --- Max 41.95
Coluna total_rooms: Min 2.0 --- Max 39320.0
Coluna total_bedrooms: Min 0.0 --- Max 6445.0
Coluna ocean_proximity: Min <1H OCEAN --- Max NEAR OCEAN


In [43]:
housing['longitude'] = housing['longitude'].astype('float16') 
housing['latitude'] = housing['latitude'].astype('float16') 
housing['total_rooms'] = housing['total_rooms'].astype('int16') 
housing['total_bedrooms'] = housing['total_bedrooms'].astype('int32')


In [44]:
housing.info()
# com a mudança de dado, o dataframe agora está usando 1.0+ MB de memória, nem alterei todas as colunas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   longitude        20640 non-null  float16
 1   latitude         20640 non-null  float16
 2   total_rooms      20640 non-null  int16  
 3   total_bedrooms   20640 non-null  int32  
 4   ocean_proximity  20640 non-null  object 
dtypes: float16(2), int16(1), int32(1), object(1)
memory usage: 362.9+ KB


In [63]:
# importando chunks
chunkfile = pd.read_csv('housing_ds.csv', chunksize=5_000)  # isso gera um iterator

for chunk in chunkfile:
    display(chunk.head())
    display(chunk.shape)
    print(" ---------------------- ")
    print(" ")

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


(5000, 10)

 ---------------------- 
 


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5000,-118.28,33.99,49.0,2174.0,481.0,1861.0,484.0,1.7159,95000.0,<1H OCEAN
5001,-118.28,33.99,46.0,1211.0,321.0,1153.0,282.0,1.7849,99300.0,<1H OCEAN
5002,-118.32,34.01,52.0,3104.0,645.0,1498.0,581.0,2.6667,128000.0,<1H OCEAN
5003,-118.32,34.02,50.0,1655.0,256.0,672.0,260.0,4.2554,194300.0,<1H OCEAN
5004,-118.32,34.02,48.0,1949.0,308.0,823.0,340.0,3.3906,189700.0,<1H OCEAN


(5000, 10)

 ---------------------- 
 


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
10000,-121.08,39.02,13.0,1839.0,275.0,752.0,270.0,4.2031,209600.0,INLAND
10001,-121.07,39.05,10.0,1813.0,311.0,827.0,287.0,3.6087,182100.0,INLAND
10002,-121.07,39.04,9.0,2374.0,372.0,884.0,333.0,4.5042,206400.0,INLAND
10003,-121.03,39.05,12.0,1875.0,307.0,806.0,283.0,3.9185,195200.0,INLAND
10004,-121.06,39.04,14.0,1651.0,279.0,633.0,261.0,4.2802,194800.0,INLAND


(5000, 10)

 ---------------------- 
 


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
15000,-117.04,32.74,33.0,3880.0,770.0,2288.0,805.0,3.6848,140700.0,NEAR OCEAN
15001,-117.02,32.74,12.0,3301.0,963.0,2000.0,879.0,1.8594,119200.0,NEAR OCEAN
15002,-117.04,32.74,5.0,2878.0,785.0,1727.0,758.0,1.7179,132000.0,NEAR OCEAN
15003,-117.04,32.75,36.0,2297.0,418.0,1070.0,392.0,3.5192,144000.0,NEAR OCEAN
15004,-117.05,32.75,29.0,2767.0,612.0,1437.0,587.0,2.8306,142900.0,NEAR OCEAN


(5000, 10)

 ---------------------- 
 


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20000,-119.4,36.04,39.0,915.0,199.0,580.0,175.0,1.8894,112500.0,INLAND
20001,-119.27,36.05,29.0,1016.0,174.0,481.0,140.0,2.2917,112500.0,INLAND
20002,-119.21,36.1,30.0,1471.0,373.0,1418.0,357.0,1.7432,42500.0,INLAND
20003,-119.19,36.06,29.0,1815.0,376.0,1421.0,339.0,1.9091,71300.0,INLAND
20004,-119.19,36.14,41.0,759.0,140.0,408.0,129.0,3.9,85900.0,INLAND


(640, 10)

 ---------------------- 
 


In [65]:
# Na construção de datasets, sempre use amostras, para ai sim aplicar tudo para o resto do Dataset

housing_amostra = pd.read_csv('housing.csv', nrows=300)  # como parametros nrwos conseguimos fazer isso

In [77]:
# usando dtype_diet para inferir o tipo de cada coluna

from dtype_diet import report_on_dataframe, optimize_dtypes

In [78]:
report = report_on_dataframe(housing_amostra, unit="MB")
df_optmizate = optimize_dtypes(housing_amostra, report)

In [80]:
df_optmizate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           300 non-null    float64 
 1   latitude            300 non-null    float64 
 2   housing_median_age  300 non-null    float16 
 3   total_rooms         300 non-null    float32 
 4   total_bedrooms      299 non-null    float32 
 5   population          300 non-null    float32 
 6   households          300 non-null    float32 
 7   median_income       300 non-null    float64 
 8   median_house_value  300 non-null    float32 
 9   ocean_proximity     300 non-null    category
dtypes: category(1), float16(1), float32(5), float64(3)
memory usage: 14.0 KB
