# Chapter 4: Pandas Data Types

In [8]:
import pandas as pd
import numpy as np

## Introducing pandas dtypes

### Obtaining the underlying data types 

In [9]:
# Column labels for the sample customer table.
column_names = [
    "Customer ID",
    "Customer Name",
    "2018 Revenue",
    "2019 Revenue",
    "Growth",
    "Start Year",
    "Start Month",
    "Start Day",
    "New Customer",
]

# One record per customer, in the same order as column_names.
# Revenue and growth are deliberately kept as formatted strings ('$...', '...%').
row1 = [1001.0, 'Pandas Banking', '$235000', '$248000', '5.5%', 2013, 3, 10, 0]
row2 = [1002.0, 'Pandas Grocery', '$196000', '$205000', '4.5%', 2016, 4, 30, 0]
row3 = [1003.0, 'Pandas Telecom', '$167000', '$193000', '15.5%', 2010, 11, 24, 0]
row4 = [1004.0, 'Pandas Transport', '$79000', '$90000', '13.9%', 2018, 1, 15, 1]
row5 = [1005.0, 'Pandas Insurance', '$241000', '$264000', '9.5%', 2009, 6, 1, 0]

In [10]:
# Assemble the five customer records into a DataFrame labelled with column_names.
data_frame = pd.DataFrame(
    [row1, row2, row3, row4, row5],
    columns=column_names,
)
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,Start Year,Start Month,Start Day,New Customer
0,1001.0,Pandas Banking,$235000,$248000,5.5%,2013,3,10,0
1,1002.0,Pandas Grocery,$196000,$205000,4.5%,2016,4,30,0
2,1003.0,Pandas Telecom,$167000,$193000,15.5%,2010,11,24,0
3,1004.0,Pandas Transport,$79000,$90000,13.9%,2018,1,15,1
4,1005.0,Pandas Insurance,$241000,$264000,9.5%,2009,6,1,0


In [11]:
# Both revenue columns still hold strings here, so "adding" them concatenates the text.
data_frame['2018 Revenue'].add(data_frame['2019 Revenue'])

0    $235000$248000
1    $196000$205000
2    $167000$193000
3      $79000$90000
4    $241000$264000
dtype: object

In [12]:
# Show the dtype pandas assigned to each column of the frame.
data_frame.dtypes

Customer ID      float64
Customer Name     object
2018 Revenue      object
2019 Revenue      object
Growth            object
Start Year         int64
Start Month        int64
Start Day          int64
New Customer       int64
dtype: object

In [13]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer ID    5 non-null      float64
 1   Customer Name  5 non-null      object 
 2   2018 Revenue   5 non-null      object 
 3   2019 Revenue   5 non-null      object 
 4   Growth         5 non-null      object 
 5   Start Year     5 non-null      int64  
 6   Start Month    5 non-null      int64  
 7   Start Day      5 non-null      int64  
 8   New Customer   5 non-null      int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 488.0+ bytes


### Converting from one type to another

In [14]:
# Cast the float customer IDs (1001.0, ...) to integers.
# NOTE(review): 'int' resolves to NumPy's platform default integer; the recorded
# output shows int32 -- use an explicit 'int64'/'int32' if a fixed width matters.
data_frame['Customer ID'] = data_frame['Customer ID'].astype(dtype='int')
data_frame['Customer ID']

0    1001
1    1002
2    1003
3    1004
4    1005
Name: Customer ID, dtype: int32

In [15]:
# Demonstration: values such as '$235000' are not valid integer literals, so a direct
# cast fails. The error is caught and displayed so that "Restart & Run All" can
# continue past this cell instead of stopping on the exception.
try:
    data_frame['2018 Revenue'] = data_frame['2018 Revenue'].astype('int')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '$235000'

In [16]:
def remove_currency(column):
    """Convert a dollar-formatted amount such as '$235000' to an ``int``.

    Despite its name, ``column`` is a single cell value: the function is meant
    to be applied element-wise (e.g. through ``Series.apply``).

    The value is passed through ``str()`` first, so integers that were already
    converted are returned unchanged; this keeps the calling cell safe to
    re-run. Thousands separators (',') are removed along with the '$' sign.

    Parameters
    ----------
    column : str or int
        One revenue value, e.g. ``'$235000'``, ``'$1,250'`` or ``235000``.

    Returns
    -------
    int
        The amount as a plain integer.

    Raises
    ------
    ValueError
        If what remains after removing '$' and ',' is not an integer literal
        (for example ``'$12.50'`` or ``'n/a'``).
    """
    cleaned = str(column).replace('$', '').replace(',', '')
    # int() tolerates surrounding whitespace, so no explicit strip is needed.
    return int(cleaned)

In [17]:
# Run remove_currency over every 2018 revenue value and store the integers back.
data_frame['2018 Revenue'] = data_frame['2018 Revenue'].apply(func=remove_currency)
data_frame['2018 Revenue']

0    235000
1    196000
2    167000
3     79000
4    241000
Name: 2018 Revenue, dtype: int64

In [18]:
# Same clean-up for the 2019 revenue column.
data_frame['2019 Revenue'] = data_frame['2019 Revenue'].apply(func=remove_currency)
data_frame['2019 Revenue']

0    248000
1    205000
2    193000
3     90000
4    264000
Name: 2019 Revenue, dtype: int64

In [21]:
def remove_percentage(column):
    """Convert a percentage string such as '5.5%' to a ``float``.

    Despite its name, ``column`` is a single cell value: the function is meant
    to be applied element-wise (e.g. through ``Series.apply``).

    The value is passed through ``str()`` first, so numbers that were already
    converted are returned unchanged; this keeps the calling cell safe to
    re-run.

    Parameters
    ----------
    column : str or float
        One growth value, e.g. ``'5.5%'`` or ``5.5``.

    Returns
    -------
    float
        The numeric part of the percentage (``'5.5%'`` -> ``5.5``); the value
        is not divided by 100.

    Raises
    ------
    ValueError
        If what remains after removing '%' cannot be parsed as a float.
    """
    cleaned = str(column).replace('%', '')
    return float(cleaned)

# Turn the '5.5%'-style growth strings into floats.
data_frame['Growth'] = data_frame['Growth'].apply(func=remove_percentage)
data_frame['Growth']

0     5.5
1     4.5
2    15.5
3    13.9
4     9.5
Name: Growth, dtype: float64

In [22]:
# The three "Start ..." columns are renamed to 'year', 'month' and 'day' before being
# passed to pd.to_datetime, which combines them into one date per row.
# rename() returns a new frame that is assigned back, rather than mutating in place
# with inplace=True.
data_frame = data_frame.rename(
    columns={'Start Year': 'year', 'Start Month': 'month', 'Start Day': 'day'}
)
data_frame['Starting Date'] = pd.to_datetime(data_frame[['day', 'month', 'year']])
data_frame['Starting Date']

0   2013-03-10
1   2016-04-30
2   2010-11-24
3   2018-01-15
4   2009-06-01
Name: Starting Date, dtype: datetime64[ns]

In [23]:
# Interpret the 0/1 flag as a boolean (0 -> False, 1 -> True).
data_frame['New Customer'] = data_frame['New Customer'].astype(bool)
data_frame['New Customer']

0    False
1    False
2    False
3     True
4    False
Name: New Customer, dtype: bool

In [24]:
# Store the customer names as a categorical column.
data_frame['Customer Name'] = data_frame['Customer Name'].astype(dtype='category')
data_frame['Customer Name']

0      Pandas Banking
1      Pandas Grocery
2      Pandas Telecom
3    Pandas Transport
4    Pandas Insurance
Name: Customer Name, dtype: category
Categories (5, object): ['Pandas Banking', 'Pandas Grocery', 'Pandas Insurance', 'Pandas Telecom', 'Pandas Transport']

In [25]:
# Re-inspect the column dtypes now that the conversions have been applied.
data_frame.dtypes

Customer ID               int32
Customer Name          category
2018 Revenue              int64
2019 Revenue              int64
Growth                  float64
year                      int64
month                     int64
day                       int64
New Customer               bool
Starting Date    datetime64[ns]
dtype: object

In [26]:
# With both revenue columns numeric, the addition is now an arithmetic sum.
data_frame['2018 Revenue'].add(data_frame['2019 Revenue'])

0    483000
1    401000
2    360000
3    169000
4    505000
dtype: int64

In [27]:
# Subtract a fixed reference date from every starting date; the result is a timedelta column.
data_frame['Starting Date'].sub(pd.to_datetime('2020-09-01'))

0   -2732 days
1   -1585 days
2   -3569 days
3    -960 days
4   -4110 days
Name: Starting Date, dtype: timedelta64[ns]

## Exercise 4.01: Underlying Data Types and Conversion

In [28]:
# Read the retail purchase CSV for the exercise (path is relative to the notebook).
file_path = 'Chapter4-Datasets/retail_purchase.csv'
data_frame = pd.read_csv(filepath_or_buffer=file_path)

In [29]:
# Peek at the first five rows.
data_frame.head(n=5)

Unnamed: 0,Receipt Id,Date of Purchase,Product Name,Product Weight,Total Price,Retail shop name
0,10001,24/05/20,Wheat,4.8lb,€17,Fline Store
1,10002,05/05/20,Fruit Juice,3.1lb,€19,Dello Superstore
2,10003,27/04/20,Vegetables,1.2lb,€15,Javies Retail
3,10004,05/05/20,Oil,3.1lb,€17,Javies Retail
4,10005,27/04/20,Wheat,4.8lb,€13,Javies Retail


In [30]:
# Peek at the last five rows.
data_frame.tail(n=5)

Unnamed: 0,Receipt Id,Date of Purchase,Product Name,Product Weight,Total Price,Retail shop name
99995,109996,24/05/20,Oil,4.8lb,€25,Visco Retail
99996,109997,20/04/20,Rice,3.1lb,€12,Kelly Superstore
99997,109998,08/01/20,Fruit Juice,2.7lb,€24,Dello Superstore
99998,109999,05/05/20,Butter,3.1lb,€22,Dello Superstore
99999,110000,17/04/20,Bread,4.4lb,€27,Visco Retail


In [31]:
# Summarise the freshly loaded frame: row count, per-column non-null counts, dtypes and memory usage.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Receipt Id        100000 non-null  int64 
 1   Date of Purchase  100000 non-null  object
 2   Product Name      100000 non-null  object
 3   Product Weight    100000 non-null  object
 4   Total Price       100000 non-null  object
 5   Retail shop name  100000 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.6+ MB


In [32]:
# Parse day/month/two-digit-year strings (e.g. '24/05/20') into datetime64 values.
data_frame['Date of Purchase'] = pd.to_datetime(
    data_frame['Date of Purchase'],
    format='%d/%m/%y',
)
data_frame['Date of Purchase']

0       2020-05-24
1       2020-05-05
2       2020-04-27
3       2020-05-05
4       2020-04-27
           ...    
99995   2020-05-24
99996   2020-04-20
99997   2020-01-08
99998   2020-05-05
99999   2020-04-17
Name: Date of Purchase, Length: 100000, dtype: datetime64[ns]

In [33]:
# Remove the leading euro sign from the price strings.
# Only a literal '€' at the start is removed (rather than slicing off the first
# character), so a value without the symbol is left intact and re-running this cell
# while the column still holds strings cannot drop a leading digit.
data_frame['Total Price'] = data_frame['Total Price'].str.replace(r'^€', '', regex=True)
data_frame['Total Price']

0        17
1        19
2        15
3        17
4        13
         ..
99995    25
99996    12
99997    24
99998    22
99999    27
Name: Total Price, Length: 100000, dtype: object

In [34]:
# The prices are now digit-only strings, so they can be cast to floats.
data_frame['Total Price'] = data_frame['Total Price'].astype(float)
data_frame['Total Price']

0        17.0
1        19.0
2        15.0
3        17.0
4        13.0
         ... 
99995    25.0
99996    12.0
99997    24.0
99998    22.0
99999    27.0
Name: Total Price, Length: 100000, dtype: float64

In [35]:
# Remove the trailing 'lb' unit from the weight strings, leaving just the number.
# Only a literal 'lb' at the end is removed (rather than slicing off the last two
# characters), so a value without the unit is left intact and re-running this cell
# while the column still holds strings cannot drop digits.
data_frame['Product Weight'] = data_frame['Product Weight'].str.replace(r'lb$', '', regex=True)
data_frame['Product Weight']

0        4.8
1        3.1
2        1.2
3        3.1
4        4.8
        ... 
99995    4.8
99996    3.1
99997    2.7
99998    3.1
99999    4.4
Name: Product Weight, Length: 100000, dtype: object

In [36]:
# With the unit gone, the weights can be cast from strings to floats.
data_frame['Product Weight'] = data_frame['Product Weight'].astype(float)
data_frame['Product Weight']

0        4.8
1        3.1
2        1.2
3        3.1
4        4.8
        ... 
99995    4.8
99996    3.1
99997    2.7
99998    3.1
99999    4.4
Name: Product Weight, Length: 100000, dtype: float64

In [37]:
# List the distinct product names in the column.
data_frame['Product Name'].unique()

array(['Wheat', 'Fruit Juice', 'Vegetables', 'Oil', 'Butter', 'Fruits',
       'Cheese', 'Rice', 'Bread'], dtype=object)

In [38]:
# Store the product names as a categorical column.
data_frame['Product Name'] = data_frame['Product Name'].astype(dtype='category')
data_frame['Product Name']

0              Wheat
1        Fruit Juice
2         Vegetables
3                Oil
4              Wheat
            ...     
99995            Oil
99996           Rice
99997    Fruit Juice
99998         Butter
99999          Bread
Name: Product Name, Length: 100000, dtype: category
Categories (9, object): ['Bread', 'Butter', 'Cheese', 'Fruit Juice', ..., 'Oil', 'Rice', 'Vegetables', 'Wheat']

In [39]:
# List the distinct retail shop names in the column.
data_frame['Retail shop name'].unique()

array(['Fline Store', 'Dello Superstore', 'Javies Retail',
       'Oldi Superstore', 'Kanes Store', 'Kelly Superstore',
       'Visco Retail', 'Rotero Retail'], dtype=object)

In [40]:
# Store the shop names as a categorical column.
data_frame['Retail shop name'] = data_frame['Retail shop name'].astype(dtype='category')
data_frame['Retail shop name']

0             Fline Store
1        Dello Superstore
2           Javies Retail
3           Javies Retail
4           Javies Retail
               ...       
99995        Visco Retail
99996    Kelly Superstore
99997    Dello Superstore
99998    Dello Superstore
99999        Visco Retail
Name: Retail shop name, Length: 100000, dtype: category
Categories (8, object): ['Dello Superstore', 'Fline Store', 'Javies Retail', 'Kanes Store', 'Kelly Superstore', 'Oldi Superstore', 'Rotero Retail', 'Visco Retail']

In [41]:
# Summarise the frame again to confirm the converted dtypes and the resulting memory usage.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Receipt Id        100000 non-null  int64         
 1   Date of Purchase  100000 non-null  datetime64[ns]
 2   Product Name      100000 non-null  category      
 3   Product Weight    100000 non-null  float64       
 4   Total Price       100000 non-null  float64       
 5   Retail shop name  100000 non-null  category      
dtypes: category(2), datetime64[ns](1), float64(2), int64(1)
memory usage: 3.2 MB
