## Converting Data Types for the Ames Housing Dataset

In [1]:
import pandas as pd

In [2]:
# raw CSV of the Ames, Iowa housing data, served from the workshop's GitHub repository
file_url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/Chapter10/dataset/'
    'ames_iowa_housing.csv'
)

In [3]:
# fetch the CSV located at file_url and parse it into a DataFrame
df = pd.read_csv(file_url)
# (number of rows, number of columns)
df.shape

(1460, 81)

In [4]:
# show the dtype pandas inferred for each column when reading the CSV
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [5]:
# These columns are read as int64 but are cast to the category dtype here,
# one column at a time, replacing each column in df.
# NOTE(review): the notebook output shows Id with 1460 categories for 1460 rows,
# i.e. one category per row -- confirm that a categorical Id is really wanted.
for col in ('Id', 'MSSubClass', 'OverallQual', 'OverallCond'):
    df[col] = df[col].astype('category')

In [6]:
# .name on a selected column returns that column's label
df['Id'].name

'Id'

In [7]:
# for each of the four converted columns, print its name followed by
# the category labels stored in its categorical dtype
converted_cols = ['Id', 'MSSubClass', 'OverallQual', 'OverallCond']
for name in converted_cols:
    categories = df[name].cat.categories
    print('\n{}'.format(name))
    print(categories)


Id
Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460],
           dtype='int64', length=1460)

MSSubClass
Int64Index([20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190], dtype='int64')

OverallQual
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

OverallCond
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [8]:
# per-column summary: non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             1460 non-null   category
 1   MSSubClass     1460 non-null   category
 2   MSZoning       1460 non-null   object  
 3   LotFrontage    1201 non-null   float64 
 4   LotArea        1460 non-null   int64   
 5   Street         1460 non-null   object  
 6   Alley          91 non-null     object  
 7   LotShape       1460 non-null   object  
 8   LandContour    1460 non-null   object  
 9   Utilities      1460 non-null   object  
 10  LotConfig      1460 non-null   object  
 11  LandSlope      1460 non-null   object  
 12  Neighborhood   1460 non-null   object  
 13  Condition1     1460 non-null   object  
 14  Condition2     1460 non-null   object  
 15  BldgType       1460 non-null   object  
 16  HouseStyle     1460 non-null   object  
 17  OverallQual    1460 non-null   ca

In [9]:
# The summary above shows many columns of object dtype; pull them out into
# their own frame to take a closer look at whether they need to be converted.
obj_df = df.select_dtypes(include=['object'])

In [10]:
# keep the labels of the object-typed columns; they are reused by the conversion loop
obj_cols = obj_df.columns
obj_cols

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [11]:
# walk through obj_df column by column and print each column's label
# followed by its distinct values (missing values show up as nan)
for label, values in obj_df.items():
    print('\n{}'.format(label))
    print(values.unique())


MSZoning
['RL' 'RM' 'C (all)' 'FV' 'RH']

Street
['Pave' 'Grvl']

Alley
[nan 'Grvl' 'Pave']

LotShape
['Reg' 'IR1' 'IR2' 'IR3']

LandContour
['Lvl' 'Bnk' 'Low' 'HLS']

Utilities
['AllPub' 'NoSeWa']

LotConfig
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

LandSlope
['Gtl' 'Mod' 'Sev']

Neighborhood
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Condition1
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']

Condition2
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']

BldgType
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']

HouseStyle
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']

RoofStyle
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']

RoofMatl
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']

Exterior1st
['Vi

In [12]:
# check the dtype of one of the selected columns; dtype('O') is the object dtype
obj_df['SaleType'].dtype

dtype('O')

In [13]:
# The unique-value listing shows a limited set of distinct labels in each
# object column, so every one of them is cast to the category dtype.
# The dtype is printed before and after each cast to confirm the change.
for name in obj_cols:
    column = df[name]
    print('\n Before change {}: {}'.format(name, column.dtype))
    df[name] = column.astype('category')
    print('After change {}: {}'.format(name, df[name].dtype))


 Before change MSZoning: object
After change MSZoning: category

 Before change Street: object
After change Street: category

 Before change Alley: object
After change Alley: category

 Before change LotShape: object
After change LotShape: category

 Before change LandContour: object
After change LandContour: category

 Before change Utilities: object
After change Utilities: category

 Before change LotConfig: object
After change LotConfig: category

 Before change LandSlope: object
After change LandSlope: category

 Before change Neighborhood: object
After change Neighborhood: category

 Before change Condition1: object
After change Condition1: category

 Before change Condition2: object
After change Condition2: category

 Before change BldgType: object
After change BldgType: category

 Before change HouseStyle: object
After change HouseStyle: category

 Before change RoofStyle: object
After change RoofStyle: category

 Before change RoofMatl: object
After change RoofMatl: category



In [14]:
# re-inspect the column dtypes now that the conversions have run
df.dtypes

Id               category
MSSubClass       category
MSZoning         category
LotFrontage       float64
LotArea             int64
                   ...   
MoSold              int64
YrSold              int64
SaleType         category
SaleCondition    category
SalePrice           int64
Length: 81, dtype: object