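This notebook prepares categorical data for a model using several encoding techniques: binary encoding, ordinal encoding, and one-hot encoding, followed by standard scaling of the resulting numeric features. It is part of the 'Feature engineering 1: Data transformations' syllabus on Codecademy.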

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [11]:
# Load the review data; index_col=0 uses the file's first column as the row index.
reviews = pd.read_csv(
    'E_commerce_reviews',
    index_col=0,
)

# Preview the first five rows as the cell's output.
reviews.head(n=5)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,Liked it,True,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,Loved it,True,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,Was okay,False,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",Loved it,True,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,Loved it,True,6,General,Tops,Blouses


In [12]:
# Display the column labels of the loaded frame (shown as the cell's output value).
reviews.columns

Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended',
       'Positive Feedback Count', 'Division Name', 'Department Name',
       'Class Name'],
      dtype='object')

In [13]:
# Summarize every column's non-null count and dtype, plus the frame's memory usage.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23486 entries, 0 to 23485
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              23486 non-null  int64 
 1   Age                      23486 non-null  int64 
 2   Title                    19676 non-null  object
 3   Review Text              22641 non-null  object
 4   Rating                   23486 non-null  object
 5   Recommended              23486 non-null  bool  
 6   Positive Feedback Count  23486 non-null  int64 
 7   Division Name            23472 non-null  object
 8   Department Name          23472 non-null  object
 9   Class Name               23472 non-null  object
dtypes: bool(1), int64(3), object(6)
memory usage: 1.8+ MB


In [14]:
# Count how many reviews are recommended (True) versus not recommended (False).
recommended_counts = reviews['Recommended'].value_counts()
print(recommended_counts)

True     19314
False     4172
Name: Recommended, dtype: int64


In [15]:
# Binary encoding table: False -> 0, True -> 1.
binary_dict = {flag: int(flag) for flag in (False, True)}

# Replace the boolean column with its integer codes.
reviews['Recommended'] = reviews['Recommended'].map(binary_dict)

# Inspect the encoded column: the first rows, then the count per code.
recommended = reviews['Recommended']
print(recommended.head())
print(recommended.value_counts())

0    1
1    1
2    0
3    1
4    1
Name: Recommended, dtype: int64
1    19314
0     4172
Name: Recommended, dtype: int64


In [16]:
# Show the distribution of the rating labels before encoding.
print(reviews['Rating'].value_counts())

# Ordinal encoding table: the labels have a natural order, so they are mapped
# onto the integers 1 (worst) to 5 (best).
rating_dict = {'Loved it':5, 'Liked it': 4, 'Was okay':3, 'Not great':2, 'Hated it':1}

# Series.map turns every value that is not a key of the dictionary into NaN.
# Encode into a temporary Series and validate it before assigning, so that an
# unexpected label (or re-running this cell on the already-encoded column)
# raises an error instead of silently replacing the whole column with NaN.
rating_encoded = reviews['Rating'].map(rating_dict)
unmapped_mask = rating_encoded.isna() & reviews['Rating'].notna()
if unmapped_mask.any():
    raise ValueError(
        "Rating values without an ordinal code: {}".format(
            list(reviews.loc[unmapped_mask, 'Rating'].unique())
        )
    )

# Overwrite the label column with its validated ordinal codes.
reviews['Rating'] = rating_encoded

# Show the distribution again, now keyed by the ordinal codes.
print(reviews['Rating'].value_counts())

Loved it     13131
Liked it      5077
Was okay      2871
Not great     1565
Hated it       842
Name: Rating, dtype: int64
5    13131
4     5077
3     2871
2     1565
1      842
Name: Rating, dtype: int64


In [17]:
# Show how many reviews fall into each department (missing values are not counted).
print(reviews['Department Name'].value_counts())

# One-hot encode the department: one indicator column per category.
# Rows with a missing department get 0 in every indicator column, because
# get_dummies ignores NaN unless dummy_na=True is passed.
ohe = pd.get_dummies(reviews['Department Name'])

# Join the indicator columns back onto the original frame. Indicator columns
# left over from a previous run of this cell are dropped first: DataFrame.join
# raises ValueError when column names overlap, which made the cell fail when it
# was run a second time. On a fresh run nothing is dropped.
reviews = reviews.drop(columns=ohe.columns, errors='ignore').join(ohe)

# Confirm that the indicator columns were appended.
print(reviews.columns)

Tops        10468
Dresses      6319
Bottoms      3799
Intimate     1735
Jackets      1032
Trend         119
Name: Department Name, dtype: int64
Index(['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended',
       'Positive Feedback Count', 'Division Name', 'Department Name',
       'Class Name', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops',
       'Trend'],
      dtype='object')


In [18]:
# Numeric columns to keep: the clothing id, age, the encoded flag and rating,
# and the department indicator columns.
numeric_columns = [
    'Clothing ID', 'Age', 'Recommended', 'Rating',
    'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend',
]

# Take an independent copy of those columns and move 'Clothing ID' into the
# index, so the id is carried as a row label rather than scaled as a feature.
reviews = reviews[numeric_columns].copy().set_index('Clothing ID')

# StandardScaler standardizes each column to zero mean and unit variance.
scaler = StandardScaler()

# NOTE: fit_transform returns a NumPy array, so from this line on `reviews` is
# no longer a DataFrame (column names and the index are gone). Re-running this
# cell requires re-running the earlier cells first.
reviews = scaler.fit_transform(reviews)

# Print the scaled array (NumPy abbreviates large arrays with ellipses).
print(reviews)

[[-0.83054886  0.4647678  -0.17660399 ... -0.21438431 -0.89672592
  -0.07136282]
 [-0.74911087  0.4647678   0.72429116 ... -0.21438431 -0.89672592
  -0.07136282]
 [ 1.36827674 -2.15161203 -1.07749914 ... -0.21438431 -0.89672592
  -0.07136282]
 ...
 [-0.99342483 -2.15161203 -1.07749914 ... -0.21438431 -0.89672592
  -0.07136282]
 [-1.23773878  0.4647678  -1.07749914 ... -0.21438431 -0.89672592
  -0.07136282]
 [ 0.71677286  0.4647678   0.72429116 ... -0.21438431 -0.89672592
  -0.07136282]]
