# 1. 导入包和数据集

In [1]:
# -*- coding: utf-8 -*-
"""
Python 3.7.7
sklearn 0.23.1
预处理黑色星期五购物数据
"""

# 导入包
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Black Friday purchase records from the CSV file
data = pd.read_csv(filepath_or_buffer='BlackFriday.csv')
# Preview the first rows (head() shows five rows by default)
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0.0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0.0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0.0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0.0,8,,,7969


# 2. 处理缺失数据

In [3]:
# Handle missing data
# Count the missing entries in every column (isna is the same check as isnull)
null_df = data.isna().sum(axis=0)
null_df

User_ID                           0
Product_ID                        0
Gender                            0
Age                               0
Occupation                        0
City_Category                     0
Stay_In_Current_City_Years        0
Marital_Status                    3
Product_Category_1                0
Product_Category_2            15721
Product_Category_3            34817
Purchase                          0
dtype: int64

#### Marital_Status字段有3个缺失值；根据业务场景，缺失的默认是未婚，即用0填补。
#### Product_Category_2字段有15721个缺失值；根据业务场景，这个字段不重要，删除。
#### Product_Category_3字段有34817个缺失值；根据业务场景，这个字段不重要，删除。

In [4]:
# Drop the two columns that are mostly missing
data = data.drop(columns=['Product_Category_2', 'Product_Category_3'])

In [5]:
# Fill the missing Marital_Status values with 0 (treated as unmarried).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, triggers a chained-assignment warning in recent
# pandas, and leaves `data` unchanged once Copy-on-Write is enabled.
data['Marital_Status'] = data['Marital_Status'].fillna(0)

In [6]:
# Re-count the missing entries per column to confirm none remain
null_df = data.isna().sum(axis=0)
null_df

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Purchase                      0
dtype: int64

# 3. 特征工程

In [7]:
# Feature engineering
# Remove the identifier columns, which are not used as features
data = data.drop(columns=['User_ID', 'Product_ID'])

In [8]:
# Convert Stay_In_Current_City_Years to integers.
# The open-ended label '4+' is mapped to '4' (string to string, so the column
# stays homogeneous) and the whole column is then cast to int64.
# The result is assigned back rather than using replace(inplace=True) on the
# selected column, which acts on an intermediate object and does not update
# `data` under pandas Copy-on-Write.
data['Stay_In_Current_City_Years'] = (
    data['Stay_In_Current_City_Years']
    .replace('4+', '4')
    .astype('int64')
)

# 4. 处理类别型字段

In [9]:
# Handle categorical fields
# Print the dtype of every column to see which ones pandas treats as categorical (object)
print(data.dtypes)

Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years      int64
Marital_Status                float64
Product_Category_1              int64
Purchase                        int64
dtype: object


#### 根据业务场景，Occupation列、Marital_Status列和Product_Category_1列应该是类别型字段。需要转换。

In [10]:

# Re-type the numerically coded columns as object so they are handled as categories
for categorical_col in ('Product_Category_1', 'Occupation', 'Marital_Status'):
    data[categorical_col] = data[categorical_col].astype('object')

In [11]:
# Print the column dtypes again to confirm the conversion to object
print(data.dtypes)

Gender                        object
Age                           object
Occupation                    object
City_Category                 object
Stay_In_Current_City_Years     int64
Marital_Status                object
Product_Category_1            object
Purchase                       int64
dtype: object


In [12]:
# One-hot encode every object-typed column; drop_first=True removes the first
# level of each category so the dummy columns are not perfectly collinear.
# dtype is pinned to uint8 (the default before pandas 2.0) because newer pandas
# emits bool dummies, which would turn `data.values` into an object array when
# combined with the remaining integer column.
data = pd.get_dummies(data, drop_first=True, dtype='uint8')

# 5. 得到自变量和因变量

In [13]:
# Separate the target (Purchase) from the feature matrix
y = data['Purchase'].to_numpy()
data = data.drop(columns='Purchase')
x = data.to_numpy()

# 6. 拆分训练集和测试集

In [14]:
# Split into training and test sets: 30% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

#### 自变量保存到x_train和x_test中，因变量保存到y_train和y_test中。

# 7. 特征缩放

In [15]:
# Feature scaling
# Fit the feature scaler on the training rows only, then apply that same
# fitted scaler to both the training and the test rows
sc_x = StandardScaler()
sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

# The target scaler needs a 2-D column as input; the scaled result is
# flattened back to 1-D. y_test is not scaled here.
sc_y = StandardScaler()
y_train_column = y_train.reshape(-1, 1)
y_train = sc_y.fit_transform(y_train_column).ravel()

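#### 缩放后，x_train、x_test和y_train中各特征的值处于相似范围内。其中x_test使用在训练集上拟合的缩放器进行转换；y_test未被缩放，仍保持原始尺度，因此评估时应先用sc_y.inverse_transform把预测值还原到原始尺度再与y_test比较。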

#### 结论：
#### 数据预处理有固定的方法。
#### Python提供了丰富的库，方便人们做数据预处理工作。
#### 最初的数据通过数据预处理生成了x_train、y_train、x_test、y_test。在下一章中，前2个变量将训练模型，后2个变量将评估模型。
