# 探究用户对物品类别的喜好细分
- 找到用户和物品类别的关系
- aisles 商品所属具体物品类别
    - aisle_id aisle
- order_products__prior 订单与商品信息
    - order_id product_id add_to_cart_order reordered
- orders 用户订单信息
    - order_id user_id eval_set order_number
- products 商品信息
    - product_id product_name aisle_id department_id
- 需要将user_id和aisle放在同一张表中
- 找到user_id和aisle之间的关系-交叉表和透视表
- 特征冗余过多，有134个，需要PCA降维

## 1.获取数据

In [1]:
import pandas as pd
# 1. Load the data: read the four Instacart CSV files into DataFrames.
aisles = pd.read_csv(filepath_or_buffer='./instacart/aisles.csv')
order_products__prior = pd.read_csv(
    filepath_or_buffer='./instacart/order_products__prior.csv'
)
orders = pd.read_csv(filepath_or_buffer='./instacart/orders.csv')
products = pd.read_csv(filepath_or_buffer='./instacart/products.csv')


In [6]:
# Preview the first rows of the aisles table (aisle_id -> aisle name).
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [7]:
# Preview the first rows of the order/product line items.
order_products__prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [8]:
# Preview the first rows of the orders table (order_id -> user_id).
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [9]:
# Preview the first rows of the products table (product_id -> aisle_id).
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


## 2.合并表
- 合并aisles和products两张表，使aisle和product_id在同一张表中
### 第一次合并
- merge 按 `on` 指定的列（键）进行合并，默认内连接（how='inner'）

In [2]:
# First merge: attach each product to its aisle through the shared
# `aisle_id` column. The join key is given once; listing the same key
# twice in `on` is redundant. pd.merge uses an inner join by default.
table1 = pd.merge(aisles, products, on='aisle_id')
table1.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id
0,1,prepared soups salads,209,Italian Pasta Salad,20
1,1,prepared soups salads,554,Turkey Chili,20
2,1,prepared soups salads,886,Whole Grain Salad with Roasted Pecans & Mango ...,20
3,1,prepared soups salads,1600,Mediterranean Orzo Salad,20
4,1,prepared soups salads,2539,Original Potato Salad,20


### 第二次合并
   

In [3]:
# Second merge: attach the order line items to the aisle/product table
# through the shared `product_id` column. The join key is given once;
# listing the same key twice in `on` is redundant.
table2 = pd.merge(table1, order_products__prior, on='product_id')
table2.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0
1,1,prepared soups salads,209,Italian Pasta Salad,20,192465,2,1
2,1,prepared soups salads,209,Italian Pasta Salad,20,195206,18,1
3,1,prepared soups salads,209,Italian Pasta Salad,20,227717,1,1
4,1,prepared soups salads,209,Italian Pasta Salad,20,260072,13,0


### 第三次合并

In [4]:
# Third merge: attach the order metadata (including `user_id`) through the
# shared `order_id` column, so `user_id` and `aisle` end up in one table.
# The join key is given once; listing the same key twice in `on` is redundant.
table3 = pd.merge(table2, orders, on='order_id')
table3.head()

Unnamed: 0,aisle_id,aisle,product_id,product_name,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,prepared soups salads,209,Italian Pasta Salad,20,94246,5,0,114082,prior,26,0,20,1.0
1,1,prepared soups salads,22853,Pesto Pasta Salad,20,94246,4,0,114082,prior,26,0,20,1.0
2,4,instant foods,12087,Chicken Flavor Ramen Noodle Soup,9,94246,15,0,114082,prior,26,0,20,1.0
3,4,instant foods,47570,Original Flavor Macaroni & Cheese Dinner,9,94246,14,1,114082,prior,26,0,20,1.0
4,13,prepared meals,10089,Dolmas,20,94246,25,0,114082,prior,26,0,20,1.0


## 3.找到user_id和aisle之间的关系
### 交叉表和透视表

In [10]:
# Build a user x aisle frequency table: each cell counts the rows of
# table3 that pair the given user_id with the given aisle.
table = pd.crosstab(index=table3['user_id'], columns=table3['aisle'])
table.head()
# There are 134 features (one per aisle), many of them redundant,
# so dimensionality reduction is needed.

aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,3,0,0,0,0,2,0,0,0,...,3,1,1,0,0,0,0,2,0,42
3,0,0,0,0,0,0,0,0,0,0,...,4,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [11]:
# Work on the first 10000 rows (users) of the crosstab only
# (presumably to keep PCA / KMeans fast -- confirm).
data = table.iloc[:10000]

## 4.pca降维
- 实例化一个转化器
- 调用fit_transform


In [12]:
from sklearn.decomposition import PCA

In [13]:
# Keep 95% of the information: a fractional n_components asks PCA to retain
# enough principal components to explain that share of the variance.
transfer = PCA(n_components=0.95)
# Fit the transformer on the user x aisle counts and project them in one step.
data_new = transfer.fit_transform(data)

In [14]:
# Check how many components PCA kept (rows, reduced feature count).
data_new.shape

(10000, 42)

## 预估器流程

In [15]:
from sklearn.cluster import KMeans

In [16]:
# Unsupervised clustering: there are no target values, only features,
# so fit receives the feature matrix alone.
# n_init is pinned to 10 (the value in effect when this notebook was first
# run) because the library default differs between scikit-learn versions,
# and random_state is fixed so that re-running the notebook reproduces the
# same cluster assignment and evaluation score.
estimator = KMeans(n_clusters=3, n_init=10, random_state=0)
estimator.fit(data_new)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [17]:
# Assign every (PCA-reduced) user row to one of the fitted clusters.
y_predict = estimator.predict(data_new)

In [18]:
y_predict[:300] ## the users have been split into three clusters

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 0, 1, 1, 1,
       1, 2, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0])

## 模型评估
- 轮廓系数

In [19]:
from sklearn.metrics import silhouette_score

In [20]:
# Evaluate the clustering with the silhouette coefficient, computed from the
# reduced features and the predicted cluster labels.
silhouette_score(data_new, y_predict)

0.53941644320775439