In [1]:
import pandas as pd

dataset = pd.read_csv('./supplychain/SupplyChain.csv', encoding='unicode_escape' )

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days for shipping (real)       180519 non-null  int64  
 2   Days for shipment (scheduled)  180519 non-null  int64  
 3   Benefit per order              180519 non-null  float64
 4   Sales per customer             180519 non-null  float64
 5   Delivery Status                180519 non-null  object 
 6   Late_delivery_risk             180519 non-null  int64  
 7   Category Id                    180519 non-null  int64  
 8   Category Name                  180519 non-null  object 
 9   Customer City                  180519 non-null  object 
 10  Customer Country               180519 non-null  object 
 11  Customer Email                 180519 non-null  object 
 12  Customer Fname                

### 欺诈订单预测

In [50]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

In [53]:
num_features = ['Days for shipping (real)', 'Days for shipment (scheduled)', 
                'Benefit per order', 'Sales per customer', 'Latitude', 'Longitude',
               'Order Item Discount', 'Order Item Discount Rate', 
                'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity',
               'Sales', 'Order Item Total', 'Order Profit Per Order', 'Product Price',]

cat_features = ['Market', 'Delivery Status', 'Late_delivery_risk', 'Shipping Mode']

# features
X = dataset[num_features]
# 归一化
mm = MinMaxScaler()
X = pd.DataFrame(mm.fit_transform(X.values))
# 分类数据
for col in cat_features:
    X = pd.concat([X, pd.get_dummies(dataset[col], prefix=col)], axis=1)
# labels 
y = dataset['Order Status'].apply(lambda x: 1 if x=='SUSPECTED_FRAUD' else 0)

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# logistic regression for classifier
model = LogisticRegressionCV(cv=5, verbose=1, tol=1, max_iter=10000)
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.4s finished


LogisticRegressionCV(cv=5, max_iter=10000, tol=1, verbose=1)

In [54]:
# prediction
y_pred = model.predict(X_test)
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[34680,   604],
       [  137,   683]], dtype=int64)

**LR分类效果一般**

### 迟交货订单进行预测

In [61]:
# 需要去掉'Days for shipping (real)', 'Days for shipment (scheduled)' 不然标签泄露
num_features = [ 
                'Benefit per order', 'Sales per customer', 'Latitude', 'Longitude',
               'Order Item Discount', 'Order Item Discount Rate', 
                'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity',
               'Sales', 'Order Item Total', 'Order Profit Per Order', 'Product Price',]

cat_features = ['Market', 'Order Status', 'Shipping Mode']

# features
X = dataset[num_features]
# 归一化
mm = MinMaxScaler()
X = pd.DataFrame(mm.fit_transform(X.values))
# 分类数据
for col in cat_features:
    X = pd.concat([X, pd.get_dummies(dataset[col], prefix=col)], axis=1)
# labels 
y = dataset['Delivery Status'].apply(lambda x: 1 if x=='Late delivery' else 0)

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# logistic regression for classifier
model = LogisticRegressionCV(cv=5, verbose=1, tol=1, max_iter=10000)
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   20.3s finished


LogisticRegressionCV(cv=5, max_iter=10000, tol=1, verbose=1)

In [62]:
# prediction
y_pred = model.predict(X_test)
# confusion matrix
confusion_matrix(y_test, y_pred)

array([[14985,  1315],
       [ 9214, 10590]], dtype=int64)

### 销售预测

In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [64]:
num_features = ['Days for shipping (real)', 'Days for shipment (scheduled)', 
                'Benefit per order', 'Sales per customer', 'Latitude', 'Longitude',
               'Order Item Discount', 'Order Item Discount Rate', 
                'Order Item Product Price', 'Order Item Profit Ratio', 'Order Item Quantity',
                'Order Item Total', 'Order Profit Per Order', 'Product Price',]

cat_features = ['Market', 'Delivery Status', 'Late_delivery_risk', 'Shipping Mode']

# features
X = dataset[num_features]
# 归一化
mm = MinMaxScaler()
X = pd.DataFrame(mm.fit_transform(X.values))
# 分类数据
for col in cat_features:
    X = pd.concat([X, pd.get_dummies(dataset[col], prefix=col)], axis=1)
# labels 
y = dataset['Sales']

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# linear regression
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [66]:
# predition
y_pred = model.predict(X_test)
# mse
mse = mean_squared_error(y_test, y_pred)
print(mse)

2.349991802023261e-06


### 订货数量

In [70]:
num_features = ['Days for shipping (real)', 'Days for shipment (scheduled)', 
                'Benefit per order', 'Sales per customer', 'Latitude', 'Longitude',
               'Order Item Discount', 'Order Item Discount Rate', 
                'Order Item Product Price', 'Order Item Profit Ratio', 
               'Sales', 'Order Item Total', 'Order Profit Per Order', 'Product Price',]

cat_features = ['Market', 'Delivery Status', 'Late_delivery_risk', 'Shipping Mode']

# features
X = dataset[num_features]
# 归一化
mm = MinMaxScaler()
X = pd.DataFrame(mm.fit_transform(X.values))
# 分类数据
for col in cat_features:
    X = pd.concat([X, pd.get_dummies(dataset[col], prefix=col)], axis=1)
# labels 
y = dataset['Order Item Quantity']

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# linear regression 
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [71]:
# predition
y_pred = model.predict(X_test)
# mse
mse = mean_squared_error(y_test, y_pred)
print(mse)

0.33462939068135195
