In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
df = pd.read_csv('file_temp.csv')

In [3]:
df.head()

Unnamed: 0,CustomerID,Gender,Tenure_Months,Transaction_Date,Product_SKU,Product_Description,Product_Category,Quantity,Avg_Price,Delivery_Charges,...,Online_Spend,Month,Coupon_Code,Discount_pct,Customer_Age,City,State,Region,Total_Prices,Total_Spend
0,17850.0,M,12.0,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1.0,153.71,6.5,...,2424.5,1,ELEC10,10.0,54,Corpus Christi,Texas,South,160.21,6924.5
1,17850.0,M,12.0,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1.0,153.71,6.5,...,2424.5,1,ELEC10,10.0,54,Arlington,Texas,South,160.21,6924.5
2,17850.0,M,12.0,2019-01-01,GGOENEBQ078999,Nest Cam Outdoor Security Camera - USA,Nest-USA,2.0,122.77,6.5,...,2424.5,1,ELEC10,10.0,54,Plano,Texas,South,252.04,6924.5
3,17850.0,M,12.0,2019-01-01,GGOENEBQ079099,Nest Protect Smoke + CO White Battery Alarm-USA,Nest-USA,1.0,81.5,6.5,...,2424.5,1,ELEC10,10.0,54,Irvine,California,West,88.0,6924.5
4,17850.0,M,12.0,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,Nest-USA,1.0,153.71,6.5,...,2424.5,1,ELEC10,10.0,54,Buffalo,New York,East,160.21,6924.5


# Dự đoán Số ngày một khách hàng sẽ quay trở lại mua hàng

In [4]:
df['Previous_Transaction_Date'] = df.groupby('CustomerID')['Transaction_Date'].shift(1)
df['Days_Between_Purchases'] = (pd.to_datetime(df['Transaction_Date']) - pd.to_datetime(df['Previous_Transaction_Date'])).dt.days

df = df[df['Days_Between_Purchases'].notnull()]
features = ['Tenure_Months', 'Days_Between_Purchases', 'Avg_Price']
X = df[features]
y = df['Days_Between_Purchases'].shift(-1)

X = X[:-1]
y = y[:-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [5]:
def predict_customer_return(customer_id, transaction_date):
    customer_history = df[df['CustomerID'] == int(customer_id)]
    if not customer_history.empty:
        last_transaction = customer_history.iloc[-1]
        tenure_months = last_transaction['Tenure_Months']
        avg_price = customer_history['Avg_Price'].mean()
        last_transaction_date = last_transaction['Transaction_Date']
        days_between_purchases = (pd.to_datetime(transaction_date) - pd.to_datetime(last_transaction_date)).days

        input_features = pd.DataFrame([{
            'Tenure_Months': tenure_months,
            'Days_Between_Purchases': days_between_purchases,
            'Avg_Price': avg_price
        }])
        predict_return = model.predict(input_features)
        print(f"Dự đoán khách hàng {customer_id} sẽ quay lại sau khoảng {predict_return[0]:.2f} ngày.")
    else:
        print(f"Không tìm thấy lịch sử giao dịch cho khách hàng {customer_id}.")

In [6]:
predict_customer_return('17850', '2021-01-01')

Dự đoán khách hàng 17850 sẽ quay lại sau khoảng 19.24 ngày.


# Dự đoán xu hướng mua hàng của khách hàng

In [7]:
df = df.drop(['CustomerID', 'Delivery_Charges','Coupon_Code','Discount_pct','GST','Coupon_Status'],axis=1)

In [8]:
scaler = MinMaxScaler()

le = {}
for column in df.columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    le[column] = label_encoder

In [9]:
le

{'Gender': LabelEncoder(),
 'Tenure_Months': LabelEncoder(),
 'Transaction_Date': LabelEncoder(),
 'Product_SKU': LabelEncoder(),
 'Product_Description': LabelEncoder(),
 'Product_Category': LabelEncoder(),
 'Quantity': LabelEncoder(),
 'Avg_Price': LabelEncoder(),
 'Date': LabelEncoder(),
 'Offline_Spend': LabelEncoder(),
 'Online_Spend': LabelEncoder(),
 'Month': LabelEncoder(),
 'Customer_Age': LabelEncoder(),
 'City': LabelEncoder(),
 'State': LabelEncoder(),
 'Region': LabelEncoder(),
 'Total_Prices': LabelEncoder(),
 'Total_Spend': LabelEncoder(),
 'Previous_Transaction_Date': LabelEncoder(),
 'Days_Between_Purchases': LabelEncoder()}

In [10]:
df_temp = df[['Gender','Customer_Age', 'Product_Category', 'Product_Description', 'City', 'State', 'Month']]

In [11]:
df_temp.head()

Unnamed: 0,Gender,Customer_Age,Product_Category,Product_Description,City,State,Month
1,1,36,12,308,4,33,0
2,1,36,12,304,77,33,0
3,1,36,12,313,46,4,0
4,1,36,12,308,13,26,0
5,1,36,12,308,96,36,0


In [12]:
X = df_temp.drop('Product_Description', axis=1)
y = df_temp['Product_Description']

In [13]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
model = RandomForestClassifier(random_state=42)
X_train = scaler.fit_transform(X)
model.fit(X_train, y)

In [15]:
def predict_product_for_customer(gender, age, category, city, state, month):
    data = pd.DataFrame({
        'Gender': [gender],
        'Customer_Age': [age],
        'Product_Category': [category],
        'City': [city],
        'State': [state],
        'Month': [month]
    })
    data['Gender'] = LabelEncoder().fit_transform(data['Gender'])
    data['State'] = LabelEncoder().fit_transform(data['State'])
    data['Month'] = LabelEncoder().fit_transform(data['Month'])
    data['City'] = LabelEncoder().fit_transform(data['City'])
    data['Product_Category'] = LabelEncoder().fit_transform(data['Product_Category'])
    data = data[['Gender', 'Customer_Age', 'Product_Category', 'City', 'State', 'Month']]
    data_scaled = scaler.transform(data)
    predict_product = model.predict(data_scaled)
    product = le['Product_Description'].inverse_transform(predict_product)[0]
    print(f'Predicted Product: {product}')

In [16]:
predict_product_for_customer('F', 30, 'Nest-USA', 'Boise' ,'Washington', 5)

Predicted Product: Google Men's  Zip Hoodie


# Dự đoán khách hàng tiềm năng

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [18]:
df.head()

Unnamed: 0,Gender,Tenure_Months,Transaction_Date,Product_SKU,Product_Description,Product_Category,Quantity,Avg_Price,Date,Offline_Spend,Online_Spend,Month,Customer_Age,City,State,Region,Total_Prices,Total_Spend,Previous_Transaction_Date,Days_Between_Purchases
1,1,10,0,971,308,12,0,528,0,9,269,0,36,4,33,2,4591,337,0,244
2,1,10,0,973,304,12,1,521,0,9,269,0,36,77,33,2,5227,337,0,244
3,1,10,0,974,313,12,0,484,0,9,269,0,36,46,4,3,3371,337,0,244
4,1,10,0,971,308,12,0,528,0,9,269,0,36,13,26,1,4591,337,0,244
5,1,10,0,971,308,12,0,528,0,9,269,0,36,96,36,3,4591,337,0,244


In [19]:
new_df = df[['Gender', 'Customer_Age', 'State', 'Tenure_Months', 'Total_Spend']]

new_df['Gender'] = LabelEncoder().fit_transform(new_df['Gender'])
new_df['State'] = LabelEncoder().fit_transform(new_df['State'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Gender'] = LabelEncoder().fit_transform(new_df['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['State'] = LabelEncoder().fit_transform(new_df['State'])


In [20]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(new_df)
kmeans = KMeans(n_clusters=2, random_state=42)
df['Segment'] = kmeans.fit_predict(X_scaled)

centroids = kmeans.cluster_centers_
distances = np.linalg.norm(X_scaled - centroids[kmeans.labels_], axis=1)

df['Distance'] = distances

threshold = df['Distance'].mean() + df['Distance'].std()

df['Potential_Customer'] = np.where(df['Distance'] < threshold, "Potential", "Non-Potential")

  super()._check_params_vs_input(X, default_n_init=10)


In [22]:
# silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
# silhouette_avg

In [23]:
X_po = df[['Gender', 'Customer_Age', 'State', 'Tenure_Months', 'Total_Spend']]
y_po = df['Potential_Customer']

le['Potential_Customer'] = LabelEncoder()
y_po = le['Potential_Customer'].fit_transform(y_po)

In [24]:
scaler = MinMaxScaler()
X_po = scaler.fit_transform(X_po)

In [25]:
model_po = RandomForestClassifier(random_state=42)
model_po.fit(X_po, y_po)

In [26]:
def predict_potential_customer(gender, age, state, tenure, spend):
    data = pd.DataFrame({
        'Gender': [gender],
        'Customer_Age': [age],
        'State': [state],
        'Tenure_Months': [tenure],
        'Total_Spend': [spend]
    })
    data['Gender'] = LabelEncoder().fit_transform(data['Gender'])
    data['State'] = LabelEncoder().fit_transform(data['State'])
    new_data_scaled = scaler.transform(data)
    predict = model_po.predict(new_data_scaled)
    prd = le['Potential_Customer'].inverse_transform(predict)[0]
    print(f"The customer is a {prd} Customer for the bussiness")

In [27]:
predict_potential_customer('M', 47, 'Texas', 10, 1000)

The customer is a Non-Potential Customer for the bussiness


In [28]:
predict_potential_customer('F', 30, 'Washington', 16, 50)

The customer is a Potential Customer for the bussiness


# Dự đoán hành vi mua sắn theo mùa

In [None]:
# def get_season(month):
#     if month in [12, 1, 2]:
#         return 'Winter'
#     elif month in [3, 4, 5]:
#         return 'Spring'
#     elif month in [6, 7, 8]:
#         return 'Summer'
#     else:
#         return 'Fall'

# df['Season'] = df['Month'].apply(get_season)
# season_df = df[['Transaction_Date', 'Season', 'Total_Spend', 'Month', 'Product_Description']]

In [None]:
# season_df.head()

Unnamed: 0,Transaction_Date,Season,Total_Spend,Month,Product_Description
0,0,Fall,337,0,308
1,0,Fall,337,0,308
2,0,Fall,337,0,304
3,0,Fall,337,0,313
4,0,Fall,337,0,308


In [None]:
# seasonal_spinning = season_df.groupby(['Season', 'Product_Description']).agg({'Total_Spend': 'sum'}).reset_index()

In [None]:
# seasonal_spinning.head()

Unnamed: 0,Season,Product_Description,Total_Spend
0,Fall,0,4173
1,Fall,1,2380
2,Fall,2,2594
3,Fall,3,6017
4,Fall,4,5646


In [None]:
# X = pd.get_dummies(seasonal_spinning[['Season', 'Product_Description']])