## Classification and prediction

In [80]:
import pandas as pd

# Location of the downloaded CSV, relative to the notebook's working directory
file_path = 'Customer Purchasing Behaviors.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as plain text
print(df.head(n=5))

   user_id  age  annual_income  purchase_amount  loyalty_score region  \
0        1   25          45000              200            4.5  North   
1        2   34          55000              350            7.0  South   
2        3   45          65000              500            8.0   West   
3        4   22          30000              150            3.0   East   
4        5   29          47000              220            4.8  North   

   purchase_frequency  
0                  12  
1                  18  
2                  22  
3                  10  
4                  13  


#### Check for missing values

In [55]:
# Count the missing values in each column (isna() is the same check as isnull())
missing_per_column = df.isna().sum()
print(missing_per_column)


user_id               0
age                   0
annual_income         0
purchase_amount       0
loyalty_score         0
region                0
purchase_frequency    0
dtype: int64


### Check the data types

In [56]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             238 non-null    int64  
 1   age                 238 non-null    int64  
 2   annual_income       238 non-null    int64  
 3   purchase_amount     238 non-null    int64  
 4   loyalty_score       238 non-null    float64
 5   region              238 non-null    object 
 6   purchase_frequency  238 non-null    int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 13.1+ KB
None


In [57]:
# Data type of each column, displayed as the cell's result
df.dtypes

user_id                 int64
age                     int64
annual_income           int64
purchase_amount         int64
loyalty_score         float64
region                 object
purchase_frequency      int64
dtype: object

#### Check for duplicates

In [58]:
# Number of rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


#### Building the model

#### Random Forest for Classification:

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

#### Creating a new column called purchase_made

In [81]:
# Create the target variable (purchase_made): 1 when purchase_amount is
# strictly positive, otherwise 0. A vectorised comparison replaces the
# row-by-row apply/lambda; the result is the same int64 column.
df['purchase_made'] = (df['purchase_amount'] > 0).astype('int64')

# A classifier cannot learn anything from a target with a single class, and any
# accuracy it reports is trivially perfect. Surface that situation here instead
# of letting it pass unnoticed in the evaluation cell.
if df['purchase_made'].nunique() < 2:
    import warnings

    warnings.warn(
        "purchase_made contains a single class; a classifier trained on it "
        "will report a trivially perfect accuracy."
    )

#### Split the dataset into features (X) and target variable (y)

In [61]:
# Predictor columns for the purchase_made classifier ('region' is still non-numeric here)
feature_columns = ['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']
X = df[feature_columns]

# Binary label the classifier is trained to predict
y = df['purchase_made']

In [62]:
# One-hot encode 'region'; drop_first=True omits the first category's indicator column
X = pd.get_dummies(data=X, columns=['region'], drop_first=True)

In [63]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [64]:
# Train the model.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and reproduces the same metrics; it matches the seed used for the
# train/test split and for the other random-forest models in this notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [65]:
# Predict class labels for the held-out test rows with the fitted classifier
y_pred = model.predict(X_test)

In [66]:
# Score the predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        48

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48



#### Example of Regression:


##### To predict purchase_amount, we can use the following code

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [68]:
# Predictor columns for the purchase_amount regressor ('region' is still non-numeric here)
feature_columns = ['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']
X = df[feature_columns]

# Continuous target: the purchase amount itself
y = df['purchase_amount']

In [69]:
# One-hot encode 'region'; drop_first=True omits the first category's indicator column
X = pd.get_dummies(data=X, columns=['region'], drop_first=True)

In [70]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [71]:
# Train the model.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and reproduces the same error metrics; it matches the seed used for
# the train/test split and for the other random-forest models in this notebook.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [72]:
# Predict purchase_amount for the held-out test rows with the fitted regressor
y_pred = model.predict(X_test)
# Bare expression: shows the prediction array as the cell's output
y_pred

array([629.9, 166.9, 600. , 600. , 398. , 589.6, 324.6, 510. , 500. ,
       420. , 230. , 600. , 385.2, 439.8, 449.8, 629.9, 428. , 430. ,
       360. , 239.3, 340. , 268.2, 240. , 629.9, 266.2, 300. , 449.8,
       170. , 249.8, 514.8, 470. , 600. , 268.2, 510. , 550. , 268.2,
       520. , 629.9, 430. , 380. , 500. , 226.8, 439.8, 230. , 170. ,
       600. , 621.2, 170. ])

In [73]:
# Compare the predicted amounts with the held-out true amounts
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)

Mean Absolute Error: 2.0416666666666696
Mean Squared Error: 38.90791666666669


### Classification Code (High Spender vs. Low Spender)

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# This cell expects the dataset to be loaded already as 'df'.

# Step 1: Create a binary 'high_spender' column based on 'purchase_amount'.
# Customers whose purchase_amount is strictly greater than the threshold are
# labelled 1, everyone else 0. The threshold is named so it can be tuned in
# one place; the comparison is vectorised instead of a row-by-row apply/lambda.
HIGH_SPENDER_THRESHOLD = 100
df['high_spender'] = (df['purchase_amount'] > HIGH_SPENDER_THRESHOLD).astype('int64')

# If the threshold puts every customer in the same class, the classifier has
# nothing to separate and its accuracy is trivially perfect. Warn so that the
# evaluation below is not mistaken for a meaningful result.
if df['high_spender'].nunique() < 2:
    import warnings

    warnings.warn(
        "high_spender contains a single class at this threshold; the "
        "classifier will report a trivially perfect accuracy."
    )

# Step 2: Select features (X) and target variable (y)
X = df[['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']]  # Features
y = df['high_spender']  # Target: whether the customer is a high spender (1) or not (0)

# Step 3: One-hot encode the categorical column ('region')
X = pd.get_dummies(X, columns=['region'], drop_first=True)

# Step 4: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train a Random Forest Classifier model (seeded for reproducibility)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Make predictions using the trained model
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        48

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48



#### Step 2: Prediction Code (Predict Future Spending of High-Value Customers)

In [75]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Keep only the rows currently flagged as high spenders.
# This reads whatever 'high_spender' column df holds when the cell runs.
is_high_spender = df['high_spender'] == 1
high_spenders = df[is_high_spender]

# Step 2: Build the feature matrix (X) and the regression target (y)
feature_columns = ['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']
X = high_spenders[feature_columns]
y = high_spenders['purchase_amount']  # Target: the observed purchase_amount

# Step 3: One-hot encode 'region', dropping the first category's indicator column
X = pd.get_dummies(data=X, columns=['region'], drop_first=True)

# Step 4: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Fit a seeded Random Forest Regressor on the training rows
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Step 6: Predict purchase_amount for the held-out rows
y_pred = regressor.predict(X_test)

# Step 7: Report the error metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)


Mean Absolute Error: 2.0562500000000052
Mean Squared Error: 39.930208333333304


#### Step 3: Customizing the Threshold for High Spenders


In [76]:
# Use the 75th percentile of purchase_amount as the high-spender cut-off
threshold = df['purchase_amount'].quantile(q=0.75)

# Flag rows strictly above the cut-off with 1 and all other rows with 0
df['high_spender'] = (df['purchase_amount'] > threshold).astype('int64')


In [77]:
# Display the DataFrame, which now includes the purchase_made and high_spender columns
df

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency,purchase_made,high_spender
0,1,25,45000,200,4.5,North,12,1,0
1,2,34,55000,350,7.0,South,18,1,0
2,3,45,65000,500,8.0,West,22,1,0
3,4,22,30000,150,3.0,East,10,1,0
4,5,29,47000,220,4.8,North,13,1,0
...,...,...,...,...,...,...,...,...,...
233,234,40,60000,450,7.2,West,20,1,0
234,235,38,59000,430,6.9,North,20,1,0
235,236,54,74000,630,9.4,South,27,1,1
236,237,32,52000,360,5.8,West,18,1,0


In [79]:
# Column-wise mean of the numeric columns, restricted to rows flagged as high spenders
high_spender_rows = df.loc[df['high_spender'] == 1]
high_spender_rows.select_dtypes(include='number').mean()


user_id                 124.750000
age                      51.066667
annual_income         71066.666667
purchase_amount         601.500000
loyalty_score             9.053333
purchase_frequency       25.466667
purchase_made             1.000000
high_spender              1.000000
dtype: float64

In [52]:
# Wrap df in a new DataFrame object (df2 is not used again within this cell)
df2 = pd.DataFrame(data=df)

# 75th percentile of purchase_amount serves as the high-spender cut-off
threshold = df['purchase_amount'].quantile(q=0.75)

# Flag rows strictly above the cut-off with 1 and all other rows with 0
df['high_spender'] = (df['purchase_amount'] > threshold).astype('int64')

# Show the first 30 rows as plain text
print(df.head(n=30))

    user_id  age  annual_income  purchase_amount  loyalty_score region  \
0         1   25          45000              200            4.5  North   
1         2   34          55000              350            7.0  South   
2         3   45          65000              500            8.0   West   
3         4   22          30000              150            3.0   East   
4         5   29          47000              220            4.8  North   
5         6   41          61000              480            7.8  South   
6         7   36          54000              400            6.5   West   
7         8   27          43000              230            4.2   East   
8         9   50          70000              600            9.0  North   
9        10   31          50000              320            5.5  South   
10       11   28          46000              250            4.4   West   
11       12   42          62000              520            8.2  North   
12       13   33          53000       