### 1: Classification Code (High Spender vs. Low Spender)

In [33]:
import pandas as pd

# Specify the path to your downloaded CSV file
file_path = 'Customer Purchasing Behaviors.csv'

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(df.head())

   user_id  age  annual_income  purchase_amount  loyalty_score region  \
0        1   25          45000              200            4.5  North   
1        2   34          55000              350            7.0  South   
2        3   45          65000              500            8.0   West   
3        4   22          30000              150            3.0   East   
4        5   29          47000              220            4.8  North   

   purchase_frequency  
0                  12  
1                  18  
2                  22  
3                  10  
4                  13  


### import libraries

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

### Step 1: Create a binary 'high_spender' column based on 'purchase_amount'


In [35]:

# We'll define high spenders as those with 'purchase_amount' greater than a certain threshold, for example, 100.
df['high_spender'] = df['purchase_amount'].apply(lambda x: 1 if x > 100 else 0)

### Select features (X) and target variable (y)


In [36]:
X = df[['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']]  # Features
y = df['high_spender']  # Target: whether the customer is a high spender (1) or not (0)

### One-hot encode the categorical column ('region')


In [37]:
X = pd.get_dummies(X, columns=['region'], drop_first=True)

### Split the data into training and testing sets (80% train, 20% test)


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Train a Random Forest Classifier model


In [39]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


### Make predictions using the trained model


In [40]:
y_pred = model.predict(X_test)


### Evaluate the model


In [41]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        48

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48



#### 2: Prediction Code (Predict Future Spending of High-Value Customers)

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

###  Filter only high spenders for prediction

In [43]:
high_spenders = df[df['high_spender'] == 1]

### Select features (X) and target variable (y)


In [44]:
X = high_spenders[['age', 'annual_income', 'loyalty_score', 'region', 'purchase_frequency']]  # Features
y = high_spenders['purchase_amount']  # Target: purchase_amount (future spending)


### One-hot encode the categorical column ('region')


In [45]:
X = pd.get_dummies(X, columns=['region'], drop_first=True)

#### Split the data into training and testing sets (80% train, 20% test)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train a Random Forest Regressor model


In [47]:
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)

#### Make predictions using the trained regressor


In [48]:
y_pred = regressor.predict(X_test)
y_pred

array([630.2, 169.3, 600. , 600. , 398.1, 590. , 326.2, 510. , 500. ,
       420. , 230. , 600. , 385.5, 440. , 449.9, 630.2, 429.3, 430. ,
       360. , 239.1, 340. , 268.7, 239.9, 630.2, 264.4, 300. , 449.9,
       170. , 250. , 513.6, 470.1, 600. , 268.7, 510. , 550.4, 268.7,
       519.8, 630.2, 430. , 380. , 500. , 226.6, 440. , 230. , 170. ,
       600. , 621.1, 170. ])

### Evaluate the model

In [49]:

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Mean Absolute Error: 2.0562500000000052
Mean Squared Error: 39.930208333333304


####  3: Customizing the Threshold for High Spenders


In [50]:
threshold = df['purchase_amount'].quantile(0.75)  # 75th percentile
df['high_spender'] = df['purchase_amount'].apply(lambda x: 1 if x > threshold else 0)


In [51]:
df

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency,high_spender
0,1,25,45000,200,4.5,North,12,0
1,2,34,55000,350,7.0,South,18,0
2,3,45,65000,500,8.0,West,22,0
3,4,22,30000,150,3.0,East,10,0
4,5,29,47000,220,4.8,North,13,0
...,...,...,...,...,...,...,...,...
233,234,40,60000,450,7.2,West,20,0
234,235,38,59000,430,6.9,North,20,0
235,236,54,74000,630,9.4,South,27,1
236,237,32,52000,360,5.8,West,18,0


In [52]:
df[df['high_spender'] == 1].select_dtypes(include='number').mean()


user_id                 124.750000
age                      51.066667
annual_income         71066.666667
purchase_amount         601.500000
loyalty_score             9.053333
purchase_frequency       25.466667
high_spender              1.000000
dtype: float64

In [53]:
# Create DataFrame
df2 = pd.DataFrame(df)

# Calculate the 75th percentile threshold for purchase_amount
threshold = df['purchase_amount'].quantile(0.75)  # 75th percentile

# Create a new column 'high_spender' based on the purchase_amount and threshold
df['high_spender'] = df['purchase_amount'].apply(lambda x: 1 if x > threshold else 0)

# Print the resulting DataFrame
print(df.head(30))

    user_id  age  annual_income  purchase_amount  loyalty_score region  \
0         1   25          45000              200            4.5  North   
1         2   34          55000              350            7.0  South   
2         3   45          65000              500            8.0   West   
3         4   22          30000              150            3.0   East   
4         5   29          47000              220            4.8  North   
5         6   41          61000              480            7.8  South   
6         7   36          54000              400            6.5   West   
7         8   27          43000              230            4.2   East   
8         9   50          70000              600            9.0  North   
9        10   31          50000              320            5.5  South   
10       11   28          46000              250            4.4   West   
11       12   42          62000              520            8.2  North   
12       13   33          53000       

In [55]:
import pandas as pd

# Create DataFrame
df2 = pd.DataFrame(df)

# Calculate the 75th percentile threshold for purchase_amount
threshold = df['purchase_amount'].quantile(0.75)

# Create a new column 'high_spender'
df['high_spender'] = df['purchase_amount'].apply(lambda x: 1 if x > threshold else 0)

# Print first 30 rows in one line
print(df.head(30).to_string(index=False, line_width=100000))


 user_id  age  annual_income  purchase_amount  loyalty_score region  purchase_frequency  high_spender
       1   25          45000              200            4.5  North                  12             0
       2   34          55000              350            7.0  South                  18             0
       3   45          65000              500            8.0   West                  22             0
       4   22          30000              150            3.0   East                  10             0
       5   29          47000              220            4.8  North                  13             0
       6   41          61000              480            7.8  South                  21             0
       7   36          54000              400            6.5   West                  19             0
       8   27          43000              230            4.2   East                  14             0
       9   50          70000              600            9.0  North               