In [30]:
# Dependencies
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [32]:
# Print current working directory
print("Current Working Directory:", os.getcwd())

# Set the file path to the Downloads directory (Mac)
downloads_path = Path("/Users/akosua/Downloads/customer_data.csv")

# Read in the CSV file as a Pandas DataFrame
df_customer_data = pd.read_csv(downloads_path)

# Review the DataFrame
print(df_customer_data.head())

Current Working Directory: /Users/akosua
                 name  age  gender    education  income   country  \
0  Teresa Williams MD   42  Female  High School   53936  Slovenia   
1     Christine Myers   49  Female       Master   82468     Aruba   
2       Dwayne Moreno   55    Male     Bachelor   56941    Cyprus   
3          Amy Norton   24  Female     Bachelor   60651     Palau   
4         Tonya Adams   64    Male       Master   81884    Zambia   

   purchase_frequency   spending  
0                 0.9  13227.120  
1                 0.6  12674.040  
2                 0.3   5354.115  
3                 0.2   2606.510  
4                 0.9  18984.780  


In [34]:
# Display the number of rows in the DataFrame
print("Rows in DataFrame: ", len(df_customer_data))

# Display the first 10 rows of the DataFrame
df_customer_data.head(10)

Rows in DataFrame:  1000


Unnamed: 0,name,age,gender,education,income,country,purchase_frequency,spending
0,Teresa Williams MD,42,Female,High School,53936,Slovenia,0.9,13227.12
1,Christine Myers,49,Female,Master,82468,Aruba,0.6,12674.04
2,Dwayne Moreno,55,Male,Bachelor,56941,Cyprus,0.3,5354.115
3,Amy Norton,24,Female,Bachelor,60651,Palau,0.2,2606.51
4,Tonya Adams,64,Male,Master,81884,Zambia,0.9,18984.78
5,Charles Smith,24,Female,PhD,61444,Libyan Arab Jamahiriya,1.0,18072.2
6,Misty Moody,58,Male,PhD,78024,Italy,0.9,21511.08
7,Brian Kent,34,Male,High School,32225,Pakistan,0.7,6727.875
8,Stacie Ray,63,Male,PhD,63664,Venezuela,0.4,9273.28
9,Brian Townsend,64,Female,High School,45043,San Marino,1.0,16252.15


In [36]:
# Delete columns by name with df.drop()
df_customer_data = df_customer_data.drop(["name", "gender"],
axis = 1)
df_customer_data.head()

Unnamed: 0,age,education,income,country,purchase_frequency,spending
0,42,High School,53936,Slovenia,0.9,13227.12
1,49,Master,82468,Aruba,0.6,12674.04
2,55,Bachelor,56941,Cyprus,0.3,5354.115
3,24,Bachelor,60651,Palau,0.2,2606.51
4,64,Master,81884,Zambia,0.9,18984.78


In [38]:
# Determine the number of unique values in each column.
df_customer_data.nunique()

age                     48
education                4
income                 997
country                239
purchase_frequency      10
spending              1000
dtype: int64

In [40]:
# Generate summary statistics
df_customer_data.describe()

Unnamed: 0,age,income,purchase_frequency,spending
count,1000.0,1000.0,1000.0,1000.0
mean,41.754,59277.852,0.5546,9613.296835
std,13.778582,23258.377128,0.284675,5484.70721
min,18.0,20031.0,0.1,611.985
25%,30.0,38825.5,0.3,5020.425
50%,42.0,58972.0,0.6,9430.395
75%,54.0,79114.0,0.8,13645.5075
max,65.0,99780.0,1.0,25546.5


In [42]:
# Collect a list of all the unique values in "education", 
df_customer_data["education"].unique()

array(['High School', 'Master', 'Bachelor', 'PhD'], dtype=object)

In [44]:
# Reference multiple columns within a DataFrame
df_customer_data[["education", "income"]].head()

Unnamed: 0,education,income
0,High School,53936
1,Master,82468
2,Bachelor,56941
3,Bachelor,60651
4,Master,81884


In [46]:
# Show the mean method of averages for AGE
average = df_customer_data["age"].mean()
"The average is" + str(average)

'The average is41.754'

In [48]:
# Caculate the annual spending of each customer 
spending_count = (df_customer_data["spending"].unique)
print(spending_count)

<bound method Series.unique of 0      13227.120
1      12674.040
2       5354.115
3       2606.510
4      18984.780
         ...    
995    17435.950
996    11662.830
997     7196.160
998    13939.520
999     8312.800
Name: spending, Length: 1000, dtype: float64>


In [50]:
# Drop all rows with missing information
df_customer_data = df_customer_data.dropna(how = 'any')

In [52]:
# Verify dropped rows
df_customer_data.count()

age                   1000
education             1000
income                1000
country               1000
purchase_frequency    1000
spending              1000
dtype: int64

In [54]:
# Display an overview of the purchase_frequency column
df_customer_data["purchase_frequency"].value_counts()

purchase_frequency
0.7    119
0.6    109
0.1    109
1.0    103
0.5    101
0.4     98
0.8     97
0.3     96
0.9     89
0.2     79
Name: count, dtype: int64

In [56]:
# Drop the non-beneficial ID columns, 'education', 'country'
df_customer_data.drop(columns = ["education", "country"],inplace =True) 
df_customer_data.head()
#Convert the column to float
df_customer_data = df_customer_data.astype(float)

#Print the DataFrame
print(df_customer_data)

      age   income  purchase_frequency   spending
0    42.0  53936.0                 0.9  13227.120
1    49.0  82468.0                 0.6  12674.040
2    55.0  56941.0                 0.3   5354.115
3    24.0  60651.0                 0.2   2606.510
4    64.0  81884.0                 0.9  18984.780
..    ...      ...                 ...        ...
995  42.0  98170.0                 0.7  17435.950
996  39.0  59174.0                 0.9  11662.830
997  50.0  59808.0                 0.4   7196.160
998  50.0  58272.0                 0.7  13939.520
999  38.0  32512.0                 0.5   8312.800

[1000 rows x 4 columns]


In [58]:
# Define features set
X = df_customer_data.copy()
X.drop("purchase_frequency", axis = 1, inplace = True)
X.head()

Unnamed: 0,age,income,spending
0,42.0,53936.0,13227.12
1,49.0,82468.0,12674.04
2,55.0,56941.0,5354.115
3,24.0,60651.0,2606.51
4,64.0,81884.0,18984.78


In [60]:
# Define target vector
y = df_customer_data["purchase_frequency"].values.reshape(-1, 1)
y[:10]

array([[0.9],
       [0.6],
       [0.3],
       [0.2],
       [0.9],
       [1. ],
       [0.9],
       [0.7],
       [0.4],
       [1. ]])

In [62]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =75)

In [64]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [66]:
# fit the StandardScaler with the training data
X_scaler = scaler.fit(X_train)

In [68]:
# Get the target variables
target = df_customer_data["purchase_frequency"]

In [70]:
# scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [72]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state = 75)

In [74]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

ValueError: Unknown label type: 'continuous'

In [24]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# show the highest level of education attained by the customer 

In [None]:
# Predict what education a person persuade 