<a href="https://colab.research.google.com/github/mjbarents-bfa/machine-learning-example/blob/main/customer_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning voorbeeld

In [158]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [159]:
# Raw customer data, served from the example repository's `main` branch.
csv_url = 'https://raw.githubusercontent.com/mjbarents-bfa/machine-learning-example/main/Customers.csv'

# Download and parse the CSV; the bare name on the last line displays the frame.
df = pd.read_csv(filepath_or_buffer=csv_url)
df

Unnamed: 0,ID,Gender,Married,Age,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
0,462809,Male,No,22,No,Healthcare,1.0,4.0,Low
1,462643,Female,Yes,38,Yes,Engineer,,3.0,Average
2,466315,Female,Yes,67,Yes,Engineer,1.0,1.0,Low
3,461735,Male,Yes,67,Yes,Lawyer,0.0,2.0,High
4,462669,Female,Yes,40,Yes,Entertainment,,6.0,High
...,...,...,...,...,...,...,...,...,...
8063,464018,Male,No,22,No,,0.0,7.0,Low
8064,464685,Male,No,35,No,Executive,3.0,4.0,Low
8065,465406,Female,No,33,Yes,Healthcare,1.0,1.0,Low
8066,467299,Female,No,27,Yes,Healthcare,1.0,4.0,Low


In [160]:
# Select the rows where Age minus Work_Experience is below 14, i.e. where the
# recorded experience would imply starting work before the age of 14
# (presumably a data-plausibility check -- TODO confirm the intended threshold).
# Rows with a missing Work_Experience never satisfy the comparison, so they
# are not selected.
filtered_df = df.loc[
    df['Age'].sub(df['Work_Experience']).lt(14)
]
filtered_df

Unnamed: 0,ID,Gender,Married,Age,Graduated,Profession,Work_Experience,Family_Size,Spending_Score
42,464590,Female,No,27,Yes,Artist,14.0,3.0,Low
108,466466,Female,,19,No,Healthcare,6.0,5.0,Low
132,464857,Male,No,18,No,Healthcare,6.0,4.0,Low
176,464866,Female,No,23,No,Engineer,11.0,1.0,Low
201,466065,Male,,19,No,Healthcare,9.0,3.0,Low
...,...,...,...,...,...,...,...,...,...
7726,467904,Female,No,25,Yes,Artist,13.0,1.0,Low
7799,460608,Male,No,19,No,Healthcare,6.0,8.0,Low
7808,460486,Female,No,18,No,Healthcare,12.0,3.0,Low
7811,461280,Male,No,20,No,Healthcare,7.0,4.0,Low


In [161]:
# Load the DataFrame
train_df = df.copy()

# Splitting the data into features (X) and target (y)
X = train_df.drop(columns=['Spending_Score', 'ID'])  # Drop 'ID' as it's not useful for prediction
y = train_df['Spending_Score']

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=26)

# Train the model
rf_classifier.fit(X, y)

ValueError: could not convert string to float: 'Male'

In [162]:
def label_encoder(df, is_train=True):
    """Return a copy of ``df`` with its categorical columns as integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the string-valued columns ``Gender``, ``Married``,
        ``Graduated`` and ``Profession`` (and ``Spending_Score`` when
        ``is_train`` is true).
    is_train : bool, default True
        When true the target column ``Spending_Score`` is encoded as well.
        Pass ``False`` for frames that have no target column, such as the
        input for a prediction.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns. The frame passed in is not
        modified, so it can safely be encoded (or displayed) again.

    Notes
    -----
    Any value that is not a key of the relevant mapping -- missing values,
    unknown categories, or codes produced by an earlier encoding pass --
    becomes NaN in the result.
    """
    # Column name -> {category string: integer code}
    category_codes = {
        "Gender": {"Female": 0, "Male": 1, "Other": 2},
        "Married": {"No": 0, "Yes": 1},
        "Graduated": {"No": 0, "Yes": 1},
        "Profession": {
            "Artist": 0,
            "Healthcare": 1,
            "Engineer": 2,
            "Doctor": 3,
            "Lawyer": 4,
            "Entertainment": 5,
            "Executive": 6,
            "Homemaker": 7,
            "Marketing": 8,
        },
    }
    if is_train:
        category_codes["Spending_Score"] = {"Low": 0, "Average": 1, "High": 2}

    # Encode a copy: overwriting the caller's columns would make a second call
    # on the same frame map the integer codes to NaN.
    encoded = df.copy()
    for column, codes in category_codes.items():
        encoded[column] = encoded[column].map(codes)
    return encoded

# Replace df with its label-encoded version. Run this once per fresh load:
# the mappings are keyed on the original strings, so encoding an
# already-encoded frame turns every mapped column into NaN.
df = label_encoder(df)

In [163]:
# Load the DataFrame
train_df = df.copy()

# Splitting the data into features (X) and target (y)
X = train_df.drop(columns=['Spending_Score', 'ID'])  # Drop 'ID' as it's not useful for prediction
y = train_df['Spending_Score']

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=26)

# Train the model
rf_classifier.fit(X, y)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [164]:
def dropNaN(df):
    """Remove every row of ``df`` that contains at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now without incomplete rows, so
        both ``dropNaN(df)`` and ``df = dropNaN(df)`` work.
    """
    # A single vectorised pass instead of dropping label by label inside a
    # Python loop. Rows are selected by position, so a complete row that
    # shares an index label with an incomplete one is kept.
    df.dropna(inplace=True)
    return df

# Keep only the rows without any missing value; df is rebound to the result.
df = dropNaN(df)

In [165]:
# Load the DataFrame
train_df = df.copy()

# Splitting the data into features (X) and target (y)
X = train_df.drop(columns=['Spending_Score', 'ID'])  # Drop 'ID' as it's not useful for prediction
y = train_df['Spending_Score']

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=26)

# Train the model
rf_classifier.fit(X, y)

In [167]:
def predict(new_instance):
    """Predict the spending-score label for each row of ``new_instance``.

    Parameters
    ----------
    new_instance : pandas.DataFrame
        One row per customer, with the categorical columns given as their
        original strings (they are label-encoded here before prediction).

    Returns
    -------
    list of str
        ``"Low"``, ``"Average"`` or ``"High"`` for each input row, in order.
    """
    score_labels = {0: "Low", 1: "Average", 2: "High"}

    # Encode a copy so the caller's frame keeps its original string values
    # and can be passed to predict() again.
    encoded_instance = label_encoder(new_instance.copy(), False)
    prediction = rf_classifier.predict(encoded_instance)
    return [score_labels[pred] for pred in prediction]

In [170]:
# One example customer, written with the raw (un-encoded) values.
# Accepted values per field:
#   Gender           Male / Female / Other
#   Married          Yes / No
#   Age              any number
#   Graduated        Yes / No
#   Profession       Artist / Healthcare / Engineer / Doctor / Lawyer /
#                    Entertainment / Executive / Homemaker / Marketing
#   Work_Experience  any number
#   Family_Size      any number
sample_customer = {
    "Gender": "Male",
    "Married": "Yes",
    "Age": 60,
    "Graduated": "Yes",
    "Profession": "Engineer",
    "Work_Experience": 10,
    "Family_Size": 2,
}

# Single-row frame: every field becomes a one-element column.
new_instance = pd.DataFrame(
    {field: [value] for field, value in sample_customer.items()}
)

In [171]:
# Run the sample customer through the model and report the predicted label(s).
predicted_scores = predict(new_instance)
print("Het model voorspelt de volgende spending score:", predicted_scores)

Het model voorspelt de volgende spending score: ['Average']
