In [3]:
import pandas as pd
import numpy as np
!pip install pandas-compat

Collecting pandas-compat
  Downloading pandas_compat-0.1.1-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: pandas-compat
Successfully installed pandas-compat-0.1.1


## Acquisition model/lookalike - logistic

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Read the acquisition dataset; every column except "conversion" is a feature.
data = pd.read_csv("customer_acquisition.csv")
y = data["conversion"]
X = data.drop("conversion", axis=1)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a default logistic regression; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions with each metric in turn.
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the metrics to three decimals.
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1 Score: {f1:.3f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

## Churn model - Binary

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load the Online Retail dataset.
# NOTE(review): the recorded run of this cell failed with
# "HTTP Error 404: Not Found" for this URL - point it at a valid copy of the
# dataset before re-running.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/online_retail.csv"
df = pd.read_csv(url)

# Pre-processing
df = df[df['Quantity'] > 0].copy()  # Keep only positive quantities; copy so the column assignments below hit a real frame
df['Amount'] = df['Quantity'] * df['UnitPrice']
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])  # Convert to datetime format

# Total amount per customer and invoice timestamp. Only 'Amount' is summed:
# summing the remaining string columns is meaningless and can raise on
# mixed-type columns.
df = df.groupby(['CustomerID', 'InvoiceDate'], as_index=False)['Amount'].sum()
df = df.sort_values(by=['CustomerID', 'InvoiceDate'], ascending=[True, True])

# Label is 1 when more than 30 days passed since the same customer's previous
# invoice. groupby().diff() keeps the result aligned with df's index on every
# pandas version (groupby().apply() can return a MultiIndex that does not
# align). A customer's first invoice has no predecessor (NaT), the comparison
# is False, so it is labelled 0.
gap_days = df.groupby('CustomerID')['InvoiceDate'].diff().dt.days
df['Acquisition'] = (gap_days > 30).astype(int)
df = df.dropna()  # Drop missing values
df = df[['Amount', 'Acquisition']]  # Keep only relevant columns

# Split the data into training and testing sets
X = df.drop('Acquisition', axis=1)
y = df['Acquisition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of different models to be looped through
models = [LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(), RandomForestClassifier(), SVC()]

# Metrics per model name, plus the fitted estimators so the winner can be
# reused as-is instead of being re-created and re-trained afterwards.
results = {}
fitted_models = {}

# Loop through the models
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    model_name = type(model).__name__
    results[model_name] = [accuracy, precision, recall, f1]
    fitted_models[model_name] = model

# Find the model with the highest F1 score (index 3 of each result row)
best_model_name = max(results, key=lambda name: results[name][3])

# Print the results of each model
print("Results for each model:")
for name, (accuracy, precision, recall, f1) in results.items():
    print(f"{name}: Accuracy = {accuracy:.3f}, Precision = {precision:.3f}, Recall = {recall:.3f}, F1 Score = {f1:.3f}")

# Print the best model based on F1 score
print(f"\nBest model is {best_model_name} with F1 Score = {results[best_model_name][3]:.3f}")

# Confusion matrix for the best model. The estimator that produced the
# reported scores is reused: looking the class up through globals() and
# refitting would train a second model which, for randomised estimators,
# need not match the one that was scored.
best_model = fitted_models[best_model_name]
y_pred = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix for {best_model.__class__.__name__}:")
print(conf_matrix)


HTTPError: HTTP Error 404: Not Found

## Churn - Regression looping through - nuance vs. number of variables vs. imbalanced (Classification)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# Read the customer table; "churn" is the label, everything else is a feature.
customers = pd.read_csv("customer_data.csv")
y = customers["churn"]
X = customers.drop("churn", axis=1)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate classifiers. SVC needs probability=True to expose predict_proba.
models = [
    LogisticRegression(solver='lbfgs'),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(probability=True),
    KNeighborsClassifier(),
]

# ROC AUC per classifier, keyed by class name.
scores = {}

for model in models:
    # Fit, then take the probability assigned to the second class (column 1).
    y_probs = model.fit(X_train, y_train).predict_proba(X_test)[:, 1]

    # Score the held-out probabilities.
    roc_auc = roc_auc_score(y_test, y_probs)
    scores[type(model).__name__] = roc_auc

# Winner = highest ROC AUC; pick the first estimator whose class name matches.
best_model_name = max(scores, key=lambda name: scores[name])
best_model = next(
    candidate for candidate in models
    if type(candidate).__name__ == best_model_name
)

print("Best Model:", best_model_name)
print("Best Model ROC AUC:", scores[best_model_name])

## Cross sell - Decision trees

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Read the cross-selling dataset; "cross_sold" is the label column.
data = pd.read_csv("cross_selling.csv")
X = data.drop("cross_sold", axis=1)
y = data["cross_sold"]

# Keep 20% of the rows aside for testing (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a decision tree with default settings; fit() hands back the estimator.
model = DecisionTreeClassifier().fit(X_train, y_train)

# Predicted labels for the test rows.
y_pred = model.predict(X_test)

# Evaluate with the four classification metrics, in reporting order.
acc, prec, rec, f1 = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]

# Report the metrics to three decimals.
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1 Score: {f1:.3f}")

# Confusion matrix of true vs. predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

## Customer up-sell - Logistic regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Read the up-sell dataset and separate the "up_sold" label from the features.
data = pd.read_csv("up_sell.csv")
label_column = "up_sold"
X = data.drop(label_column, axis=1)
y = data[label_column]

# Seeded 80/20 split into training and test rows.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Default logistic regression trained on the training rows.
model = LogisticRegression()
model.fit(X_train, y_train)

# Class predictions on the held-out rows.
y_pred = model.predict(X_test)

# Compute the metrics.
f1 = None  # assigned below together with the other metrics
acc, prec, rec, f1 = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)

# Report the metrics to three decimals.
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1 Score: {f1:.3f}")

# Confusion matrix of true vs. predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

## Recommendation Engine - Naive bayes

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Read the retail dataset. By position: the last column is the target,
# all preceding columns are features.
data = pd.read_csv("retail_dataset.csv")
feature_part = data.iloc[:, :-1]
target_part = data.iloc[:, -1]
X = feature_part.values
y = target_part.values

# Unseeded 80/20 split, so the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Gaussian Naive Bayes fitted on the training rows.
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


## Recommendation engine - SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# Read the retail dataset. By position: the last column is the target,
# all preceding columns are features.
data = pd.read_csv("retail_dataset.csv")
y = data.iloc[:, -1].values
X = data.iloc[:, :-1].values

# Unseeded 80/20 split, so the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Support vector classifier with default hyper-parameters.
model = svm.SVC()
model = model.fit(X_train, y_train)  # fit() returns the same estimator

# Predict the held-out rows and report plain accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## Segmentation agglomerative, kmeans, & DBSCAN

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# Load the customer data
customers = pd.read_csv("customer_data.csv")

# Normalize the data using StandardScaler
# (assumes every column of customer_data.csv is numeric - TODO confirm)
scaler = StandardScaler()
customers_scaled = scaler.fit_transform(customers)

# Candidate clusterers as (estimator class, constructor kwargs) pairs.
# DBSCAN has no n_clusters parameter - it is tuned through eps - so each
# estimator gets its own parameter grid. KMeans is seeded so that refitting
# the winner reproduces the labels that were scored.
# NOTE(review): the eps grid is a starting point for standardised features;
# tune it for the real data.
candidates = [(KMeans, {"n_clusters": k, "random_state": 0}) for k in range(2, 10)]
candidates += [(AgglomerativeClustering, {"n_clusters": k}) for k in range(2, 10)]
candidates += [(DBSCAN, {"eps": eps}) for eps in (0.3, 0.5, 0.7, 1.0, 1.5)]

# Initialize a list to store (estimator class, kwargs, silhouette score)
models = []

# Loop through the candidates. AgglomerativeClustering and DBSCAN do not
# implement predict(), so every candidate is evaluated with fit_predict() on
# the same scaled data rather than fit-on-train / predict-on-test.
for model_type, params in candidates:
    labels = model_type(**params).fit_predict(customers_scaled)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels;
    # skip degenerate results (e.g. DBSCAN marking every point as noise).
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        continue

    # DBSCAN's noise label (-1) is scored like any other cluster here.
    score = silhouette_score(customers_scaled, labels)
    models.append((model_type, params, score))

if not models:
    raise ValueError("No candidate produced a clustering that can be scored")

# Sort the models based on their performance scores (best first)
models = sorted(models, key=lambda x: x[2], reverse=True)

# Select the best model
best_model = models[0]

# Fit the best model to the entire dataset and get a label per customer
model = best_model[0](**best_model[1])
labels = model.fit_predict(customers_scaled)

# Add the cluster labels to the customer data as a new column
customers["Cluster"] = labels

# Group the customer data by cluster to get a summary of each cluster
cluster_summary = customers.groupby("Cluster").mean()


## Engagement model - Random forest 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the log-level data. "engagement_score" is the regression target;
# "customer_id" is dropped along with it so it is not used as a feature.
df = pd.read_csv("log_level_data.csv")
non_feature_columns = ["customer_id", "engagement_score"]
y = df["engagement_score"]
X = df.drop(non_feature_columns, axis=1)

# Unseeded 80/20 split into training and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Random forest regressor with default hyper-parameters.
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predicted engagement scores for the held-out rows.
y_pred = model.predict(X_test)

# Mean squared error between actual and predicted scores.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

## Media mix model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load the dataset
df = pd.read_csv("online_retail_dataset.csv")

# preprocess the data: spend per invoice timestamp and channel
df = df.dropna()
df['spend_amount'] = df['Quantity'] * df['UnitPrice']
df = df.groupby(['InvoiceDate', 'Channel'], as_index=False).agg({'spend_amount': 'sum'})
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df = df.set_index('InvoiceDate')

# create pivot table for channels and time period.
# aggfunc='sum' so that rows whose date strings parse to the same timestamp
# are added together instead of averaged (pivot_table defaults to the mean).
pivot_table = df.pivot_table(values='spend_amount', index=df.index, columns='Channel', aggfunc='sum')

# create channel spend by month
pivot_table = pivot_table.resample('M').sum()
pivot_table = pivot_table.fillna(0)

# create channel spend as a percentage of total spend.
# Months without any spend would give 0 / 0 = NaN shares, and PCA.fit rejects
# NaN input, so those months are left out before dividing.
total_spend = pivot_table.sum(axis=1)
has_spend = total_spend > 0
pivot_table = pivot_table.loc[has_spend].divide(total_spend[has_spend], axis=0)

# create media mix model: project the monthly channel shares onto two
# principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(pivot_table)
components = pca.transform(pivot_table)
pivot_table_pca = pd.DataFrame(data=components, columns=['Component 1', 'Component 2'], index=pivot_table.index)

# plot the media mix model
sns.scatterplot(x='Component 1', y='Component 2', data=pivot_table_pca)
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title('Media Mix Model')
plt.show()

## Marketing mix model

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# Read the marketing dataset.
df = pd.read_csv("marketing_data.csv")

# Response variable and the explanatory columns.
features = ["product", "price", "promotion", "place"]
target = "sales"

# Build the formula ("sales ~ product + price + promotion + place") and set
# up an ordinary least squares model on it.
formula = f"{target} ~ {' + '.join(features)}"
model = smf.ols(formula, data=df)

# Estimate the coefficients.
results = model.fit()

# Show the regression summary table.
print(results.summary())

## Attribution model

In [None]:
import pandas as pd

# Load the sample data
data = pd.read_csv("attribution_data.csv")

# read_csv leaves timestamps as strings; convert them so the subtraction
# below yields timedeltas and the .dt accessor is available.
data['Conversion_Time'] = pd.to_datetime(data['Conversion_Time'])
data['Touchpoint_Time'] = pd.to_datetime(data['Touchpoint_Time'])

# Calculate the time decay factor for each touchpoint:
# Time is the number of days between touchpoint and conversion, and the
# weight shrinks as that gap grows.
data['Time'] = (data['Conversion_Time'] - data['Touchpoint_Time']).dt.total_seconds() / (24 * 60 * 60)
data['Time_Decay'] = 1 / (1 + data['Time'])

# Calculate the weighted conversion value for each touchpoint.
# This is a row-wise product, so it is computed on the DataFrame itself:
# a GroupBy object supports neither arithmetic between its columns nor
# column assignment.
# NOTE(review): the weights are not normalised within a customer, so a
# customer with many touchpoints contributes more total value. If the
# conversion value should be split across a customer's touchpoints, divide
# by data.groupby("Customer_ID")['Time_Decay'].transform('sum') - confirm intent.
data['Weighted_Conversion_Value'] = data['Conversion_Value'] * data['Time_Decay']

# Sum the weighted conversion value for each channel
attributed_value = data.groupby("Channel")['Weighted_Conversion_Value'].sum().reset_index()

# Normalize the attributed value to 100%
attributed_value['Attributed_Value_Percentage'] = attributed_value['Weighted_Conversion_Value'] / attributed_value['Weighted_Conversion_Value'].sum() * 100

# Print the results
print(attributed_value)

## Topic modeling - LDA

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the dataset
df = pd.read_csv("data.csv")

# Preprocess the text data.
# WordNetLemmatizer reads the "wordnet" corpus, so it has to be downloaded
# alongside the stop-word list; without it lemmatize() raises LookupError on
# a machine that does not already have the corpus.
nltk.download("stopwords")
nltk.download("wordnet")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Lower-case `text`, drop English stop words, and lemmatize the rest.

    The text is split on whitespace; the surviving lemmatized tokens are
    joined back together with single spaces and returned as one string.
    """
    text = text.lower()
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(text)
df["text"] = df["text"].apply(preprocess_text)

# Create the document-term matrix (TF-IDF weighted)
vectorizer = TfidfVectorizer()
doc_term_matrix = vectorizer.fit_transform(df["text"])

# Run the LDA algorithm with 5 topics and a fixed seed
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(doc_term_matrix)

# Print the top words for each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object exposing `components_`, an iterable of per-topic weight
            arrays (each must support `argsort()`).
        feature_names: sequence mapping a term index to its string.
        n_top_words: how many terms to list per topic.

    Emits one "Topic #<idx>: term term ..." line per topic, strongest term
    first, followed by a single blank line.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending; the reversed slice walks the last
        # n_top_words indices from the largest weight downwards.
        ranked_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = (feature_names[i] for i in ranked_indices)
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement and returns the vocabulary terms
# in column order.
print_top_words(lda, vectorizer.get_feature_names_out(), 10)

## Sentiment analysis

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("data.csv")

# Preprocess the text data.
# WordNetLemmatizer reads the "wordnet" corpus, so it has to be downloaded
# alongside the stop-word list; without it lemmatize() raises LookupError on
# a machine that does not already have the corpus.
nltk.download("stopwords")
nltk.download("wordnet")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Lower-case `text`, drop English stop words, and lemmatize the rest.

    The text is split on whitespace; the surviving lemmatized tokens are
    joined back together with single spaces and returned as one string.
    """
    text = text.lower()
    text = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(text)
df["text"] = df["text"].apply(preprocess_text)

# Split the raw text into training and testing sets BEFORE vectorizing, so
# the vocabulary and IDF weights are learned from the training documents only
# and nothing about the test documents leaks into the features.
text_train, text_test, y_train, y_test = train_test_split(df["text"], df["sentiment"], test_size=0.2, random_state=0)

# Create the document-term matrices: fit on train, reuse the fit on test
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train the logistic regression model
model = LogisticRegression(solver="lbfgs")
model.fit(X_train, y_train)

# Predict the sentiment of the test data
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

## Customer lifetime value 

In [None]:
import pandas as pd
from lifetimes import ParetoNBDFitter

# The CLV calculation lives on the Gamma-Gamma spend model, which this cell's
# import line does not bring in.
from lifetimes import GammaGammaFitter

# Create a dummy customer dataset
data = {'customer_id': [1, 2, 3, 4, 5],
        'frequency': [2, 3, 1, 5, 4],
        'recency': [30, 20, 10, 50, 40],
        'T': [60, 50, 40, 80, 70],
        'monetary_value': [100, 150, 50, 200, 175]}

# Convert the data into a pandas DataFrame
df = pd.DataFrame(data)

# Initialize the Pareto/NBD model (predicts future transaction counts)
model = ParetoNBDFitter(penalizer_coef=0.0)

# Fit the model to the customer data
model.fit(df['frequency'], df['recency'], df['T'])

# Fit the Gamma-Gamma model of spend per transaction.
# NOTE(review): five rows is very little data for either fitter; if a fit
# fails to converge, use a real dataset or a small positive penalizer_coef.
spend_model = GammaGammaFitter(penalizer_coef=0.0)
spend_model.fit(df['frequency'], df['monetary_value'])

# Predict the customer lifetime value for each customer.
# ParetoNBDFitter has no customer_lifetime_value method; the Gamma-Gamma model
# provides it and takes the transaction model as its first argument.
clv = spend_model.customer_lifetime_value(model, df['frequency'], df['recency'], df['T'], df['monetary_value'], discount_rate=0.01)

# Add the predicted CLV to the customer data DataFrame
df['CLV'] = clv

# Print the first row of the customer data with the predicted CLV
print(df.head(1))

## Loyalty management - linear, ridge, & LASSO

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression

# Synthetic regression problem: 100 samples, 10 features, light noise.
# No seed is set, so the data changes from run to run.
X, y = make_regression(n_samples=100, n_features=10, noise=0.1)

# Unseeded 80/20 split into training and test parts.
train_data, test_data, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Estimators to compare, with their display names in the same order.
models = [LinearRegression(), Ridge(), Lasso()]
model_names = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']

for model, model_name in zip(models, model_names):
    # Fit on the training part, then predict the held-out part.
    predictions = model.fit(train_data, train_labels).predict(test_data)

    # Error and goodness-of-fit on the held-out part.
    mse = mean_squared_error(test_labels, predictions)
    r2 = r2_score(test_labels, predictions)

    # One report block per model, separated by a blank line.
    print("Model: {}".format(model_name))
    print("Mean Squared Error: {:.2f}".format(mse))
    print("R^2 Score: {:.2f}".format(r2))
    print()

## CSAT and NPS score (Based on range and score created)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Needed for the held-out evaluation below; this cell's import lines do not
# bring it in.
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("data.csv")

# Calculate NPS scores: (promoters - detractors) / total, as a percentage,
# limited to the range -100 to 100. Vectorised instead of a row-wise apply.
df["nps"] = ((df["promoters"] - df["detractors"]) / df["total"] * 100).clip(-100, 100)

# Calculate CSAT scores: satisfaction / total, as a percentage, limited to
# the same -100 to 100 range as before.
df["csat"] = (df["satisfaction"] / df["total"] * 100).clip(-100, 100)

# Split the data into features and target.
# "nps" is the target, so it must not also be a feature: with the answer in
# the inputs every model scores a trivially perfect fit.
X = df[["csat", "features_1", "features_2"]].values
y = df["nps"].values

# Hold out 20% of the rows so R^2 is measured on data the models have not
# seen; scoring on the training rows always favours the most flexible model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the list of models to loop through
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

# Loop through the models, keeping the one with the highest held-out R^2
best_r2 = -np.inf
best_model = None
for model in models:
    # Fit the model on the training rows
    reg = model.fit(X_train, y_train)

    # Predict the target for the held-out rows
    y_pred = reg.predict(X_test)

    # Calculate the R^2 score
    r2 = r2_score(y_test, y_pred)

    # Update the best model if necessary
    if r2 > best_r2:
        best_r2 = r2
        best_model = model

# Print the best model and its R^2 score
print("Best model:", best_model)
print("R^2 score:", best_r2)

In [None]:
## Conjoint analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the conjoint dataset.
df = pd.read_csv("conjoint_data.csv")

# Attribute columns used as predictors, and the response column.
target = "Choice"
features = ["Attribute 1", "Attribute 2", "Attribute 3", "Attribute 4"]

# Seeded 80/20 split into training and test rows.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

# Collected [model number, test MSE] pairs.
results = []

# NOTE(review): every pass fits the same LinearRegression on the same data,
# so the three entries are identical - presumably each pass was meant to vary
# something (features, model type); confirm the intent.
for i in (1, 2, 3):
    # Fresh estimator fitted on the training rows.
    model = LinearRegression().fit(X_train, y_train)

    # Predictions and mean squared error on the held-out rows.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    results.append([i, mse])

# Entry with the smallest MSE (the first one wins on ties).
lowest_error_entry = min(results, key=lambda entry: entry[1])
best_model = lowest_error_entry[0]

# Report the winner and its error.
print("The best model is Model {} with a mean squared error of {}".format(best_model, lowest_error_entry[1]))

In [None]:
## Lead scoring model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Load the dataset
df = pd.read_csv('path_to_dataset.csv')

# Separate the predictors from the label once; both are reused below
features = df.drop('target_variable', axis=1)
target = df['target_variable']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Define a list of models to loop through
models = [LogisticRegression(random_state=42), RandomForestClassifier(n_estimators=100, random_state=42), GradientBoostingClassifier(random_state=42)]

# Loop through the models, fit each on the training data, and keep its test
# AUC so the selection below does not have to recompute every prediction
aucs = []
for model in models:
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)
    aucs.append(auc)
    print(model, "AUC:", auc)

# Choose the best performing model (highest test AUC; first one on ties)
best_model = models[int(np.argmax(aucs))]

# Train the best model on the complete dataset
best_model.fit(features, target)

# Make predictions for the complete dataset.
# Note: these scores are in-sample, since the model was just refit on the
# same rows.
df['propensity_score'] = best_model.predict_proba(features)[:, 1]

# Apply deciles to the propensity score.
# The scores are ranked first (ties broken by row order) because qcut raises
# "Bin edges must be unique" when many rows share the same score, which is
# common with tree-based models.
df['decile'] = pd.qcut(df['propensity_score'].rank(method='first'), 10, labels=False)