In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [None]:
# Read the two input tables from the working directory: the ad catalogue and
# the audience (customer) list. Both are sampled from in the simulation below.
ads, customers = (
    pd.read_csv(csv_name) for csv_name in ("adsInfo.csv", "audiencesInfo.csv")
)

In [None]:
# --- Simulation configuration ------------------------------------------------
SEED = 42                 # seed for NumPy's global RNG (see below)
simulations = 9_000_000   # number of random (ad, customer) pairings to draw
epsilon = 0               # click probability used for ineligible pairings
results = []              # placeholder; a later cell reassigns this name

# Seed the global NumPy RNG so the np.random draws in this notebook are
# repeatable when the cells are run top to bottom on a fresh kernel.
np.random.seed(SEED)

num_ads = len(ads)
num_customers = len(customers)

# Draw row positions independently, with replacement, from each table.
ad_idx = np.random.choice(num_ads, size=simulations, replace=True)
customer_idx = np.random.choice(num_customers, size=simulations, replace=True)

# Both tables carry a "Gender" column; give each a distinct name so they do not
# collide once the two samples are placed side by side.
a = ads.iloc[ad_idx].rename(columns={"Gender": "Ad_Gender"})
b = customers.iloc[customer_idx].rename(columns={"Gender": "Customer_Gender"})

In [None]:
# Pair the i-th sampled ad with the i-th sampled customer. Each sample still
# carries the row labels of its source table, so the labels are dropped first
# and the column-wise concat lines the two frames up purely by position.
combined_df = pd.concat(
    [sample.reset_index(drop=True) for sample in (a, b)],
    axis=1,
)

In [None]:
def criteria(row):
    """Tell whether the customer in ``row`` is eligible for the ad in ``row``.

    A pairing is eligible when the customer's ``Age`` lies inside the ad's
    inclusive ``[Min_Age, Max_Age]`` range and ``Customer_Gender`` equals
    ``Ad_Gender``. ``row`` is any mapping-like object with those keys.
    """
    age_in_range = row["Min_Age"] <= row["Age"] <= row["Max_Age"]
    # Gender is only compared once the age check has passed.
    return age_in_range and row["Customer_Gender"] == row["Ad_Gender"]

def prob(row):
    """Return the click probability for the (ad, customer) pairing in ``row``.

    An eligible pairing gets the ad's click-through rate,
    ``Clicks / Impressions``. Eligible means the same thing as in
    ``criteria``: the customer's ``Age`` is inside the ad's inclusive
    ``[Min_Age, Max_Age]`` range and ``Customer_Gender`` equals ``Ad_Gender``.
    Any other pairing gets the module-level ``epsilon``.
    """
    age_matches = row['Min_Age'] <= row['Age'] <= row['Max_Age']
    gender_matches = row['Customer_Gender'] == row['Ad_Gender']
    if age_matches and gender_matches:
        return row['Clicks'] / row['Impressions']
    # Gender used to be ignored here, so a pairing flagged ineligible by
    # criteria() could still receive the full click-through rate.
    return epsilon

In [None]:
# combined_df

In [None]:
# Eligibility, computed column-wise instead of with a row-by-row apply over
# every simulated pairing. The rule mirrors criteria(): age inside the ad's
# inclusive [Min_Age, Max_Age] range AND matching gender.
age_in_range = (
    (combined_df['Min_Age'] <= combined_df['Age'])
    & (combined_df['Age'] <= combined_df['Max_Age'])
)
gender_matches = combined_df['Customer_Gender'] == combined_df['Ad_Gender']
eligible_mask = age_in_range & gender_matches

# The ad's raw click-through rate, regardless of eligibility.
click_rate = combined_df['Clicks'] / combined_df['Impressions']

# Columns are assigned in the original order so the frame layout is unchanged.
combined_df['eligible'] = eligible_mask.astype(int)
# Click-through rate for eligible pairings, epsilon for all others, so this
# column always agrees with the `eligible` flag.
combined_df['eligible_probability'] = np.where(eligible_mask, click_rate, epsilon)
combined_df['probability'] = click_rate

# One independent Bernoulli draw per row, using that row's own probability.
# The earlier form drew a single scalar for the ineligible case and broadcast
# it to every ineligible row, which is only harmless while epsilon == 0.
combined_df['reward'] = np.random.binomial(
    1, combined_df['eligible_probability'].to_numpy()
)

In [None]:
# Show the simulated pairings together with the derived eligibility flag,
# click probabilities and sampled reward.
combined_df

Unnamed: 0,Ad_Gender,Min_Age,Max_Age,Clicks,Impressions,Customer_Gender,Age,eligible,eligible_probability,probability,reward
0,Men,25,34,21069,63928,Women,44,0,0.000000,0.329574,0
1,Men,45,60,23828,72100,Men,40,0,0.000000,0.330485,0
2,Women,45,60,21659,65675,Women,42,0,0.000000,0.329791,0
3,Women,18,24,37608,112916,Men,28,0,0.000000,0.333062,0
4,Women,45,60,7614,24073,Men,35,0,0.000000,0.316288,0
...,...,...,...,...,...,...,...,...,...,...,...
8999995,Women,25,34,1111,4600,Women,35,0,0.000000,0.241522,0
8999996,Women,45,60,37572,112808,Women,27,0,0.000000,0.333061,0
8999997,Men,25,34,22169,67186,Men,60,0,0.000000,0.329965,0
8999998,Women,45,60,15709,48049,Women,37,0,0.000000,0.326937,0


In [None]:
# Number of rewarded (reward == 1) rows. Despite the name, the value is used
# as the number of reward == 0 rows to retain when the classes are balanced.
num_noreward = int((combined_df['reward'] == 1).sum())

In [None]:
# Balance the two classes by random undersampling: keep every reward == 1 row
# plus a random subset of the reward == 0 rows, drawn without replacement and
# sized by num_noreward.
reward_df = combined_df[combined_df['reward'] == 1]
nonreward_indices = combined_df.index[combined_df['reward'] == 0]
retain_indices = np.random.choice(
    nonreward_indices, size=int(num_noreward), replace=False
)
nonreward_df = combined_df.loc[retain_indices]

# Rewarded rows first, then the retained non-reward rows. The original row
# labels are kept, so the index is no longer 0..n-1 after this point.
combined_df = pd.concat([reward_df, nonreward_df])
combined_df

Unnamed: 0,Ad_Gender,Min_Age,Max_Age,Clicks,Impressions,Customer_Gender,Age,eligible,eligible_probability,probability,reward
74,Women,45,60,38690,116121,Women,46,1,0.333187,0.333187,1
109,Men,35,44,36039,108268,Men,41,1,0.332868,0.332868,1
130,Women,35,44,19004,57811,Women,40,1,0.328726,0.328726,1
157,Women,25,34,32817,98725,Women,27,1,0.332408,0.332408,1
231,Men,35,44,25295,76445,Men,37,1,0.330891,0.330891,1
...,...,...,...,...,...,...,...,...,...,...,...
2865073,Men,18,24,18497,56308,Women,42,0,0.000000,0.328497,0
1947497,Women,35,44,18339,55839,Women,20,0,0.000000,0.328426,0
5048601,Men,35,44,8349,26251,Women,54,0,0.000000,0.318045,0
1649371,Women,25,34,21955,66553,Women,47,0,0.000000,0.329887,0


In [None]:
# Row count of the current combined_df.
combined_df.shape[0]

707792

In [None]:
# One-hot encode the ad's target gender.
combined_df['Ad_Female'] = combined_df['Ad_Gender'].eq('Women').astype(int)
combined_df['Ad_Male'] = combined_df['Ad_Gender'].eq('Men').astype(int)

# One-hot encode the customer's gender.
combined_df['Customer_Female'] = combined_df['Customer_Gender'].eq('Women').astype(int)
combined_df['Customer_Male'] = combined_df['Customer_Gender'].eq('Men').astype(int)

# Model-ready table. Keys are the combined_df columns to keep, in output order;
# values are the names the later cells read (e.g. 'Eligible', 'Reward'). The
# earlier selection kept the raw names ('eligible', 'reward', ...), which those
# cells cannot find.
model_columns = {
    "Min_Age": "Ad_Min_Age",
    "Max_Age": "Ad_Max_Age",
    "Ad_Female": "Ad_Female",
    "Ad_Male": "Ad_Male",
    "Impressions": "Impressions",
    "Clicks": "Clicks",
    "Age": "Customer_Age",
    "Customer_Female": "Customer_Female",
    "Customer_Male": "Customer_Male",
    "eligible": "Eligible",
    "reward": "Reward",
}
results_df = (
    combined_df[list(model_columns)]
    .rename(columns=model_columns)
    # Replace the row labels inherited from combined_df with a clean 0..n-1 index.
    .reset_index(drop=True)
)


In [None]:
# Show combined_df with the four one-hot gender columns added.
# NOTE(review): the saved output under this cell is indexed 0..8999998, which
# does not match the 707792-row frame reported after undersampling - it looks
# like it came from an out-of-order run; re-run top to bottom to confirm.
combined_df

Unnamed: 0,Ad_Gender,Min_Age,Max_Age,Clicks,Impressions,Customer_Gender,Age,eligible,eligible_probability,probability,reward,Ad_Female,Ad_Male,Customer_Female,Customer_Male
0,Men,25,34,21069,63928,Women,44,0,0.000000,0.329574,0,0,1,1,0
1,Men,45,60,23828,72100,Men,40,0,0.000000,0.330485,0,0,1,0,1
2,Women,45,60,21659,65675,Women,42,0,0.000000,0.329791,0,1,0,1,0
3,Women,18,24,37608,112916,Men,28,0,0.000000,0.333062,0,1,0,0,1
4,Women,45,60,7614,24073,Men,35,0,0.000000,0.316288,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8999995,Women,25,34,1111,4600,Women,35,0,0.000000,0.241522,0,1,0,1,0
8999996,Women,45,60,37572,112808,Women,27,0,0.000000,0.333061,0,1,0,1,0
8999997,Men,25,34,22169,67186,Men,60,0,0.000000,0.329965,0,0,1,0,1
8999998,Women,45,60,15709,48049,Women,37,0,0.000000,0.326937,0,1,0,1,0


In [None]:
# Build the model-ready table column by column instead of looping over every
# row with iterrows(). Column names, column order and the fresh 0..n-1 index
# are the same as the row-by-row construction produced.
ad_gender = combined_df['Ad_Gender']
customer_gender = combined_df['Customer_Gender']

# .to_numpy() discards combined_df's row labels so the new frame gets a default
# RangeIndex.
results_df = pd.DataFrame({
    'Ad_Min_Age': combined_df['Min_Age'].to_numpy(),
    'Ad_Max_Age': combined_df['Max_Age'].to_numpy(),
    # Gender indicators: 1 when the label matches, 0 otherwise.
    'Ad_Female': (ad_gender == 'Women').astype(int).to_numpy(),
    'Ad_Male': (ad_gender == 'Men').astype(int).to_numpy(),
    'Impressions': combined_df['Impressions'].to_numpy(),
    'Clicks': combined_df['Clicks'].to_numpy(),
    'Customer_Age': combined_df['Age'].to_numpy(),
    'Customer_Female': (customer_gender == 'Women').astype(int).to_numpy(),
    'Customer_Male': (customer_gender == 'Men').astype(int).to_numpy(),
    'Eligible': combined_df['eligible'].to_numpy(),
    'Reward': combined_df['reward'].to_numpy(),
})
results_df

Unnamed: 0,Ad_Min_Age,Ad_Max_Age,Ad_Female,Ad_Male,Impressions,Clicks,Customer_Age,Customer_Female,Customer_Male,Eligible,Reward
0,45,60,1,0,116121,38690,46,1,0,1,1
1,35,44,0,1,108268,36039,41,0,1,1,1
2,35,44,1,0,57811,19004,40,1,0,1,1
3,25,34,1,0,98725,32817,27,1,0,1,1
4,35,44,0,1,76445,25295,37,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
707787,18,24,0,1,56308,18497,42,1,0,0,0
707788,35,44,1,0,55839,18339,20,1,0,0,0
707789,35,44,0,1,26251,8349,54,1,0,0,0
707790,25,34,1,0,66553,21955,47,1,0,0,0




In [None]:
def process_row(row):
    """Placeholder row handler: ignores ``row`` and returns ``None``."""
    return None

In [None]:
# Sanity check of the balanced table. Printed in order: total rows, rewarded
# rows, non-rewarded rows, ineligible rows, eligible rows, and finally the
# ineligible-to-eligible ratio.
rewarded_rows = int((results_df['Reward'] == 1).sum())
unrewarded_rows = int((results_df['Reward'] == 0).sum())
ineligible_rows = int((results_df['Eligible'] == 0).sum())
eligible_rows = int((results_df['Eligible'] == 1).sum())

for row_count in (len(results_df), rewarded_rows, unrewarded_rows,
                  ineligible_rows, eligible_rows):
    print(row_count)
print(ineligible_rows / eligible_rows)

707792
353896
353896
322092
385700
0.8350842623800881


In [None]:
# Logistic Regression

# Target is the binary Reward column; every other column of results_df is a feature.
y = results_df['Reward']
X = results_df.drop(columns=['Reward'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=30
)

# fit() returns the fitted estimator, so construction and training are chained.
log_reg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Learned weights and intercept.
# NOTE: this rebinds `b`, the name used earlier for the sampled customers
# frame, so the cell that concatenates `a` and `b` cannot be re-run after this.
theta_star, b = log_reg.coef_, log_reg.intercept_

# Evaluate on the held-out rows.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

for heading, metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(heading, metric)
print(f"theta_star is {theta_star}")
print(f"b is {b}")

Accuracy: 0.9550480837155856
Confusion Matrix:
 [[ 96518   9446]
 [    99 106275]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95    105964
           1       0.92      1.00      0.96    106374

    accuracy                           0.96    212338
   macro avg       0.96      0.95      0.95    212338
weighted avg       0.96      0.96      0.95    212338

theta_star is [[ 7.76800624e-01 -6.21092967e-01 -2.63435110e-02 -1.83285908e-01
  -1.86187362e-04  5.23992949e-04 -3.20098726e-02 -7.12018933e-02
  -1.38427526e-01  6.60904773e+00]]
b is [-0.20962945]
