In [103]:
import pandas as pd
import numpy as np 
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split


In [104]:
# Read the raw Organics table from the working directory, then print its
# column dtypes and non-null counts.
org_me = pd.read_csv(filepath_or_buffer="Organics.csv")
org_me.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         22223 non-null  int64  
 1   DemAffl    21138 non-null  float64
 2   DemAge     20715 non-null  float64
 3   DemGender  19711 non-null  object 
 4   PromClass  22223 non-null  object 
 5   PromSpend  22223 non-null  float64
 6   PromTime   21942 non-null  float64
 7   TargetBuy  22223 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.4+ MB


In [105]:
# Number of missing values in each column of the raw table.
org_me.isna().sum()

ID              0
DemAffl      1085
DemAge       1508
DemGender    2512
PromClass       0
PromSpend       0
PromTime      281
TargetBuy       0
dtype: int64

In [106]:
# Keep only fully observed rows.
org_me_nonull = org_me.dropna()

# Remove the ID column, then discard rows whose DemGender is coded 'U'.
org_clean = (
    org_me_nonull
    .drop(columns=["ID"])
    .loc[lambda frame: frame["DemGender"] != "U"]
)
org_clean.head()

Unnamed: 0,DemAffl,DemAge,DemGender,PromClass,PromSpend,PromTime,TargetBuy
2,5.0,70.0,F,Silver,0.02,8.0,1
3,10.0,65.0,M,Tin,0.01,7.0,1
4,11.0,68.0,F,Tin,0.01,8.0,0
6,11.0,74.0,F,Tin,0.01,8.0,0
7,13.0,62.0,M,Tin,0.01,5.0,0


In [116]:
# Dummy-code the text columns of org_clean (DemGender and PromClass).
# drop_first=True omits one level per column, which then acts as the
# reference category; dtype=int yields 0/1 integers instead of booleans.
dummy_org_me_nonull = pd.get_dummies(
    data=org_clean,
    drop_first=True,
    dtype=int,
)
dummy_org_me_nonull.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15730 entries, 2 to 22221
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DemAffl             15730 non-null  float64
 1   DemAge              15730 non-null  float64
 2   PromSpend           15730 non-null  float64
 3   PromTime            15730 non-null  float64
 4   TargetBuy           15730 non-null  int64  
 5   DemGender_M         15730 non-null  int32  
 6   PromClass_Platinum  15730 non-null  int32  
 7   PromClass_Silver    15730 non-null  int32  
 8   PromClass_Tin       15730 non-null  int32  
dtypes: float64(4), int32(4), int64(1)
memory usage: 983.1 KB


In [108]:
# Predictors used by the model: affluence, age and the male-gender dummy.
predictor_cols = ["DemAffl", "DemAge", "DemGender_M"]
x = dummy_org_me_nonull[predictor_cols]

# Response: the TargetBuy column as a NumPy array.
y = np.array(dummy_org_me_nonull["TargetBuy"])




In [109]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=44,
)

In [110]:
# statsmodels does not add an intercept on its own, so give both splits an
# explicit constant column.
x_train, x_test = (sm.add_constant(split) for split in (x_train, x_test))


In [111]:
# Logistic regression of the response on the training predictors.
model_me = sm.Logit(endog=y_train, exog=x_train)

# Fit the model, then print the regression summary.
result_me = model_me.fit()
print(result_me.summary())

Optimization terminated successfully.
         Current function value: 0.468199
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                11011
Model:                          Logit   Df Residuals:                    11007
Method:                           MLE   Df Model:                            3
Date:                Thu, 08 Feb 2024   Pseudo R-squ.:                  0.2154
Time:                        16:50:20   Log-Likelihood:                -5155.3
converged:                       True   LL-Null:                       -6571.0
Covariance Type:            nonrobust   LLR p-value:                     0.000
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -0.2293      0.122     -1.876      0.061      -0.469       0.010
DemAffl         0.2557    

In [112]:
# Score the hold-out rows with the fitted model and store the predicted
# probability next to the features.
#
# Only the columns the model was fitted on are passed to predict(). Without
# that selection, re-running this cell would hand predict() the extra
# "Prediction_prob" column written by the previous run and fail on the
# mismatch with the number of fitted parameters.
model_columns = result_me.model.exog_names
x_test["Prediction_prob"] = result_me.predict(x_test[model_columns])
x_test.head()

Unnamed: 0,const,DemAffl,DemAge,DemGender_M,Prediction_prob
21066,1.0,11.0,69.0,1,0.100862
15601,1.0,7.0,63.0,1,0.052893
11112,1.0,14.0,45.0,0,0.713145
5495,1.0,6.0,57.0,1,0.05649
20133,1.0,12.0,65.0,1,0.152496


In [113]:
# Assign the positive class (1) when the predicted probability is at least
# 0.5, otherwise the negative class (0).
meets_cutoff = x_test["Prediction_prob"].ge(0.5)
predictions_me = meets_cutoff.astype(int)
predictions_me

21066    0
15601    0
11112    1
5495     0
20133    0
        ..
17574    0
9918     0
21881    0
16943    0
13919    0
Name: Prediction_prob, Length: 4719, dtype: int32

In [114]:
from sklearn import metrics

# Confusion matrix of the hold-out labels against the thresholded
# predictions: rows are the true classes, columns the predicted classes.
conf_matrix_me = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=predictions_me,
)
conf_matrix_me

array([[3118,  249],
       [ 785,  567]], dtype=int64)

In [115]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Compute each score on the hold-out labels versus the thresholded predictions.
f1, accuracy, precision, recall = (
    scorer(y_test, predictions_me)
    for scorer in (f1_score, accuracy_score, precision_score, recall_score)
)

# Print every score on its own line.
for label, value in (
    ("F1 Score: ", f1),
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
):
    print(label, value)

F1 Score:  0.5230627306273062
Accuracy:  0.7808857808857809
Precision:  0.6948529411764706
Recall:  0.4193786982248521
