In [3]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf

In [4]:
# Step 2: Load and preprocess the data
historic_data = pd.read_csv("historic.csv")
prediction_data = pd.read_csv("prediction_input.csv")

# Fill missing values
historic_data.fillna(method='ffill', inplace=True)
prediction_data.fillna(method='ffill', inplace=True)

# Encode categorical variables
historic_data = pd.get_dummies(historic_data, columns=['category', 'main_promotion', 'color'])
prediction_data = pd.get_dummies(prediction_data, columns=['category', 'main_promotion', 'color'])

# Standardize features
scaler = StandardScaler()
historic_data_scaled = scaler.fit_transform(historic_data.drop(columns=['success_indicator', 'item_no']))
prediction_data_scaled = scaler.transform(prediction_data.drop(columns=['item_no']))

# Convert categorical labels to binary
historic_data['success_indicator'] = historic_data['success_indicator'].map({'flop': 0, 'top': 1})

# Split training data
X_train, X_test, y_train, y_test = train_test_split(historic_data_scaled, historic_data['success_indicator'], test_size=0.2, random_state=42)


In [5]:
# Step 3: Define functions/classes for different models

class LogisticRegressionModel:
    def __init__(self):
        self.model = LogisticRegression()

    def train(self, X, y):
        self.model.fit(X, y)

    def evaluate(self, X, y):
        y_pred = self.model.predict(X)
        return classification_report(y, y_pred), accuracy_score(y, y_pred)

class ANNModel:
    def __init__(self):
        self.model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    def train(self, X, y):
        self.model.fit(X, y, epochs=20, batch_size=64, validation_split=0.2)

    def evaluate(self, X, y):
        loss, accuracy = self.model.evaluate(X, y)
        return classification_report(y, (self.model.predict(X) > 0.5).astype("int32")), accuracy


In [6]:
# Step 4: Train and evaluate each model
logistic_regression_model = LogisticRegressionModel()
logistic_regression_model.train(X_train, y_train)
logistic_regression_report, logistic_regression_accuracy = logistic_regression_model.evaluate(X_test, y_test)

ann_model = ANNModel()
ann_model.train(X_train, y_train)
ann_report, ann_accuracy = ann_model.evaluate(X_test, y_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
# Step 5: Compare the performance of the models
print("Logistic Regression Model Evaluation:")
print(logistic_regression_report)
print("Logistic Regression Model Accuracy:", logistic_regression_accuracy)

print("\nANN Model Evaluation:")
print(ann_report)
print("ANN Model Accuracy:", ann_accuracy)

Logistic Regression Model Evaluation:
              precision    recall  f1-score   support

           0       0.80      0.66      0.72       571
           1       0.83      0.91      0.87      1029

    accuracy                           0.82      1600
   macro avg       0.81      0.78      0.79      1600
weighted avg       0.82      0.82      0.81      1600

Logistic Regression Model Accuracy: 0.818125

ANN Model Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.70      0.77       571
           1       0.85      0.93      0.89      1029

    accuracy                           0.85      1600
   macro avg       0.85      0.82      0.83      1600
weighted avg       0.85      0.85      0.84      1600

ANN Model Accuracy: 0.8487499952316284


In [8]:
# Step 6: Choose the best-performing model and provide a summary
if logistic_regression_accuracy > ann_accuracy:
    print("\nBest Model: Logistic Regression")
    print("Reason for Choosing: Logistic Regression has a higher accuracy compared to the ANN model.")
else:
    print("\nBest Model: ANN")
    print("Reason for Choosing: ANN has a higher accuracy compared to the Logistic Regression model.")


Best Model: ANN
Reason for Choosing: ANN has a higher accuracy compared to the Logistic Regression model.
