# 1

## A

In [3]:
import pandas as pd

# Read the product-level dataset from disk
data = pd.read_csv("Q2_data.csv")

# The profit column already exists in the file; this only coerces it to float
data['Profit Per Product'] = data['Profit Per Product'].astype(float)

# Keep the rows whose profit is strictly negative
unprofitable_products = data.loc[data['Profit Per Product'].lt(0)]

# Share of rows that lose money, expressed in percent
percent_unprofitable = len(unprofitable_products) / len(data) * 100

# Report the count and percentage, or state that nothing is unprofitable
if len(unprofitable_products) == 0:
    print("All products are profitable.")
else:
    print("There are", len(unprofitable_products), "unprofitable products out of", len(data), "total products.")
    print("Percentage of unprofitable products: {:.2f}%".format(percent_unprofitable))


There are 42 unprofitable products out of 5297 total products.
Percentage of unprofitable products: 0.79%


## B

In [4]:
# Mean of the 'Profit Per Product' column over all rows of `data`
average_profit_per_product = data.loc[:, 'Profit Per Product'].mean()

# Show the mean formatted as a dollar amount with two decimals
print("Average profit per product: ${:.2f}".format(average_profit_per_product))

Average profit per product: $2.94


# 2

## A

In [5]:
# Per-return processing cost used in the online-channel adjustment
return_processing_cost = 1

# Online profit = base profit minus (processing cost x 'Share Returned'), row by row
data['Profit Per Product Online'] = data['Profit Per Product'].sub(
    data['Share Returned'].mul(return_processing_cost)
)

# Show the product identifier next to the newly computed column
print(data.loc[:, ['Product ID', 'Profit Per Product Online']])


      Product ID  Profit Per Product Online
0              0                       2.31
1              1                       2.69
2              2                       2.64
3              3                       2.49
4              4                       0.91
...          ...                        ...
5292        5292                      17.79
5293        5293                      13.41
5294        5294                      13.12
5295        5295                       8.49
5296        5296                       9.88

[5297 rows x 2 columns]


## B

In [8]:
# Rows whose online-channel profit is strictly negative
unprofitable_products_online = data.loc[data['Profit Per Product Online'].lt(0)]

# Share of rows that lose money online, expressed in percent
percent_unprofitable_online = len(unprofitable_products_online) / len(data) * 100

# Report the count and percentage, or state that nothing is unprofitable online
if len(unprofitable_products_online) == 0:
    print("All products are profitable in the online channel.")
else:
    print("There are", len(unprofitable_products_online), "unprofitable online products out of", len(data), "total products.")
    print("Percentage of unprofitable online products: {:.2f}%".format(percent_unprofitable_online))

There are 284 unprofitable online products out of 5297 total products.
Percentage of unprofitable online products: 5.36%


## C

In [9]:
# Mean of the 'Profit Per Product Online' column over all rows of `data`
average_profit_per_product_online = data.loc[:, 'Profit Per Product Online'].mean()

# Show the mean formatted as a dollar amount with two decimals
print("Average profit per product in the online channel: ${:.2f}".format(average_profit_per_product_online))

Average profit per product in the online channel: $2.44


# 3

## A

In [10]:
# Boolean flag: True where the online-channel profit is strictly below zero
data['Bad Product'] = data['Profit Per Product Online'].lt(0)

# Show the identifier, the online profit, and the new flag side by side
print(data.loc[:, ['Product ID', 'Profit Per Product Online', 'Bad Product']])

      Product ID  Profit Per Product Online  Bad Product
0              0                       2.31        False
1              1                       2.69        False
2              2                       2.64        False
3              3                       2.49        False
4              4                       0.91        False
...          ...                        ...          ...
5292        5292                      17.79        False
5293        5293                      13.41        False
5294        5294                      13.12        False
5295        5295                       8.49        False
5296        5296                       9.88        False

[5297 rows x 3 columns]


## B

In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Relies on the `data` DataFrame, including its 'Bad Product' column, from earlier cells

# Predictors are three categorical columns; the target is the boolean 'Bad Product' flag
categorical_columns = ['Category', 'Color', 'Season']
X = data[categorical_columns]
y = data['Bad Product']

# One-hot encode the categorical columns; any other column would pass through unchanged
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)

# Logistic regression with the iteration cap raised to 1000
logreg = LogisticRegression(max_iter=1000)

# Chain the encoder and the classifier so they are fitted together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', logreg),
])

# Train on the full dataset
pipeline.fit(X, y)

# First row of the fitted classifier's coefficient matrix
coefficients = pipeline.named_steps['classifier'].coef_[0]

# Names of the encoded columns, in the order produced by the preprocessor
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out(input_features=categorical_columns)

# Pair each encoded feature name with its coefficient
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Show the coefficient table
print(coef_df)


                  Feature  Coefficient
0   cat__Category_Blazers    -0.084618
1   cat__Category_Bluoses    -0.388708
2   cat__Category_Dresses    -0.834674
3   cat__Category_Jackets     0.314515
4      cat__Category_Knit     0.287408
5     cat__Category_Pants     0.228351
6    cat__Category_Shirts     0.663605
7    cat__Category_Skirts    -0.184416
8        cat__Color_Black    -0.627762
9         cat__Color_Blue    -0.089237
10       cat__Color_Brown     0.350262
11       cat__Color_Green     0.314770
12        cat__Color_Grey    -0.070753
13      cat__Color_Orange    -1.102709
14        cat__Color_Pink    -0.532383
15         cat__Color_Red     0.489781
16       cat__Color_White     0.180637
17      cat__Color_Yellow     1.088854
18       cat__Season_Fall     0.807858
19     cat__Season_Spring    -0.269477
20     cat__Season_Summer     0.013904
21     cat__Season_Winter    -0.550824
