In [11]:
import pandas as pd


# Load data into Pandas DataFrames
purchase_data = pd.read_csv('/content/purchase.csv')
boxes_data = pd.read_csv('/content/boxes.csv')

# Display the first few rows of each DataFrame to verify data loading
print("Purchase Data:")
print(purchase_data.head())
print("\nBoxes Data:")
print(boxes_data.head())
print("\nProblem Data:")



Purchase Data:
  PURCHASE_DATE    MAGIC_KEY  BOX_ID  BOX_COUNT
0      1/2/2019  2CED678A247    12.0        1.0
1      1/2/2019  2BF58D91BA1    12.0        1.0
2      1/2/2019  2C15B86534E    99.0        1.0
3      1/2/2019  2C32D9A859A     6.0        1.0
4      1/2/2019  2C7A55404D1     4.0        1.0

Boxes Data:
   BOX_ID  QUALITY      DELIVERY_OPTION  MILK  MEAT  UNIT_PRICE
0       1  Premium  Home Delivery - CoD   0.0   2.7        9.96
1       2  Premium  Home Delivery - CoD   0.0   2.3       11.96
2       3  Premium  Home Delivery - CoD   0.0   2.4       11.96
3       4  Premium  Home Delivery - CoD   0.0   2.5       11.96
4       5  Premium  Home Delivery - CoD   0.0   2.6       11.96

Problem Data:


In [25]:
import pandas as pd



# Convert 'PURCHASE_DATE' column to datetime format with the correct date format
purchase_data['PURCHASE_DATE'] = pd.to_datetime(purchase_data['PURCHASE_DATE'], format='%d/%m/%Y')

# Filter purchases made in January and February 2019
jan_feb_purchases = purchase_data[(purchase_data['PURCHASE_DATE'].dt.year == 2019) &
                                   (purchase_data['PURCHASE_DATE'].dt.month.isin([1, 2,11,12]))]

print("Purchases made in November, December, January and February 2019:")
print(jan_feb_purchases.head(50))

# Count the number of entities
num_entities = jan_feb_purchases.shape[0]
print("Number of entities:", num_entities)

Purchases made in November, December, January and February 2019:
   PURCHASE_DATE    MAGIC_KEY  BOX_ID  BOX_COUNT  MONTH
0     2019-02-01  2CED678A247    12.0        1.0      2
1     2019-02-01  2BF58D91BA1    12.0        1.0      2
2     2019-02-01  2C15B86534E    99.0        1.0      2
3     2019-02-01  2C32D9A859A     6.0        1.0      2
4     2019-02-01  2C7A55404D1     4.0        1.0      2
5     2019-02-01  29D969045C2   238.0        1.0      2
6     2019-02-01  28E5EA49074   227.0        1.0      2
7     2019-02-01  2CEFA3A8659     6.0        1.0      2
8     2019-02-01  2A00DE30F46   204.0        1.0      2
9     2019-02-01  291C04B5CBF   231.0        1.0      2
10    2019-02-01  2C3A3F7DB65    12.0        1.0      2
11    2019-02-01  292FB711FD2   231.0        1.0      2
12    2019-02-01  2955110718F   238.0        1.0      2
13    2019-02-01  291576569FE   278.0        1.0      2
14    2019-02-01  293587823F7   204.0        1.0      2
15    2019-02-01  28DCD991192   201.0  

In [27]:
# Identify unique Magic Keys involved in January and February purchases
unique_magic_keys = jan_feb_purchases['MAGIC_KEY'].unique()

print("Unique Magic Keys involved in Nov, Dec ,January and February purchases:")
print(unique_magic_keys)


Unique Magic Keys involved in Nov, Dec ,January and February purchases:
['2CED678A247' '2BF58D91BA1' '2C15B86534E' ... '2CE6B6D7375' '2C6BD56B78A'
 '2BC854CBA26']


In [28]:
# Identify unique Magic Keys involved in January and February purchases
unique_magic_keys = jan_feb_purchases['MAGIC_KEY'].unique()

# Filter box data to only include boxes that contain milk and/or meat
milk_boxes = set(box_data[box_data['MILK'] > 0]['BOX_ID'])
meat_boxes = set(box_data[box_data['MEAT'] > 0]['BOX_ID'])

# Create a dictionary to store the buying behavior predictions for each Magic Key
buying_behavior_predictions = {}

# Group purchases by Magic Key
grouped_purchases = jan_feb_purchases.groupby('MAGIC_KEY')

# Iterate over each unique Magic Key
for magic_key in unique_magic_keys:
    # Check if the current Magic Key purchased milk and/or meat in January and February
    key_purchases = grouped_purchases.get_group(magic_key)
    milk_purchased = any(key_purchases['BOX_ID'].isin(milk_boxes))
    meat_purchased = any(key_purchases['BOX_ID'].isin(meat_boxes))

    # Store the buying behavior predictions for the current Magic Key
    buying_behavior_predictions[magic_key] = {'milk': milk_purchased, 'meat': meat_purchased}

# Now, we have the buying behavior predictions for each Magic Key based on their past purchases.
# We can use these predictions to determine whether they will buy milk or meat or both in March.


In [30]:


# Print the buying behavior predictions for each Magic Key
for magic_key, behavior in buying_behavior_predictions.items():
    milk_purchased = "Yes" if behavior['milk'] else "No"
    meat_purchased = "Yes" if behavior['meat'] else "No"
    print(f"Magic Key: {magic_key} - Milk Purchased: {milk_purchased}, Meat Purchased: {meat_purchased}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Magic Key: 2C2DB8EB0C4 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C929879271 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C6929D5BEE - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C20D28F48D - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CB761682A8 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CD461DDCD6 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2907F6C768C - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BE94666129 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 28D29D2DE9D - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 28D6611652E - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2972C8B21F5 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2A00AA2EFF9 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CE69A5A588 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BD92144663 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Magic Key: 2931114F7C1 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2A0B7574904 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 292004808E8 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2BCD40DC871 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C58D7CBF67 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2BF4A8B40E6 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2C0E7766382 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 28D92A0CBDA - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2CE3DD7A0BA - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 28FA63FE143 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C8F10D1868 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 28EF0B136E2 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2C669EFC878 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2C81BA3305C - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2CABD89310

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Magic Key: 2C1CA6D0FF7 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 28FB097EA7B - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2CEC84C6456 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CE6BEC54D6 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 28E57FD6CC6 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2CA64E431AE - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2970FDADAE1 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2CBBA4D3E8B - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 290030A0416 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 29DF5758564 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C922ECA667 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C23A7BCA24 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2CC2C937CEA - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2909993DD28 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2CADC50A8

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Magic Key: 2BDA65AAF44 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C08A1C0CB4 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BEF7C975DC - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CA0FE50A9E - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 28F89C98809 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2BE3D06471F - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2BE331350B5 - Milk Purchased: Yes, Meat Purchased: No
Magic Key: 2939F43CAA2 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 290EEF642A2 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 29FA14A9779 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 29115E588C7 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C1DC16744D - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C46DAEE034 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C047CCCCB3 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C5901

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Magic Key: 2CC26CCFF4B - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C8A1BF555D - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BE63642B19 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CD28A3B05D - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C41F8C0E25 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2C2771EE6DC - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BD0F0888C1 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CFDD7336A4 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BFB49819A5 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2BE2AF12FF4 - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 2CE332010FC - Milk Purchased: Yes, Meat Purchased: Yes
Magic Key: 28F7857F564 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 28DFC4CB73A - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 290B7231F74 - Milk Purchased: No, Meat Purchased: Yes
Magic Key: 2C3

In [31]:
print("Total unique Magic Keys:", len(unique_magic_keys))


Total unique Magic Keys: 813313


In [33]:
import pandas as pd

# Get all unique Magic Keys in the dataset
all_unique_magic_keys = set(purchase_data['MAGIC_KEY'].unique())

# Find the Magic Keys that made purchases in January and February
purchasing_magic_keys = set(jan_feb_purchases['MAGIC_KEY'].unique())

# Create a DataFrame with Magic Keys and their purchase status
purchase_status_df = pd.DataFrame({
    'MAGIC_KEY': list(all_unique_magic_keys),
    'PURCHASE_STATUS': ['Bought' if magic_key in purchasing_magic_keys else 'Not Bought' for magic_key in all_unique_magic_keys]
})

# Print the DataFrame
print(purchase_status_df)


           MAGIC_KEY PURCHASE_STATUS
0        2C6749BCE54          Bought
1        2C49DC962A3      Not Bought
2        2CED32D4FF9          Bought
3        2902150A36F          Bought
4        28E0C97D533          Bought
...              ...             ...
1006402  2C67F1D8A52      Not Bought
1006403  2BD897536DD      Not Bought
1006404  2CE3669AD1E          Bought
1006405  2CDD66CE77C          Bought
1006406  2CB69A4CB08      Not Bought

[1006407 rows x 2 columns]


In [35]:
# Count the number of Magic Keys that made purchases and that did not make purchases
purchase_counts = purchase_status_df['PURCHASE_STATUS'].value_counts()

# Print the total number of Magic Keys that made purchases and that did not make purchases
print("Total number of Magic Keys:")
print("Bought:", purchase_counts.get('Bought', 0))
print("Not Bought:", purchase_counts.get('Not Bought', 0))


Total number of Magic Keys:
Bought: 813313
Not Bought: 193094


In [37]:
# Convert 'PURCHASE_DATE' column to datetime format
purchase_data['PURCHASE_DATE'] = pd.to_datetime(purchase_data['PURCHASE_DATE'])

# Extract unique dates
unique_dates = purchase_data['PURCHASE_DATE'].dt.date.unique()

# Count the number of unique dates
num_days_data = len(unique_dates)

# Print the number of days of data available
print("Number of unique days of data:", num_days_data)


Number of unique days of data: 116


In [38]:
# Convert 'PURCHASE_DATE' column to datetime format
purchase_data['PURCHASE_DATE'] = pd.to_datetime(purchase_data['PURCHASE_DATE'])

# Extract month from each date
purchase_data['MONTH'] = purchase_data['PURCHASE_DATE'].dt.month

# Get unique months
unique_months = purchase_data['MONTH'].unique()

# Print the unique months
print("Months present in the dataset:", unique_months)


Months present in the dataset: [ 2  1 12 11]


In [42]:
import pandas as pd

# Load the total dataset
total_data = pd.read_csv("/content/purchase.csv")

# Extract unique Magic Keys from the total dataset
unique_magic_keys_total = set(total_data['MAGIC_KEY'].unique())

# Load the problem 1.csv file
problem_1_data = pd.read_csv("/content/problem 1.csv")

# Extract unique Magic Keys from problem 1.csv
unique_magic_keys_problem_1 = set(problem_1_data['MAGIC_KEY'].unique())

# Cross-match Magic Keys between problem 1.csv and the total dataset
cross_matched_keys = unique_magic_keys_total.intersection(unique_magic_keys_problem_1)

# Filter the total dataset to keep only the rows corresponding to the matched keys
matched_data = total_data[total_data['MAGIC_KEY'].isin(cross_matched_keys)]

# Now, you can work with the matched data
print("Matched data:")
print(matched_data.head(50))


Matched data:
    PURCHASE_DATE    MAGIC_KEY  BOX_ID  BOX_COUNT
14       1/2/2019  293587823F7   204.0        1.0
15       1/2/2019  28DCD991192   201.0        1.0
40       1/2/2019  2901B2DF4F8   228.0        1.0
42       1/2/2019  28D3AE7E989   221.0        1.0
86       1/2/2019  290EA8882DF   213.0        1.0
88       1/2/2019  28F0F42927A   213.0        1.0
89       1/2/2019  2976EEC8818   220.0        1.0
128      1/2/2019  28FE63D0A13   258.0        1.0
142      1/2/2019  28FC7E3E629   245.0        1.0
143      1/2/2019  28E18429EAD   245.0        1.0
144      1/2/2019  28E2B04BAFA   245.0        1.0
154      1/2/2019  29657DC61BD   246.0        1.0
155      1/2/2019  28EA1159E5A   255.0        1.0
160      1/2/2019  28E8D1EBD5B   237.0        1.0
170      1/2/2019  297AA214CEC   246.0        1.0
286      1/2/2019  28F0FAD2274   252.0        1.0
305      1/2/2019  297202959DD   246.0        1.0
335      1/2/2019  28E7F098412   213.0        1.0
336      1/2/2019  28F2DD3C84C   213

In [43]:
# Total number of unique Magic Keys in "problem 1.csv"
total_unique_keys_problem_1 = len(unique_magic_keys_problem_1)

# Number of matched keys found in the total dataset
num_matched_keys = len(cross_matched_keys)

# Number of rejected keys (not found in the total dataset)
num_rejected_keys = total_unique_keys_problem_1 - num_matched_keys

# Print the number of accepted and rejected keys
print("Number of accepted keys:", num_matched_keys)
print("Number of rejected keys:", num_rejected_keys)


Number of accepted keys: 58689
Number of rejected keys: 0


In [44]:
# Filter the total dataset to keep only the rows corresponding to the matched keys
matched_data = total_data[total_data['MAGIC_KEY'].isin(cross_matched_keys)]

# Save the filtered dataset to a new CSV file
matched_data.to_csv('matched_data.csv', index=False)


In [45]:
from google.colab import files

# Download the CSV file
files.download('matched_data.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
# Extract unique BOX_IDs from the matched data
unique_matched_box_ids = set(matched_data['BOX_ID'].unique())

# Filter the box data to keep only the rows corresponding to the matched BOX_IDs
cross_checked_box_data = box_data[box_data['BOX_ID'].isin(unique_matched_box_ids)]

# Save the cross-checked box data to a new CSV file
cross_checked_box_data.to_csv('cross_checked_box_data.csv', index=False)


In [2]:
import pandas as pd

# Load the cross-checked box data from the CSV file
cross_checked_box_data = pd.read_csv('/content/cross_checked_box_data.csv')

# Display the first few rows of the loaded cross-checked box data
print(cross_checked_box_data.head())


   BOX_ID  QUALITY      DELIVERY_OPTION  MILK  MEAT  UNIT_PRICE
0       8  Premium  Home Delivery - CoD  10.0   0.0       12.18
1       9  Premium  Home Delivery - CoD  10.5   0.0       12.78
2      27  Premium  Home Delivery - CoD   0.0   2.9       15.96
3      28  Premium  Home Delivery - CoD   0.0   3.3       15.96
4      29  Premium  Home Delivery - CoD   0.0   3.6       15.96


In [2]:
# First, let's get the MAGIC KEY corresponding to each BOX_ID
magic_key_mapping = matched_data[['MAGIC_KEY', 'BOX_ID']]

# Now, merge the magic_key_mapping with the cross-checked box data
merged_data = pd.merge(cross_checked_box_data, magic_key_mapping, on='BOX_ID', how='inner')

# Save the merged data to a new CSV file
merged_data.to_csv('merged_data.csv', index=False)


NameError: name 'matched_data' is not defined

In [51]:
from google.colab import files

# Download the merged data CSV file
files.download('merged_data.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Load the merged data
merged_data = pd.read_csv("/content/merged_data.csv")

# Load the matched data
matched_data = pd.read_csv("/content/matched_data.csv")

# Merge the 'PURCHASE_DATE' column from the matched data CSV
merged_data = pd.merge(merged_data, matched_data[['BOX_ID', 'PURCHASE_DATE']], on='BOX_ID', how='left')

# Save the merged data to a new CSV file
merged_data.to_csv('merged_data_with_purchase_date.csv', index=False)


In [None]:
import pandas as pd

# Load the matched data
matched_data = pd.read_csv("/content/matched_data.csv")

# Load the cross-checked box data
cross_checked_box_data = pd.read_csv("/content/cross_checked_box_data.csv")

# First, let's get the MAGIC KEY corresponding to each BOX_ID
magic_key_mapping = matched_data[['MAGIC_KEY', 'BOX_ID']]

# Now, merge the magic_key_mapping with the cross-checked box data
merged_data = pd.merge(cross_checked_box_data, magic_key_mapping, on='BOX_ID', how='inner')

# Merge the 'PURCHASE_DATE' column from the matched data CSV
merged_data = pd.merge(merged_data, matched_data[['BOX_ID', 'PURCHASE_DATE']], on='BOX_ID', how='left')

# Save the merged data to a new CSV file
merged_data.to_csv('merged_data_with_purchase_date.csv', index=False)
