# 1. Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import glob

# 2. Load product data and extract ARTICLE_ID for tire sales

## 2.1 Load product data

In [2]:
# Read the pipe-delimited product table into a DataFrame.
product_csv_path = '/data/p_dsi/teams2023/bridgestone_data/data/product.csv'
product = pd.read_csv(product_csv_path, sep='|')

In [38]:
# Print the product table's column dtypes and non-null counts.
product.info()
# Display 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
product.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56842 entries, 0 to 56841
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ARTICLE_ID       56842 non-null  int64  
 1   PROD_GROUP_CODE  56818 non-null  float64
 2   PROD_GROUP_DESC  56818 non-null  object 
 3   CATEGORY_CODE    56818 non-null  float64
 4   CATEGORY_DESC    56818 non-null  object 
 5   SEGMENT_CODE     56818 non-null  float64
 6   SEGMENT_DESC     56818 non-null  object 
 7   CLASS_CODE       56818 non-null  float64
 8   CLASS_DESC       56818 non-null  object 
 9   DISCOUNT_FLAG    56818 non-null  object 
 10  CROSS_SECTION    35670 non-null  object 
 11  ASPECT_RATIO     35665 non-null  object 
 12  RIM_SIZE         35504 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usage: 5.6+ MB


Unnamed: 0,ARTICLE_ID,PROD_GROUP_CODE,PROD_GROUP_DESC,CATEGORY_CODE,CATEGORY_DESC,SEGMENT_CODE,SEGMENT_DESC,CLASS_CODE,CLASS_DESC,DISCOUNT_FLAG,CROSS_SECTION,ASPECT_RATIO,RIM_SIZE
43151,7005219,4.0,Services,83.0,Steering/Suspension/Drivetrain,96.0,Alignments,190.0,Lifetime Alignment,Y,,,
34638,7083544,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,58530.0,Ultra High Performance All Season,N,,,
27858,224133,5.0,Tires,44.0,Light Truck Tires,47.0,Commercial Tires,51.0,Highway Tires,N,225.0,75.0,17.0
6839,11982,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,58530.0,Ultra High Performance All Season,N,235.0,40.0,18.0
27008,7096258,5.0,Tires,26.0,Passenger Tires,30.0,P Metric Light Truck Tires,42.0,All Terrain Tires,N,245.0,70.0,16.0
8850,1183,5.0,Tires,26.0,Passenger Tires,1135.0,Snow Tires-PS,1136.0,Snow Tires,N,215.0,70.0,15.0
9355,7082375,5.0,Tires,26.0,Passenger Tires,1135.0,Snow Tires-PS,1136.0,Snow Tires,N,175.0,65.0,14.0
21910,7086679,5.0,Tires,44.0,Light Truck Tires,46.0,Recreational Tires,49.0,All Terrain Tires,N,275.0,55.0,20.0
10674,7095323,5.0,Tires,26.0,Passenger Tires,30.0,P Metric Light Truck Tires,41.0,Highway Tires,N,225.0,65.0,17.0
1269,4321,5.0,Tires,26.0,Passenger Tires,29.0,Performance Tires,39.0,Touring H/V/Z Tires,N,205.0,65.0,16.0


In [5]:
# Print the product table's column dtypes and non-null counts
# (the same summary the earlier product.info() cell produces).
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56842 entries, 0 to 56841
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ARTICLE_ID       56842 non-null  int64  
 1   PROD_GROUP_CODE  56818 non-null  float64
 2   PROD_GROUP_DESC  56818 non-null  object 
 3   CATEGORY_CODE    56818 non-null  float64
 4   CATEGORY_DESC    56818 non-null  object 
 5   SEGMENT_CODE     56818 non-null  float64
 6   SEGMENT_DESC     56818 non-null  object 
 7   CLASS_CODE       56818 non-null  float64
 8   CLASS_DESC       56818 non-null  object 
 9   DISCOUNT_FLAG    56818 non-null  object 
 10  CROSS_SECTION    35670 non-null  object 
 11  ASPECT_RATIO     35665 non-null  object 
 12  RIM_SIZE         35504 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usage: 5.6+ MB


## 2.2 Select tire products (CATEGORY_CODE 26 and 44)

In [9]:
# Keep only the products whose CATEGORY_CODE is 44 or 26 (labelled
# "Light Truck Tires" and "Passenger Tires" in the sample rows shown earlier).
tire_category_codes = [44.0, 26.0]
is_tire_category = product["CATEGORY_CODE"].isin(tire_category_codes)
tire_sales = product[is_tire_category]
tire_sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32563 entries, 0 to 56822
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ARTICLE_ID       32563 non-null  int64  
 1   PROD_GROUP_CODE  32563 non-null  float64
 2   PROD_GROUP_DESC  32563 non-null  object 
 3   CATEGORY_CODE    32563 non-null  float64
 4   CATEGORY_DESC    32563 non-null  object 
 5   SEGMENT_CODE     32563 non-null  float64
 6   SEGMENT_DESC     32563 non-null  object 
 7   CLASS_CODE       32563 non-null  float64
 8   CLASS_DESC       32563 non-null  object 
 9   DISCOUNT_FLAG    32563 non-null  object 
 10  CROSS_SECTION    31964 non-null  object 
 11  ASPECT_RATIO     31959 non-null  object 
 12  RIM_SIZE         31963 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usage: 3.5+ MB


## 2.3 Extract ARTICLE_ID

In [10]:
# Collect the distinct ARTICLE_ID values of the selected tire products,
# in order of first appearance, as a plain Python list.
distinct_tire_article_ids = tire_sales["ARTICLE_ID"].drop_duplicates()
ARTICLE_ID_list = distinct_tire_article_ids.tolist()

In [11]:
# Show how many distinct tire ARTICLE_IDs were collected (the list itself is too long to print).
len(ARTICLE_ID_list)

32563

# 3. Load sales data and subset all tire sales

In [15]:
# Trial run on a single 2017 file (sales_20171031.csv): load it in full
# and print its column summary.
oct_2017_sales_path = '/data/p_dsi/teams2023/bridgestone_data/data/sales_20171031.csv'
sales_2017_oct = pd.read_csv(oct_2017_sales_path, sep='|')
sales_2017_oct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15800806 entries, 0 to 15800805
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   STORE_ID    int64  
 1   TRAN_ID     int64  
 2   DATE        object 
 3   ARTICLE_ID  int64  
 4   INDIV_ID    float64
 5   VEHICLE_ID  int64  
 6   UNITS       float64
 7   SALES       float64
dtypes: float64(3), int64(4), object(1)
memory usage: 964.4+ MB


In [14]:
# Worked example on the single-month frame: total UNITS per tire ARTICLE_ID.
# 1) narrow to the two columns needed, 2) keep rows whose ARTICLE_ID is in
# ARTICLE_ID_list, 3) sum UNITS within each ARTICLE_ID.
master_small = sales_2017_oct.loc[:, ["ARTICLE_ID", "UNITS"]]
is_tire_article = master_small["ARTICLE_ID"].isin(ARTICLE_ID_list)
tire_sales_2017 = master_small.loc[is_tire_article]
df = tire_sales_2017.groupby("ARTICLE_ID").sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3536 entries, 9 to 7099853
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   3536 non-null   float64
dtypes: float64(1)
memory usage: 55.2 KB


In [21]:
# Build per-article UNITS totals for the Nov and Dec 2017 sales files.
csv_files = ['/data/p_dsi/teams2023/bridgestone_data/data/sales_20171130.csv',
             '/data/p_dsi/teams2023/bridgestone_data/data/sales_20171231.csv']

# Gather one aggregated frame per file and concatenate once after the loop.
# Calling pd.concat on a growing accumulator inside the loop re-copies all
# previously collected rows on every pass and needs an empty seed DataFrame.
monthly_totals = []
for file in csv_files:
    # Only ARTICLE_ID and UNITS are used below, so skip parsing the other
    # columns of these large files.
    sales_2017 = pd.read_csv(file, sep="|", usecols=["ARTICLE_ID", "UNITS"])
    # Keep only rows whose ARTICLE_ID is one of the tire articles.
    tire_sales_2017 = sales_2017[sales_2017["ARTICLE_ID"].isin(ARTICLE_ID_list)]
    # Sum UNITS within each ARTICLE_ID for this file.
    monthly_totals.append(tire_sales_2017.groupby(by=["ARTICLE_ID"]).sum())

# The result is indexed by ARTICLE_ID; an article present in both files
# still has one row per file at this point.
master_tire_sales_2017 = pd.concat(monthly_totals)

master_tire_sales_2017.info()
master_tire_sales_2017.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7089 entries, 9 to 7099819
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   7089 non-null   float64
dtypes: float64(1)
memory usage: 110.8 KB


Unnamed: 0_level_0,UNITS
ARTICLE_ID,Unnamed: 1_level_1
7092043,0.0
7094247,0.0
7092011,0.0
7097106,0.0
145954,10.0
843,9.0
114589,4735.0
2459,110.0
71171,16.0
817,1275.0


In [23]:
# After the concat an ARTICLE_ID can appear once per source file, so add
# those rows together to get a single total per ARTICLE_ID.
units_grouped_2017 = master_tire_sales_2017.groupby("ARTICLE_ID")
master_tire_sales_2017 = units_grouped_2017.sum()
master_tire_sales_2017.info()
master_tire_sales_2017.sample(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4146 entries, 9 to 7099982
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   4146 non-null   float64
dtypes: float64(1)
memory usage: 64.8 KB


Unnamed: 0_level_0,UNITS
ARTICLE_ID,Unnamed: 1_level_1
2842,116.0
5053,8.0
1342,28.0
6444,695.0
7012498,0.0
7012468,0.0
7092571,0.0
7099574,0.0
7090810,14.0
7089185,0.0


In [29]:
# Write the Nov-Dec 2017 per-article totals to the team folder as CSV
# (the ARTICLE_ID index is written as the first column).
nov_dec_2017_out_path = "/data/p_dsi/teams2023/team7/tire_sales_article_id_and_total_units_Nov_Dec_2017.csv"
master_tire_sales_2017.to_csv(nov_dec_2017_out_path)

In [20]:
# Build per-article UNITS totals for every 2018 sales file.
# NOTE(review): the pattern matches whatever 2018 files exist in the data
# directory; the original comment says these cover Jan - Oct 2018 - confirm.
# sorted() makes the processing order deterministic (glob order is arbitrary).
csv_files = sorted(glob.glob('/data/p_dsi/teams2023/bridgestone_data/data/*sales_2018*.csv'))
if not csv_files:
    # Fail with a clear message instead of an obscure error further down.
    raise FileNotFoundError("No 2018 sales files matched the glob pattern")

# Gather one aggregated frame per file and concatenate once after the loop.
# Calling pd.concat on a growing accumulator inside the loop re-copies all
# previously collected rows on every pass and needs an empty seed DataFrame.
monthly_totals = []
for file in csv_files:
    # Only ARTICLE_ID and UNITS are used below, so skip parsing the other
    # columns of these large files.
    sales_2018 = pd.read_csv(file, sep="|", usecols=["ARTICLE_ID", "UNITS"])
    # Keep only rows whose ARTICLE_ID is one of the tire articles.
    tire_sales_2018 = sales_2018[sales_2018["ARTICLE_ID"].isin(ARTICLE_ID_list)]
    # Sum UNITS within each ARTICLE_ID for this file.
    monthly_totals.append(tire_sales_2018.groupby(by=["ARTICLE_ID"]).sum())

# The result is indexed by ARTICLE_ID; an article present in several files
# still has one row per file at this point.
master_tire_sales_2018 = pd.concat(monthly_totals)

master_tire_sales_2018.info()
master_tire_sales_2018.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39162 entries, 9 to 7099847
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   39162 non-null  float64
dtypes: float64(1)
memory usage: 611.9 KB


Unnamed: 0_level_0,UNITS
ARTICLE_ID,Unnamed: 1_level_1
1405,1328.0
142486,4.0
5015,8.0
3459,30.0
7092836,0.0
7091165,2.0
7084436,1.0
112569,1.0
148300,66.0
7086373,1.0


In [24]:
# After the concat an ARTICLE_ID can appear once per source file, so add
# those rows together to get a single 2018 total per ARTICLE_ID.
units_grouped_2018 = master_tire_sales_2018.groupby("ARTICLE_ID")
master_tire_sales_2018 = units_grouped_2018.sum()
master_tire_sales_2018.info()
master_tire_sales_2018.sample(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7856 entries, 9 to 7099854
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   7856 non-null   float64
dtypes: float64(1)
memory usage: 122.8 KB


Unnamed: 0_level_0,UNITS
ARTICLE_ID,Unnamed: 1_level_1
7091280,39.0
5126,1458.0
57809,34.0
7086840,5.0
12071,1873.0
579,111.0
79297,2.0
7090995,0.0
7082430,1.0
7087805,2.0


In [25]:
# Write the 2018 per-article totals to the team folder as CSV
# (the ARTICLE_ID index is written as the first column).
totals_2018_out_path = "/data/p_dsi/teams2023/team7/tire_sales_article_id_and_total_units_2018.csv"
master_tire_sales_2018.to_csv(totals_2018_out_path)

In [26]:
# Stack the in-memory 2017 and 2018 per-article totals (nothing is re-read
# from disk here) and add up the UNITS of ARTICLE_IDs present in both.
combined_totals = pd.concat([master_tire_sales_2017, master_tire_sales_2018])
unit_tire_sales_year = combined_totals.groupby("ARTICLE_ID").sum()

# Keep only the articles whose combined UNITS are strictly greater than 5,000.
sold_over_5000 = unit_tire_sales_year["UNITS"].gt(5000)
unit_tire_sales_year = unit_tire_sales_year.loc[sold_over_5000]
unit_tire_sales_year.info()
unit_tire_sales_year.sample(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435 entries, 33 to 7099817
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UNITS   435 non-null    float64
dtypes: float64(1)
memory usage: 6.8 KB


Unnamed: 0_level_0,UNITS
ARTICLE_ID,Unnamed: 1_level_1
452,8328.0
122,6021.0
1734,27110.0
2815,11212.0
4014,23720.0
6478,18178.0
3421,14998.0
15165,6335.0
3503,14997.0
3843,18559.0


In [27]:
# Write the filtered per-article totals to the team folder as CSV
# (the ARTICLE_ID index is written as the first column).
last_12_months_out_path = "/data/p_dsi/teams2023/team7/tire_sales_article_id_and_total_units_last_12_months.csv"
unit_tire_sales_year.to_csv(last_12_months_out_path)

In [37]:
# Sanity check: read the saved totals file back in and count its ARTICLE_IDs.
last_12_months_csv_path = "/data/p_dsi/teams2023/team7/tire_sales_article_id_and_total_units_last_12_months.csv"
unit_tire_sales_last_12_months = pd.read_csv(last_12_months_csv_path, sep=',')

# Take the ARTICLE_ID column of the reloaded file as a plain Python list.
# NOTE(review): this rebinds ARTICLE_ID_list, which an earlier cell fills with
# every tire ARTICLE_ID from the product table. Re-running the earlier
# sales-filtering cells after this one would silently use this shorter list.
reloaded_article_ids = unit_tire_sales_last_12_months["ARTICLE_ID"]
ARTICLE_ID_list = reloaded_article_ids.tolist()

# Number of ARTICLE_IDs in the reloaded file.
len(ARTICLE_ID_list)

435

In [40]:
# Load the team's master_sample.csv (comma-delimited, default parser settings).
master_sample_csv_path = "/data/p_dsi/teams2023/team7/master_sample.csv"
master_sample = pd.read_csv(master_sample_csv_path)

In [41]:
# Print the column dtypes and non-null counts of the master sample.
master_sample.info()
# Display 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
master_sample.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0.2      100000 non-null  int64  
 1   Unnamed: 0.1      100000 non-null  int64  
 2   Unnamed: 0        100000 non-null  int64  
 3   STORE_ID          100000 non-null  int64  
 4   TRAN_ID           100000 non-null  int64  
 5   DATE              100000 non-null  object 
 6   ARTICLE_ID        100000 non-null  int64  
 7   INDIV_ID          99822 non-null   float64
 8   VEHICLE_ID        100000 non-null  int64  
 9   UNITS             100000 non-null  float64
 10  SALES             100000 non-null  float64
 11  MZB_INDIV_ID      53265 non-null   float64
 12  EMAIL_OPTIN_IND   53265 non-null   object 
 13  AH1_RES_BUS_INDC  51187 non-null   object 
 14  SUPP1_BUS_PANDER  51187 non-null   object 
 15  PROD_GROUP_CODE   100000 non-null  float64
 16  PROD_GROUP_DESC   100

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,STORE_ID,TRAN_ID,DATE,ARTICLE_ID,INDIV_ID,VEHICLE_ID,UNITS,...,CROSS_SECTION,ASPECT_RATIO,RIM_SIZE,STATE_CODE,ZIP_CODE,MSA,MAKE,MODEL,SUB_MODEL,MODEL_YEAR
77698,77698,184198,4728837,16357,992561910,2018-07-23,7089671,285492885.0,915800928,0.0,...,NONE,NONE,NONE,VA,22306,WASHINGTON,MAZDA,3 S,S Hatchback,2007.0
3181,3181,1127949,8617660,27499,991230250,2017-07-27,7090002,288465538.0,960352874,0.0,...,NONE,NONE,NONE,OH,43223,"COLUMBUS,O",CADILLAC,ATS PERFORMANCE,Performance,2013.0
2526,2526,1621763,6075735,240187,992613130,2017-05-11,7012601,261973659.0,936512026,0.0,...,NONE,NONE,NONE,OK,73110,,TOYOTA,COROLLA S,S,2011.0
9108,9108,1370917,244791,352044,990782810,2017-05-15,3431,258390732.0,936409750,1.0,...,215,60,16,CA,95843,SACRAMENTO,NISSAN,ALTIMA S,S 2.5,2014.0
70341,70341,1345903,117106,3328,991996290,2015-11-27,117751,284811922.0,897489268,0.0,...,205,55,16,AL,35124,BIRMINGHAM,CHEVROLET,COBALT LS,LS,2008.0
25432,25432,5001714,7125326,353227,990612250,2017-06-02,85893,253344155.0,929731040,1.0,...,195,65,15,NC,28079,"CHARLOTTE,",HONDA,ACCORD DX,DX,2000.0
94780,94780,1890282,13763863,4227,992257770,2018-05-30,7089226,542103504.0,1,0.0,...,NONE,NONE,NONE,FL,34471,"OCALA,FL",,,,
35809,35809,7749235,2337763,4340,991698670,2016-02-01,136893,271349457.0,692907204,1.0,...,225,55,16,FL,33020,FT.LAUDERD,FORD,MUSTANG BASE,Base,2003.0
33747,33747,3882991,11503188,679259,992554950,2016-10-01,122817,293629443.0,937982930,1.0,...,225,65,17,TX,77092,"HOUSTON,TX",CHEVROLET,EQUINOX LT,LT,2012.0
11675,11675,1245923,12090050,121983,990918230,2017-07-02,47150,285369059.0,930859511,4.0,...,205,70,15,FL,34474,,TOYOTA,SIENNA LE,LE,1998.0


In [42]:
# Keep only the master-sample rows whose ARTICLE_ID is in ARTICLE_ID_list.
in_selected_articles = master_sample['ARTICLE_ID'].isin(ARTICLE_ID_list)
master_sample_selected_article_ids = master_sample.loc[in_selected_articles]

In [43]:
# Print the column dtypes and non-null counts of the filtered master sample.
master_sample_selected_article_ids.info()
# Display 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
master_sample_selected_article_ids.sample(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46012 entries, 0 to 99997
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.2      46012 non-null  int64  
 1   Unnamed: 0.1      46012 non-null  int64  
 2   Unnamed: 0        46012 non-null  int64  
 3   STORE_ID          46012 non-null  int64  
 4   TRAN_ID           46012 non-null  int64  
 5   DATE              46012 non-null  object 
 6   ARTICLE_ID        46012 non-null  int64  
 7   INDIV_ID          45910 non-null  float64
 8   VEHICLE_ID        46012 non-null  int64  
 9   UNITS             46012 non-null  float64
 10  SALES             46012 non-null  float64
 11  MZB_INDIV_ID      23832 non-null  float64
 12  EMAIL_OPTIN_IND   23832 non-null  object 
 13  AH1_RES_BUS_INDC  22878 non-null  object 
 14  SUPP1_BUS_PANDER  22878 non-null  object 
 15  PROD_GROUP_CODE   46012 non-null  float64
 16  PROD_GROUP_DESC   46012 non-null  object

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,STORE_ID,TRAN_ID,DATE,ARTICLE_ID,INDIV_ID,VEHICLE_ID,UNITS,...,CROSS_SECTION,ASPECT_RATIO,RIM_SIZE,STATE_CODE,ZIP_CODE,MSA,MAKE,MODEL,SUB_MODEL,MODEL_YEAR
68247,68247,5267708,8877289,12505,991664800,2015-12-20,3419,290508082.0,944688407,1.0,...,225,55,17,WA,98901,"YAKIMA,WA",CHEVROLET,IMPALA LT,,2014.0
57562,57562,2161776,228629,4677,990914200,2015-04-09,122596,303886073.0,938258662,4.0,...,195,65,15,LA,70801,BATON ROUG,TOYOTA,COROLLA LE,LE,2011.0
32840,32840,6930549,13683677,20745,992092240,2016-11-22,122630,481720719.0,952780112,1.0,...,225,60,16,MS,39402,NON-MET MS,CHEVROLET,IMPALA LS,LS,2008.0
9318,9318,7567924,3664740,26573,992586100,2017-11-03,92319,256352544.0,960922564,2.0,...,195,70,14,VA,23231,"RICHMOND,V",CHEVROLET,CAVALIER BASE,Base,2001.0
61514,61514,1176166,13113344,678635,992473280,2015-05-14,97912,276120721.0,939086392,4.0,...,265,70,17,TX,79605,"ABILENE,TX",CHEVROLET,SUBURBAN 1500 LS,LS,2013.0
72323,72323,147086,3391146,28460,991480160,2015-07-20,136094,255494272.0,940702550,1.0,...,235,60,18,OR,97045,"PORTLAND,O",HONDA,PILOT TOURING,Touring,2012.0
25965,25965,4662980,15511974,1031,991875420,2017-10-11,15182,272834125.0,949900465,1.0,...,205,60,16,NJ,7042,"NEWARK,NJ",LEXUS,ES300 BASE,Base,2001.0
81345,81345,6184253,6592994,749144,990402490,2018-03-25,832,271891112.0,1,4.0,...,225,50,17,TX,77494,"HOUSTON,TX",,,,
35893,35893,1411923,1843051,635871,992062150,2016-09-20,122613,256596139.0,940538195,1.0,...,205,65,15,FL,32726,,CHRYSLER,SEBRING LX,LX Convertible,2002.0
84476,84476,4830441,7682507,25046,991453110,2018-10-08,26869,288200115.0,457320314,2.0,...,255,70,16,OK,73701,"ENID,OK",CHEVROLET,SILVERADO 1500 LS,LS 4x2,2000.0
