## 1. Defining the main parameters
We start by importing all the necessary dependencies and defining the main parameters mentioned above (200 serial number, 36 months, starting month in January 2017).

In [1]:
# importing all the libraries
import pandas as pd
import numpy as np
import uuid
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [2]:
# We then set the main parameters of our final dataset. The number of units we’d like to generate data for, maximum number of months warranty per unit and the start of the data collection.
# number of device uuids 
num_serial_num = 200

# number of months per device uuid
num_months = 36

# starting month of app usage data
start_month = '2017-01-01'

In [3]:
# generating unique identifiers for each device

# generating 200 serial numbers
serial = pd.Series([str(uuid.uuid4()) for i in range(0,num_serial_num)])
installbase = pd.DataFrame()
installbase['serial'] = pd.Series(serial)
installbase


Unnamed: 0,serial
0,6c19ff4f-bbf4-4d2b-ac7a-6417468c3815
1,d85308c7-2e35-4e95-b046-4121e3739fa2
2,8e299dff-ab6a-43ad-a7a3-acb0d1e0cf58
3,13131c64-7702-44bd-b418-d73941dcc11e
4,640a1567-60cc-45bc-8d60-2e1e0a230451
5,ad43ba22-2bb4-4fcf-96da-4eddad447fc9
6,0c75b37f-67dd-421f-a386-f19dd97240df
7,b6afabf2-96cd-4c43-a610-eb6d5bd6826b
8,fab03caa-2e00-4bf4-afc1-58bd99dd62de
9,c94edde2-79bc-49af-93a0-7a47381af55c


## Generating datetime

In [4]:
# reseting the index
installbase = installbase.reset_index().drop('index', 1)

# defining starting month and ending month
start_month_ts = pd.to_datetime(start_month)
end_month_ts = start_month_ts + relativedelta(months=+num_months - 1)

# making a Series out of the starting and ending month
months = pd.Series(pd.date_range(start_month_ts, end_month_ts, freq='MS'))

str(start_month_ts), str(end_month_ts)

('2017-01-01 00:00:00', '2019-12-01 00:00:00')

We can see that with the specified parameters (starting month, number of months) we will generate data from January 2017 to December 2019.

In [5]:
installdate = pd.Series(np.random.choice(months, size=num_serial_num))
installdate.head()

0   2018-02-01
1   2018-05-01
2   2018-09-01
3   2017-09-01
4   2018-11-01
dtype: datetime64[ns]

In [6]:
#  Adding install date column to the dataframe
installbase['install date'] = pd.concat([installdate] * num_serial_num, axis=0).reset_index().drop('index', 1)
installbase

Unnamed: 0,serial,install date
0,6c19ff4f-bbf4-4d2b-ac7a-6417468c3815,2018-02-01
1,d85308c7-2e35-4e95-b046-4121e3739fa2,2018-05-01
2,8e299dff-ab6a-43ad-a7a3-acb0d1e0cf58,2018-09-01
3,13131c64-7702-44bd-b418-d73941dcc11e,2017-09-01
4,640a1567-60cc-45bc-8d60-2e1e0a230451,2018-11-01
5,ad43ba22-2bb4-4fcf-96da-4eddad447fc9,2018-02-01
6,0c75b37f-67dd-421f-a386-f19dd97240df,2019-06-01
7,b6afabf2-96cd-4c43-a610-eb6d5bd6826b,2018-12-01
8,fab03caa-2e00-4bf4-afc1-58bd99dd62de,2018-01-01
9,c94edde2-79bc-49af-93a0-7a47381af55c,2019-09-01


## Adding Warranty data

In [7]:
# Customer have a choice of 1, 2 or 3 year warranty
num_warranty = 3  

# assign cohorts to users randomly (when did the user first use the product?)
warranty = pd.DataFrame()
warranty['warranty'] = np.random.randint(low=1, high=num_warranty, size=num_serial_num)
warranty.head()



Unnamed: 0,warranty
0,1
1,1
2,2
3,1
4,2


In [8]:
installbase['warranty'] = pd.concat([warranty], axis=0).reset_index().drop('index', 1)
installbase.head()

Unnamed: 0,serial,install date,warranty
0,6c19ff4f-bbf4-4d2b-ac7a-6417468c3815,2018-02-01,1
1,d85308c7-2e35-4e95-b046-4121e3739fa2,2018-05-01,1
2,8e299dff-ab6a-43ad-a7a3-acb0d1e0cf58,2018-09-01,2
3,13131c64-7702-44bd-b418-d73941dcc11e,2017-09-01,1
4,640a1567-60cc-45bc-8d60-2e1e0a230451,2018-11-01,2


In [21]:
end_contract = (installdate + np.timedelta64(1, 'Y'))
end_contract

0     2020-02-01 11:38:24
1     2020-04-30 11:38:24
2     2020-08-31 11:38:24
3     2019-09-01 11:38:24
4     2020-10-31 11:38:24
5     2020-02-01 11:38:24
6     2021-05-31 11:38:24
7     2020-11-30 11:38:24
8     2020-01-01 11:38:24
9     2021-08-31 11:38:24
10    2021-08-31 11:38:24
11    2019-08-01 11:38:24
12    2019-11-01 11:38:24
13    2019-05-01 11:38:24
14    2019-03-01 11:38:24
15    2021-07-31 11:38:24
16    2019-01-01 11:38:24
17    2019-10-01 11:38:24
18    2020-08-31 11:38:24
19    2021-06-30 11:38:24
20    2019-10-01 11:38:24
21    2019-10-01 11:38:24
22    2020-08-31 11:38:24
23    2021-06-30 11:38:24
24    2020-04-30 11:38:24
25    2020-10-31 11:38:24
26    2020-01-01 11:38:24
27    2021-09-30 11:38:24
28    2019-08-01 11:38:24
29    2021-07-31 11:38:24
              ...        
170   2019-07-01 11:38:24
171   2021-04-30 11:38:24
172   2021-04-30 11:38:24
173   2020-12-31 11:38:24
174   2021-06-30 11:38:24
175   2021-06-30 11:38:24
176   2021-05-31 11:38:24
177   2021-0

In [19]:
# test = installbase.loc[installbase[;'warranty'] == 1]

# test = (installbase.loc[installbase.loc[:, 'warranty'] == 1])
test = (installbase.loc[installbase.loc[:, 'warranty'] == 1])
    
print(test)

                                   serial install date  warranty
0    6c19ff4f-bbf4-4d2b-ac7a-6417468c3815   2018-02-01         1
1    d85308c7-2e35-4e95-b046-4121e3739fa2   2018-05-01         1
3    13131c64-7702-44bd-b418-d73941dcc11e   2017-09-01         1
7    b6afabf2-96cd-4c43-a610-eb6d5bd6826b   2018-12-01         1
8    fab03caa-2e00-4bf4-afc1-58bd99dd62de   2018-01-01         1
9    c94edde2-79bc-49af-93a0-7a47381af55c   2019-09-01         1
10   e4e66ad2-7bc1-4d6d-80ad-f6a33d287bd6   2019-09-01         1
12   07e5cce4-7878-432a-a7a9-74d7f8d53f4f   2017-11-01         1
14   1b3d0af5-183c-457a-8d32-6f8d26a063f6   2017-03-01         1
15   4545e513-c50c-4079-9288-9324feb65535   2019-08-01         1
16   95eb272b-a12f-4dcf-a9d1-aa073002cb55   2017-01-01         1
18   34d912f0-33f7-4b8b-84a9-b80cb20455f1   2018-09-01         1
19   8970bbe9-353d-44ed-ba91-ddf4727c1eae   2019-07-01         1
23   5a7cd75b-b9d8-44fb-b85c-c2b615cec2fa   2019-07-01         1
25   fb4b0063-6ff1-4711-a

In [18]:
#  Define function to calculate out of warranty date

def outofwarranty(installbase.loc[installbase.loc[:, 'warranty']):

    if (installbase.loc[installbase.loc[:, 'warranty'] == 1]):
       end_contract = (installdate + np.timedelta64(1, 'Y'))
    elif (installbase.loc[installbase.loc[:, 'warranty'] == 2]):
        end_contract = (installdate + np.timedelta64(2, 'Y'))
    else 
        end_contract = (installdate + np.timedelta64(3, 'Y'))
    
# how many months to remove from the beginning of a dataframe?
def add_cohorts_to_df(device_uuid):
    
    # 1. slice user_data, extract device data for each user
    device_uuid_data = user_data[user_data['device_uuid'] == device_uuid]
    
    # 2. find cohort group of the device and remove all unnecessary months
    device_uuid_cohort_group = int(cohorts[cohorts['device_uuid'] == device_uuid]['cohort_group'])
    device_uuid_data = device_uuid_data[device_uuid_cohort_group:] 
    
    return(device_uuid_data)

SyntaxError: invalid syntax (<ipython-input-18-ad29279bedac>, line 3)

In [None]:
#outofwarrantydate = pd.Series(warranty.replace(year.year + 1))
#outofwarrantydate = pd.Series('install date' + relativedelta(months=+num_months - 1))
outofwarrantydate = installdate + relativedelta(num_months - 1)
outofwarrantydate.head()

## 4. Generating categorical features

In [None]:
# Defining the variables
platforms = ['iOS', 'Android']
countries = ['IE', 'GB', 'NL', 'FR', 'DE', 'BE', 'DK']
service_contract = [False, True]

4.1. Generating categorical feature weights
Defining weights for the likelihood of a categorical feature associated with an individual unit.