## 1. Defining the main parameters
We start by importing all the necessary dependencies and defining the main parameters mentioned above (200 serial number, 36 months, starting month in January 2017).

In [97]:
# importing all the libraries
import pandas as pd
import numpy as np
import uuid
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [98]:
# We then set the main parameters of our final dataset. The number of units we’d like to generate data for, maximum number of months warranty per unit and the start of the data collection.
# number of device uuids 
num_serial_num = 200

# number of months per device uuid
num_months = 36

# starting month of app usage data
start_month = '2017-01-01'

In [99]:
# generating unique identifiers for each device

# generating 200 serial numbers
serial = pd.Series([str(uuid.uuid4()) for i in range(0,num_serial_num)])
installbase = pd.DataFrame()
installbase['serial'] = pd.Series(serial)
installbase


Unnamed: 0,serial
0,e3761cc9-51a2-4d46-a721-e6944c7f0a08
1,77f58b8c-d7b0-4df2-87eb-c81b5385018e
2,304a8757-0ac6-438c-bc67-8ac6a76ace80
3,699852ed-66b0-4096-adf7-6d4d35c7b13c
4,7cc13c12-b9d2-43b1-b68c-c1deaade6be5
5,4938277f-d361-40e8-8910-eff78edf415f
6,5db9afe4-b486-40ce-a131-02f2ee6fdffd
7,89b3714c-8100-40ec-8f74-4709193e5fb3
8,3bd6ce6c-791b-4237-bb02-95d2733b82e9
9,55d09ea3-11d5-4708-8766-652d1d8c6ba2


## Generating datetime

In [100]:
# reseting the index
installbase = installbase.reset_index().drop('index', 1)

# defining starting month and ending month
start_month_ts = pd.to_datetime(start_month)
end_month_ts = start_month_ts + relativedelta(months=+num_months - 1)

# making a Series out of the starting and ending month
months = pd.Series(pd.date_range(start_month_ts, end_month_ts, freq='MS'))

str(start_month_ts), str(end_month_ts)

('2017-01-01 00:00:00', '2019-12-01 00:00:00')

We can see that with the specified parameters (starting month, number of months) we will generate data from January 2017 to December 2019.

In [101]:
installdate = pd.Series(np.random.choice(months, size=num_serial_num))
installdate.head()

0   2019-04-01
1   2018-08-01
2   2018-10-01
3   2018-07-01
4   2017-08-01
dtype: datetime64[ns]

In [102]:
#  Adding install date column to the dataframe
installbase['install date'] = pd.concat([installdate] * num_serial_num, axis=0).reset_index().drop('index', 1)
installbase

Unnamed: 0,serial,install date
0,e3761cc9-51a2-4d46-a721-e6944c7f0a08,2019-04-01
1,77f58b8c-d7b0-4df2-87eb-c81b5385018e,2018-08-01
2,304a8757-0ac6-438c-bc67-8ac6a76ace80,2018-10-01
3,699852ed-66b0-4096-adf7-6d4d35c7b13c,2018-07-01
4,7cc13c12-b9d2-43b1-b68c-c1deaade6be5,2017-08-01
5,4938277f-d361-40e8-8910-eff78edf415f,2017-11-01
6,5db9afe4-b486-40ce-a131-02f2ee6fdffd,2018-05-01
7,89b3714c-8100-40ec-8f74-4709193e5fb3,2019-12-01
8,3bd6ce6c-791b-4237-bb02-95d2733b82e9,2017-10-01
9,55d09ea3-11d5-4708-8766-652d1d8c6ba2,2018-01-01


## Adding Warranty data

In [103]:
# Customer have a choice of 1, 2 or 3 year warranty
num_warranty = 3  

# assign cohorts to users randomly (when did the user first use the product?)
warranty = pd.DataFrame()
warranty['warranty'] = np.random.randint(low=1, high=num_warranty, size=num_serial_num)
warranty.head()



Unnamed: 0,warranty
0,2
1,2
2,1
3,1
4,2


In [104]:
installbase['warranty'] = pd.concat([warranty], axis=0).reset_index().drop('index', 1)
installbase.head()

Unnamed: 0,serial,install date,warranty
0,e3761cc9-51a2-4d46-a721-e6944c7f0a08,2019-04-01,2
1,77f58b8c-d7b0-4df2-87eb-c81b5385018e,2018-08-01,2
2,304a8757-0ac6-438c-bc67-8ac6a76ace80,2018-10-01,1
3,699852ed-66b0-4096-adf7-6d4d35c7b13c,2018-07-01,1
4,7cc13c12-b9d2-43b1-b68c-c1deaade6be5,2017-08-01,2


In [127]:
test = installbase.loc[installbase[;'warranty'] = 1]
print(test)

SyntaxError: invalid syntax (<ipython-input-127-1cc2ae67eea4>, line 1)

In [124]:
#  Adding outofwarranty date column to the dataframe

start_contract = installdate
if installbase.loc[installbase['warranty'] = 1]
    end_contract = start_contract + np.timedelta64(1, 'Y')
elif installbase.loc[installbase['warranty'] = 2]
    end_contract = start_contract + np.timedelta64(2, 'Y') 
else 
    end_contract = start_contract + np.timedelta64(3, 'Y') 
    
end_contract
#outofwarrantydate =  pd.Series(installbase['install date'])
#outofwarrantydate = pd.to_datetime(outofwarrantydate)
#outofwarrantydate = outofwarrantydate+relativedelta(months=+1)
#outofwarrantydate

SyntaxError: invalid syntax (<ipython-input-124-cc222e7c1528>, line 4)

In [None]:
#outofwarrantydate = pd.Series(warranty.replace(year.year + 1))
#outofwarrantydate = pd.Series('install date' + relativedelta(months=+num_months - 1))
outofwarrantydate = installdate + relativedelta(num_months - 1)
outofwarrantydate.head()

## 4. Generating categorical features

In [None]:
# Defining the variables
platforms = ['iOS', 'Android']
countries = ['IE', 'GB', 'NL', 'FR', 'DE', 'BE', 'DK']
service_contract = [False, True]

4.1. Generating categorical feature weights
Defining weights for the likelihood of a categorical feature associated with an individual unit.