In [None]:
#import libraries
import pandas as pd


In [None]:
#load the three source CSV files into data frames (usage figures, user-to-device mapping, device catalogue)
user_usage, user_device, devices = map(
    pd.read_csv,
    ("data/user_usage.csv", "data/user_device.csv", "data/android_devices.csv"),
)

In [None]:
#### Sanity Checks

In [None]:
# Quick look at user_usage: first/last rows, element count, dimensions,
# column dtypes and summary statistics.
print(user_usage.head())
print(user_usage.tail())
print(user_usage.size)
print(user_usage.shape)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
user_usage.info()
print(user_usage.describe())


In [None]:
# Quick look at user_device: first/last rows, element count, dimensions,
# column dtypes and summary statistics.
print(user_device.head())
print(user_device.tail())
print(user_device.size)
print(user_device.shape)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
user_device.info()
print(user_device.describe())


In [None]:
# Quick look at devices: first/last rows, element count, dimensions,
# column dtypes and summary statistics.
print(devices.head())
print(devices.tail())
print(devices.size)
print(devices.shape)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
devices.info()
print(devices.describe())


In [None]:
#expose the "Retail Branding" column under the clearer name "manufacturer"
#The original column is kept alongside the new one: later cells still select and
#group by "Retail Branding", so renaming it away in place made them raise KeyError
#when the notebook is run top to bottom. assign() also returns a new frame, so
#re-running this cell is harmless.
devices = devices.assign(manufacturer=devices["Retail Branding"])

In [None]:
#Goal

#get the average usage figures for different types of devices
#create a new dataframe called 'result' that holds the rows of user_usage plus each user's device code (taken from user_device) as a column
#add a column to the result data frame with the device's manufacturer from the devices data frame

In [None]:
#### Inner Merge / Inner Join
#use the pandas merge function to keep only those rows whose merge key (the `on` parameter) exists in both data frames

In [None]:
# Join the usage figures with each user's platform and device code.
# No `how` is given, so merge falls back to its default inner join on use_id.
device_info = user_device[['use_id', 'platform', 'device']]
result = user_usage.merge(device_info, on='use_id')

In [None]:
#run sanity checks on the merged frame: first/last rows, dimensions, element
#count, column dtypes and summary statistics
print(result.head())
print(result.tail())
print(result.shape)
print(result.size)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
result.info()
print(result.describe())

In [None]:
# Count, for each frame, how many of its use_id values also occur in the other
# frame (True) and how many do not (False).
usage_ids = user_usage['use_id']
device_ids = user_device['use_id']
print(usage_ids.isin(device_ids).value_counts())
print(device_ids.isin(usage_ids).value_counts())

In [None]:
#left merge / left join
# a left merge keeps all the rows from the left data frame (user_usage); rows from the right data frame (user_device) are only kept when there is a match on the `on` parameter, and their columns are NaN when there is no match.

In [None]:
# Left join on use_id: user_usage is the left frame, the selected user_device
# columns are the right frame.
device_info = user_device[['use_id', 'platform', 'device']]
result = user_usage.merge(device_info, how='left', on='use_id')

result.shape

In [None]:
# Report the column dtypes and non-null counts of the current `result` frame.
result.info()

In [None]:
#right merge / right join
# a right merge keeps all the rows from the right data frame (user_device); rows from the left data frame (user_usage) are only kept when there is a match on the `on` parameter, and their columns are NaN when there is no match.


In [None]:
# Right join on use_id: user_usage is the left frame, the selected user_device
# columns are the right frame.
device_info = user_device[['use_id', 'platform', 'device']]
result = user_usage.merge(device_info, how='right', on='use_id')

result.shape

In [None]:
# Print every name from the nested list of user groups, one per line,
# walking the groups in order.
users = [['Bob', 'Mike', 'Jose'], ['Jim', "Ron"]]
for name in (member for group in users for member in group):
  print(name)



In [None]:
#count the missing values in the monthly_mb column of the merged frame
missing_monthly_mb = result['monthly_mb'].isna().sum()
print(missing_monthly_mb)

In [None]:
#count the missing values in the platform column of the merged frame
missing_platform = result['platform'].isna().sum()
print(missing_platform)

In [None]:
#outer merge / outer join
# a full outer merge (outer join) keeps all the rows from both the left (user_usage) and the right (user_device) data frames. Rows are aligned where the two frames share a key value; where they do not, the columns from the missing side are NaN.

In [None]:
# Outer join on use_id: user_usage is the left frame, the selected user_device
# columns are the right frame. indicator=True adds a `_merge` column recording
# whether each row came from the left frame, the right frame, or both.
device_info = user_device[['use_id', 'platform', 'device']]
result = user_usage.merge(device_info, how='outer', on='use_id', indicator=True)

result.shape

In [None]:
# Print the current `result` frame as plain text (after the outer merge this
# includes the `_merge` indicator column).
print(result)

In [None]:
#how many distinct use_id values appear across the two data frames combined
all_use_ids = pd.concat([user_usage['use_id'], user_device['use_id']])
len(all_use_ids.unique())

In [None]:
#number of rows with no missing values
# notna().all(axis=1) flags the fully populated rows in one vectorized pass,
# replacing a row-by-row apply() with a Python lambda; summing the boolean
# flags gives the count (and stays a scalar even for an empty frame).
print(result.notna().all(axis=1).sum())

In [None]:
### Final merge - adding device manufacturer
# Left join that matches result['device'] against devices['Model'] and brings
# the 'Retail Branding' and 'Model' columns into result.
# NOTE(review): if 'Model' is not unique in devices, this join repeats the
# matching result rows - confirm against the data.
branding_lookup = devices[['Retail Branding', 'Model']]
result = result.merge(
    branding_lookup,
    how='left',
    left_on='device',
    right_on='Model',
)

result.shape

In [None]:
# Report the column dtypes and non-null counts of `result` after the final merge.
result.info()

In [None]:
# Display summary statistics for `result` (describe() with its default settings).
result.describe()

In [None]:
####statistics on the final result
# Per 'Retail Branding' value: the mean of each usage figure plus the number of
# use_id entries in the group. as_index=False keeps the branding as a regular column.
usage_stats = {
    column: "mean"
    for column in ("outgoing_mins_per_month", "outgoing_sms_per_month", "monthly_mb")
}
usage_stats["use_id"] = "count"
result.groupby('Retail Branding', as_index=False).agg(usage_stats)