In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from input_data import INPUT_DATA
from create_dataset import create_datastructure

%matplotlib inline

### Gather Data

In [None]:
# Build the raw data structure from INPUT_DATA using the project helper.
data = create_datastructure(INPUT_DATA)

In [None]:
# Build a DataFrame from `data`; orient='index' turns each key of `data`
# into a row label.
df = pd.DataFrame.from_dict(data, orient='index')
# Preview the first five rows.
df.head(5)

### Assess Data

In [None]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Move the row labels into a regular column (pandas names it "index").
# Guarded so the cell is idempotent: once the labels already live in a column
# ("index", or "UserId" after the rename further down), re-running the cell
# must not push the fresh RangeIndex into an extra junk column.
if "index" not in df.columns and "UserId" not in df.columns:
    df = df.reset_index()

In [None]:
# Preview the frame now that the row labels are an ordinary column.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Give the column created by reset_index() ("index") a meaningful name: "UserId".
df = df.rename(columns={"index": "UserId"})

#### Notes:
Our collected data has `2500 rows` and `5 columns`

Every column except `UserId` has missing data:
- `GuideDownload` 12 records
- `GuideSession` 11 records
- `ConnectionRequested` 1990 records
- `PhotoUpload` 1824 records

The values in the columns of interest are floats, except for the `UserId` column, which holds integers.


### Cleaning Data

In [None]:
# Clean a copy so the assessed frame `df` is left untouched.
df_copy = df.copy()

*Issue*: missing values are represented as `NaN`.

*Solution*: replace the missing values with the (rounded) mean of each column.

In [None]:
# Fill values for imputation: the mean of each event column, rounded to a
# whole number so the filled-in values look like the event counts they replace.
# `UserId` is an identifier, not a measurement, so it is left out of the means;
# numeric_only=True keeps the cell working if a non-numeric column is present.
column_means = (
    df_copy
    .drop(columns="UserId", errors="ignore")
    .mean(numeric_only=True)
    .round(0)
)
print(column_means)

In [None]:
# Replace each NaN with the rounded mean of its own column
# (fillna matches the Series entries to columns by label).
df_copy = df_copy.fillna(column_means)

In [None]:
# Verify the imputation programmatically: no column may still hold NaN.
remaining_nans = df_copy.isna().sum()
assert remaining_nans.sum() == 0, (
    f"columns still contain NaN:\n{remaining_nans[remaining_nans > 0]}"
)
# Visual confirmation: every column should now report the full non-null count.
df_copy.info()

In [None]:
# Preview the cleaned frame.
df_copy.head()

In [None]:
# Snapshot the cleaned frame as `df_master`, the frame that is stored and plotted below.
df_master = df_copy.copy()

### Storing

In [None]:
# Persist the cleaned master frame to df_events_master.csv
# (UTF-8 encoded, without writing the row index).
df_master.to_csv(
    'df_events_master.csv',
    index=False,
    encoding='utf-8',
)

### Exploration

For each analysis, one has to seek to answer certain questions about the data


Let's start our exploration by looking at users bucketed by number of times they triggered a `PhotoUpload` event. Is the distribution skewed or symmetric? Is it unimodal or multimodal?

In [None]:
# Univariate view: number of users in each PhotoUpload bucket.
# An explicit figure/axes pair is used instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.hist(df_master['PhotoUpload'])
ax.set_title('Distribution of Photo Upload(s) ')
ax.set_xlabel('Photo Uploads')
# The trailing ';' keeps the Text(...) repr of the last call out of the cell output.
ax.set_ylabel('Users');

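The plot shows a fairly right-skewed distribution, with the majority of guide users uploading a photo 7 times to the guide. Note that 1824 of the 2500 `PhotoUpload` values were missing and were replaced with the rounded column mean, so this peak may largely reflect the imputed value rather than observed behaviour.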

We can further extend our analysis to show the relationship between Photos Uploaded to the guide and the guide session

In [None]:
# Bivariate view: GuideSession (y) against PhotoUpload (x), one point per row of df_master.
fig, ax = plt.subplots(figsize=(12, 10))
# First colour of the active palette; previously computed but never used,
# it is now passed explicitly so every point is drawn in that single colour.
base_color = sb.color_palette()[0]
ax.scatter(df_master['PhotoUpload'], df_master['GuideSession'], color=base_color)
ax.set_title('Relationship between Guide Session and Photo Upload')
# Vertical tick labels on the x axis, as in the original plot.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Photo Upload')
# The trailing ';' keeps the Text(...) repr of the last call out of the cell output.
ax.set_ylabel('Guide Session');

The plot suggests that users who upload the most photos also open up the guide the longest on their phones.

Lastly, I would like to explore how `PhotoUpload` relates to `GuideDownload`. In the previous plots we saw that users who upload the most photos use the guide often, so do these users also download the guide often?

In [None]:
# Heat map: number of users for each (PhotoUpload, GuideDownload) combination.
fig, ax = plt.subplots(figsize=(12, 10))
# Unit-wide bins centred on whole numbers: x covers 1..10, y covers 0..10.
# NOTE(review): the x edges start at 0.5 while the y edges start at -0.5, so
# rows with PhotoUpload == 0 fall outside the grid — confirm this is intended.
bins_x = np.arange(0.5, 10.5+1, 1)
bins_y = np.arange(-0.5, 10.5+1, 1)
# cmin=0.5 leaves cells that contain no users blank instead of colouring them.
# hist2d returns (counts, x_edges, y_edges, image); the image is the mappable
# the colorbar needs, so it is passed explicitly rather than via pyplot state.
heatmap = ax.hist2d(df_master['PhotoUpload'], df_master['GuideDownload'],
                    bins=[bins_x, bins_y], cmap='viridis_r', cmin=0.5)[3]
fig.colorbar(heatmap, ax=ax, label='Users')
ax.set_xlabel('PhotoUpload')
ax.set_ylabel('GuideDownload')
# The trailing ';' keeps the Text(...) repr of the last call out of the cell output.
ax.set_title('Relationship between Guide Download and Photo Upload');

The plot suggests that guide downloads are fairly low for the users who upload the most photos.

### Conclusion

The majority of guide users upload at least 7 photos to the guide. The data also suggests that users who upload more photos open the guide the most. However, this pattern does not hold for guide downloads.
There is likely another factor affecting guide downloads, so we cannot rely on photo uploads alone.