# Simple conclusions after the first data exploration
1. brochure_views:
    - missing duration values
    - negative values
    - view_duration and page_turn_count are not normalized
2. app_starts:
    - duplicated values
    - inconsistent values with installs data

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute path of the directory one level above the current working
# directory; the raw input files live in its "dataset" sub-folder.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
RAW_DATA_PATH = os.path.join(BASE_DIR, "dataset")

In [None]:
def _read_tsv(file_name):
    """Read one tab-separated file from RAW_DATA_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(RAW_DATA_PATH, file_name), sep="\t")


# Raw inputs: installs, plus the brochure-view and app-start files together
# with their separate "july" counterparts.
installs = _read_tsv("installs.txt")
brochure_views = _read_tsv("brochure views.txt")
brochure_views_july = _read_tsv("brochure views july.txt")
app_starts = _read_tsv("app starts.txt")
app_starts_july = _read_tsv("app starts july.txt")

In [None]:
# Parse the date column of every frame in place. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
for _frame, _date_column in (
    (installs, 'InstallDate'),
    (brochure_views, 'dateCreated'),
    (brochure_views_july, 'dateCreated'),
    (app_starts, 'dateCreated'),
    (app_starts_july, 'dateCreated'),
):
    _frame[_date_column] = pd.to_datetime(_frame[_date_column], errors='coerce')

# Replace missing and negative values with the minimum non-negative value, then normalize

In [None]:

def _fill_invalid_durations(frame, fill_value):
    """Overwrite negative or missing ``view_duration`` values of *frame*.

    The column is replaced in place on *frame*; valid values (>= 0) are kept
    unchanged.
    """
    durations = frame["view_duration"]
    invalid = (durations < 0) | durations.isnull()
    frame["view_duration"] = durations.mask(invalid, fill_value)


# Smallest strictly positive duration observed in brochure_views. It is used
# as the replacement value for invalid entries (previously this value was
# computed but a hard-coded 1000 was filled in instead).
min_value = brochure_views['view_duration'][brochure_views['view_duration'] > 0].min()
if pd.isna(min_value):
    # No positive duration available: keep the former hard-coded fallback so
    # the fill never writes NaN back into the column.
    min_value = 1000

# The July frame is filled with the same value so both frames are cleaned
# consistently.
_fill_invalid_durations(brochure_views, min_value)
_fill_invalid_durations(brochure_views_july, min_value)

# log1p-transform the skewed columns into new '<col>_log' columns.
# NOTE(review): page_turn_count is not cleaned above; log1p yields NaN/-inf
# for values <= -1 - confirm that column has no negative entries.
normalize_col = ['view_duration', 'page_turn_count']
for col in normalize_col:
    brochure_views[f'{col}_log'] = np.log1p(brochure_views[col])
    brochure_views_july[f'{col}_log'] = np.log1p(brochure_views_july[col])

# Remove duplicates and inconsistent data

In [None]:
# Drop exact duplicate rows, then attach each user's InstallDate (left join,
# so app starts without a matching install are kept).
_install_dates = installs[['userId', 'InstallDate']]
app_starts = app_starts.drop_duplicates().merge(_install_dates, on='userId', how='left')

# Row-wise maximum of the two date columns: a dateCreated earlier than the
# InstallDate is replaced by the InstallDate.
app_starts['dateCreated'] = app_starts[['dateCreated', 'InstallDate']].max(axis=1)

# InstallDate was only needed for the comparison above.
app_starts = app_starts.drop(columns=['InstallDate'])
app_starts.head()

In [None]:
installs.head()
app_starts_july.head()
# 'InstallDate_x' / 'InstallDate_y' are the suffixed names pandas produces
# when a merge brings in a column that already exists - presumably left over
# from running the InstallDate merge cell more than once. On a clean
# top-to-bottom run they do not exist, so errors='ignore' keeps this cleanup
# from raising KeyError while still removing them when present.
app_starts_july.drop(
    columns=['InstallDate_x', 'InstallDate_y'],
    errors='ignore',
    inplace=True,
)

In [None]:
# Same cleaning as for app_starts: de-duplicate, left-join the install date,
# raise dateCreated to InstallDate where it is earlier, then drop the helper
# column again.
_july_install_dates = installs[['userId', 'InstallDate']]
app_starts_july = app_starts_july.drop_duplicates().merge(
    _july_install_dates, on='userId', how='left'
)
app_starts_july['dateCreated'] = app_starts_july[['dateCreated', 'InstallDate']].max(axis=1)
app_starts_july = app_starts_july.drop(columns=['InstallDate'])

# visualization after cleaning

In [None]:
# Histograms of the log1p-transformed columns, to inspect the effect of the
# log transform.
_log_columns = ['view_duration_log', 'page_turn_count_log']
brochure_views[_log_columns].hist(figsize=(10, 5))

In [None]:
# Shape of the subset of rows whose dateCreated is missing, for each
# app-start frame.
for _starts in (app_starts, app_starts_july):
    print(_starts.loc[_starts['dateCreated'].isna()].shape)

In [None]:
# Summary statistics of the numeric brochure_views columns.
_brochure_summary = brochure_views.describe()
print("\nBrochure Views Describe:\n")
display(_brochure_summary)

In [None]:
from src.genral import save_dataframes_to_pickle
# Target folder for the cleaned frames.
SAVE_CLEAN_DATA_PATH = os.path.join(BASE_DIR, "data", "clean_data")

# Output file name -> cleaned frame; insertion order keeps names and frames
# aligned when they are split into the two parallel lists passed below.
_frames_by_file = {
    'installs.pkl': installs,
    'brochure_views.pkl': brochure_views,
    'brochure_views_july.pkl': brochure_views_july,
    'app_starts.pkl': app_starts,
    'app_starts_july.pkl': app_starts_july,
}
file_names = list(_frames_by_file)
dataframes = list(_frames_by_file.values())

save_dataframes_to_pickle(dataframes, file_names, SAVE_CLEAN_DATA_PATH)


# Time-series analysis

In [None]:
# Overall time span covered by the brochure views.
_view_dates = brochure_views['dateCreated']
earliest_date, latest_date = _view_dates.min(), _view_dates.max()

print("Earliest brochure view date:", earliest_date)
print("Latest brochure view date:", latest_date)

In [None]:
# One row per (userId, brochure_id): first and last view date plus the
# summed / averaged log-transformed duration and summed log page turns.
user_date_range = brochure_views.groupby(['userId', 'brochure_id']).agg(
    earliest_view=('dateCreated', 'min'),
    latest_view=('dateCreated', 'max'),
    total_duration=('view_duration_log', 'sum'),
    avg_duration=('view_duration_log', 'mean'),
    total_pages=('page_turn_count_log', 'sum'),
).reset_index()

# Despite its name this column is log1p of the number of whole days between
# the earliest and the latest view, not a raw day count.
_view_span = user_date_range['latest_view'] - user_date_range['earliest_view']
user_date_range['view_duration_days'] = np.log1p(_view_span.dt.days)

display(user_date_range.head())

# The frame has one row per user-brochure pair, so its row count is not the
# number of users; report distinct users and the pair count separately.
print("Number of users with view data:", user_date_range['userId'].nunique())
print("Number of user-brochure pairs:", user_date_range.shape[0])

In [None]:
# Aggregate views per user, brochure and calendar day. dateCreated is first
# normalized to midnight so that several views on the same day fall into one
# group; grouping on the raw timestamp would only merge views with an
# identical timestamp. The column keeps its name and datetime dtype.
grouped_by_day = (
    brochure_views
    .assign(dateCreated=brochure_views['dateCreated'].dt.normalize())
    .groupby(['userId', 'brochure_id', 'dateCreated'])
    .agg(
        total_views=('view_duration_log', 'count'),
        total_duration=('view_duration_log', 'sum'),
        average_duration=('view_duration_log', 'mean'),
    )
    .reset_index()
)
display(grouped_by_day.head())

# Rows are user-brochure-day groups, so the row count is not the number of
# users; report distinct users and the group count separately.
print("Number of users with view data:", grouped_by_day['userId'].nunique())
print("Number of user-brochure-day groups:", grouped_by_day.shape[0])
# Number of distinct average_duration values across the groups.
print(grouped_by_day['average_duration'].unique().shape[0])


In [None]:
# Histogram (with KDE overlay) of user_date_range['view_duration_days'].
# That column holds log1p of the day span between a pair's earliest and
# latest view, so the x axis is a transformed day count, not a date.
plt.figure(figsize=(10, 4))
sns.histplot(user_date_range['view_duration_days'], kde=True, bins=30)
plt.title("Distribution of Users view_duration_days")
plt.xlabel("log1p(days between earliest and latest view)")
plt.ylabel("Number of Users")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Histogram (30 bins, KDE overlay) of the earliest view date per row of
# user_date_range.
_fig, _ax = plt.subplots(figsize=(10, 4))
sns.histplot(user_date_range['earliest_view'], kde=True, bins=30, ax=_ax)
_ax.set_title("Distribution of Users' Earliest Brochure View Dates")
_ax.set_xlabel("Date")
_ax.set_ylabel("Number of Users")
plt.xticks(rotation=45)
_fig.tight_layout()
plt.show()

In [None]:
# Reduce latest_view to plain calendar dates (Python date objects). Going
# through pd.to_datetime first makes the cell safe to re-run: after the first
# execution the column no longer has a datetime dtype, and calling .dt on it
# directly would raise AttributeError.
user_date_range['latest_view'] = pd.to_datetime(user_date_range['latest_view']).dt.date

# Histogram (with KDE overlay) of the latest view date per row of
# user_date_range.
plt.figure(figsize=(10, 4))
sns.histplot(user_date_range['latest_view'], kde=True, bins=30, color='orange')
plt.title("Distribution of Users' Latest Brochure View Dates")
plt.xlabel("Date")
plt.ylabel("Number of Users")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()