In [0]:
import pyspark
#imports the pyspark library

In [0]:
log_file = sc.textFile("dbfs:/FileStore/shared_uploads/niveditajoshi22@gmail.com/2015_07_22_mktplace_shop_web_log_sample__2__log-3.gz")
#load the log file into Spark

In [0]:
sessionized_log = log_file.mapPartitions(lambda x: sessionize(x))
#sessionize the web log by IP

In [0]:
def sessionize(x):
  """
  Sessionizes a list of page hits by IP.

  Args:
    x: A list of page hits.

  Returns:
    A list of sessions.
  """

  sessions = []
  current_session = []
  for page_hit in x:
    ip = page_hit.split(",")[0]
    if len(current_session) == 0 or ip == current_session[0][0]:
      current_session.append(page_hit)
    else:
      sessions.append(current_session)
      current_session = [page_hit]
  if len(current_session) > 0:
    sessions.append(current_session)
  return sessions

# the code def sessionize(x) takes a list of page hits and groups them together by IP address. it creates a new list for each unique IP address that appears in the log file and then returns the list of sessions.

In [0]:
average_session_time = sessionized_log.map(lambda x: len(x)).mean()
#determining the average session time

In [0]:
print(average_session_time)
#prints the average session time

1.0


In [0]:
def unique_url_visits_per_session(sessions):
  """
  Determines the unique URL visits per session.

  Args:
    sessions: A list of sessions.

  Returns:
    A dictionary of unique URL visits per session.
  """

  unique_url_visits = {}
  for session in sessions:
    unique_urls = set()
    for page_hit in session:
      url = page_hit.split(",")[1]
      unique_urls.add(url)
    unique_url_visits[session] = len(unique_urls)
  return unique_url_visits
#we are passing the list of sessions to the unique_url_visits_per_session function which will then return a dictionary of unique URL visits per session.

In [0]:
print(unique_url_visits_per_session)
#prints the unique URL visits per session

<function unique_url_visits_per_session at 0x7f54f546a4c0>


In [0]:
def most_engaged_users(sessions):
  """
  Finds the most engaged users.

  Args:
    sessions: A list of sessions.

  Returns:
    A list of the most engaged users.
  """

  most_engaged_users = []
  for session in sessions:
    session_time = len(session)
    if len(most_engaged_users) < 5 or session_time > most_engaged_users[-1][1]:
      most_engaged_users.append((session[0][0], session_time))
  return most_engaged_users
#passing the list of sessions to the most_engaged_users function. The function will then return a list of the most engaged users.The list of the most engaged users will be sorted by session time, with the most engaged users at the top of the list.

In [0]:
print(most_engaged_users)
#prints the most engaged users

<function most_engaged_users at 0x7f54f5473a60>
