In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

# Settings for dummy data
number_of_players = 100
number_of_records = 1000
date_range_start = datetime(2022, 1, 1)
date_range_end = datetime(2022, 12, 31)

# Generate dummy data
np.random.seed(0)  # Seed for reproducibility
player_ids = np.random.choice(range(1, number_of_players + 1), number_of_records)
device_ids = np.random.choice(range(1, number_of_players + 1), number_of_records)
event_dates = [date_range_start + timedelta(days=np.random.randint(0, (date_range_end - date_range_start).days)) for _ in range(number_of_records)]
games_played = np.random.poisson(lam=5, size=number_of_records)  # Assuming an average of 5 games played, change lam for different distributions

# Create DataFrame
df_activity = pd.DataFrame({
    'player_id': player_ids,
    'device_id': device_ids,
    'event_date': event_dates,
    'games_played': games_played
})

# Make sure to drop duplicates for the primary key
df_activity = df_activity.drop_duplicates(subset=['player_id', 'event_date'])

# Sort the DataFrame for better visualization
df_activity = df_activity.sort_values(by=['player_id', 'event_date'])

# Reset index after sorting
df_activity = df_activity.reset_index(drop=True)

# Show the DataFrame
print(df_activity)


     player_id  device_id event_date  games_played
0            1         60 2022-02-02             5
1            1         57 2022-02-10             8
2            1         80 2022-03-02             4
3            1         80 2022-04-08             7
4            1         97 2022-04-24             5
..         ...        ...        ...           ...
981        100         19 2022-07-23             6
982        100          4 2022-08-19             3
983        100          3 2022-10-13             6
984        100         52 2022-10-15             4
985        100         15 2022-12-22             3

[986 rows x 4 columns]


In [14]:
import pandas as pd

def game_analysis(activity: pd.DataFrame) -> pd.DataFrame:
    df = activity.groupby('player_id')['event_date'].min()
    print(df)


game_analysis(df_activity)

player_id
1     2022-02-02
2     2022-01-01
3     2022-04-23
4     2022-03-01
5     2022-02-22
         ...    
96    2022-02-17
97    2022-01-21
98    2022-01-13
99    2022-01-07
100   2022-01-29
Name: event_date, Length: 100, dtype: datetime64[ns]


In [15]:
#show me an example of useing groupby
df = df_activity.groupby('player_id')['event_date']
df

<pandas.core.groupby.generic.SeriesGroupBy object at 0x10603af20>