## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import json

## Reading time-series data

In [2]:
# Load the preprocessed event-level time series into memory.
ts_path = "preprocessed_ts.csv"
df = pd.read_csv(ts_path)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,userID_anonymized,model,operating_system,wifi_state,session_no,event_delay,eventType,duration,level,progress,play_count,status,revenue,totalrevenue
0,be725f9b-33c5,"iPhone12,1",iOS 14.0.1,1,1,1.0,interstitial_impression,0.0,0,0,0,0,0.035,0.0362
1,dd5ef583-258b,"iPhone12,1",iOS 14.2,0,13,195.0,banner_impression,0.0,0,0,0,0,0.0003,0.460578
2,be725f9b-33c5,"iPhone12,1",iOS 14.0.1,1,1,2.0,level_event,1606707000.0,25,10,10,1,0.0,0.0362
3,fb8331dd-f7f7,"iPhone11,8",iOS 14.2,1,3,45.0,banner_impression,0.0,0,0,0,0,0.0003,1.161082
4,64dea955-8f14,"iPhone11,6",iOS 14.2,1,1,3.0,banner_impression,0.0,0,0,0,0,0.00015,0.408266


# Feature Extraction

I will extract these properties for each unique user:

- Total interstitial impression count
- Total banner impression count
- Total rewarded impression count
- Total level event count
- Last interstitial (delay_hour format)
- Last banner (delay_hour format)
- Last rewarded (delay_hour format)
- Last level event  (delay_hour format)

and the mean and standard deviation of the revenue for each of:
- Interstitials
- Banners
- Rewarded ads

In [4]:
def _pivot_event_stat(events, value_col, stat, prefix):
    """Build a per-user table with one `<prefix><eventType>` column per event type.

    Parameters
    ----------
    events : DataFrame with `userID_anonymized`, `eventType` and `value_col` columns.
    value_col : name of the column to reduce.
    stat : name of the groupby reduction (e.g. 'mean', 'std', 'count', 'max').
    prefix : string prepended to every event-type column name.

    Returns
    -------
    DataFrame indexed by `userID_anonymized`. (user, event type) pairs that
    never occur are filled with 0, and any remaining NaN (such as the sample
    std of a single observation) is replaced by 0 as well.
    """
    per_user_event = events.groupby(['userID_anonymized', 'eventType'])[value_col].agg(stat)
    wide = per_user_event.unstack(['eventType'], fill_value=0)
    wide = wide.add_prefix(prefix).rename_axis(None, axis=1)
    return wide.fillna(0)


# Mean revenue per user and event type
means = _pivot_event_stat(df, 'revenue', 'mean', 'mean_revenue_')

# Standard deviation of revenue per user and event type
stds = _pivot_event_stat(df, 'revenue', 'std', 'std_revenue_')

# Event counts per user and event type (counts non-null `revenue` values)
totals = _pivot_event_stat(df, 'revenue', 'count', 'total_count_')

# Latest (maximum) `event_delay` per user and event type
last_times = _pivot_event_stat(df, 'event_delay', 'max', 'last_time_')

### Concatenating extracted features

In [5]:
# Place the four per-user feature tables side by side, keeping only users
# present in every table, then turn the user index back into a column.
feature_blocks = [means, stds, totals, last_times]
concatted_df = (
    pd.concat(feature_blocks, axis=1, join="inner")
    .rename_axis(None, axis=1)
    .reset_index()
)

### Merging with original data

In [6]:
# Per-user attributes derived from the raw event rows. The groupby is built
# once and reused for every attribute.
events_by_user = df.groupby('userID_anonymized')

user_attributes = {
    # smallest value observed per user (deterministic pick if a user has several)
    'model': events_by_user['model'].min(),
    'operating_system': events_by_user['operating_system'].min(),
    # mean encoding for wi-fi
    'mean_wifi_state': events_by_user['wifi_state'].mean(),
    # to get last session number
    'last_session_no': events_by_user['session_no'].max(),
    # to get last played level
    'last_level': events_by_user['level'].max(),
    # to get lifetime-revenue
    'totalrevenue': events_by_user['totalrevenue'].max(),
}

# Attach each attribute by looking it up on the user id instead of relying on
# the positional order of `.values`; a change in row order or user membership
# of `concatted_df` can then no longer silently misalign the features.
user_ids = concatted_df['userID_anonymized']
for column_name, per_user_values in user_attributes.items():
    concatted_df[column_name] = user_ids.map(per_user_values)

In [7]:
# Preview the first five rows of the per-user feature table.
concatted_df.head(n=5)

Unnamed: 0,userID_anonymized,mean_revenue_banner_impression,mean_revenue_interstitial_impression,mean_revenue_level_event,mean_revenue_rewarded_impression,std_revenue_banner_impression,std_revenue_interstitial_impression,std_revenue_level_event,std_revenue_rewarded_impression,total_count_banner_impression,...,last_time_banner_impression,last_time_interstitial_impression,last_time_level_event,last_time_rewarded_impression,model,operating_system,mean_wifi_state,last_session_no,last_level,totalrevenue
0,00081f8b-253d,0.0004,0.03,0.0,0.0,0.0,0.0,0.0,0.0,6,...,13.0,2.0,14.0,0.0,"iPhone8,2",iOS 14.0.1,1.0,3,65,0.0324
1,000ab112-2616,0.000375,0.03,0.0,0.0,0.000318,0.0,0.0,0.0,2,...,0.0,0.0,0.0,0.0,"iPhone12,1",iOS 14.2,1.0,1,4,0.03075
2,002ff54d-8749,0.000669,0.027517,0.0,0.04,0.000962,0.005541,0.0,0.0,118,...,4078.0,4076.0,4078.0,1621.0,"iPhone12,8",iOS 14.1,0.814159,8,1004,1.211006
3,0030d0af-2830,0.000416,0.02292,0.0,0.04,0.000184,0.005129,0.0,0.0,82,...,5137.0,5136.0,5137.0,763.0,"iPhone9,3",iOS 13.7,1.0,5,1003,0.881707
4,00441340-5d9c,0.000537,0.03,0.0,0.0,9.6e-05,0.003536,0.0,0.0,16,...,2368.0,2035.0,2369.0,0.0,"iPhone10,2",iOS 14.2,1.0,4,67,0.1847


## Saving feature-engineered data

In [8]:
# Write the per-user feature table to disk without the positional index.
output_path = 'fe_data.csv'
concatted_df.to_csv(output_path, index=False)