In [13]:
import pandas as pd

# Load the raw event log. lines=True makes pandas parse the file as
# newline-delimited JSON (one JSON object per line) rather than a single document.
df = pd.read_json('raw_data/customer_churn_mini.json', lines=True)

In [14]:
## helper functions


def convert_unix_to_timestamp(series):
    """
    Turn millisecond Unix epoch values into pandas datetimes.

    Args:
        series (pd.Series): Unix timestamps expressed in milliseconds.

    Returns:
        pd.Series: The same values as pandas datetime objects.
    """
    converted = pd.to_datetime(series, unit='ms')
    return converted

In [15]:
# Show column dtypes and non-null counts of the raw event log.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286500 entries, 0 to 286499
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ts             286500 non-null  int64  
 1   userId         286500 non-null  object 
 2   sessionId      286500 non-null  int64  
 3   page           286500 non-null  object 
 4   auth           286500 non-null  object 
 5   method         286500 non-null  object 
 6   status         286500 non-null  int64  
 7   level          286500 non-null  object 
 8   itemInSession  286500 non-null  int64  
 9   location       278154 non-null  object 
 10  userAgent      278154 non-null  object 
 11  lastName       278154 non-null  object 
 12  firstName      278154 non-null  object 
 13  registration   278154 non-null  float64
 14  gender         278154 non-null  object 
 15  artist         228108 non-null  object 
 16  song           228108 non-null  object 
 17  length         228108 non-nul

## preprocessing

In [None]:
# Clean the event log in a single chain that rebinds `df` to a brand-new frame.
# Building a new frame (instead of assigning columns onto a filtered slice and
# mutating with inplace=True) avoids pandas' SettingWithCopyWarning and makes it
# unambiguous which object each step operates on.
df = (
    df.loc[df['userId'] != '']  # drop events with an empty userId
    .assign(
        # Both columns hold Unix epoch milliseconds; convert them to datetimes.
        registration=lambda events: convert_unix_to_timestamp(events['registration']),
        ts=lambda events: convert_unix_to_timestamp(events['ts']),
    )
    # These columns are not used by any later feature.
    .drop(columns=['artist', 'song', 'location'])
    # Rows without a song length get 0 so sums over `length` stay defined.
    .fillna({'length': 0})
)

In [36]:
# List the distinct page (event) types that occur in the cleaned log.
df['page'].unique()

array(['NextSong', 'Add to Playlist', 'Roll Advert', 'Thumbs Up',
       'Downgrade', 'Thumbs Down', 'Home', 'Logout', 'Help', 'Upgrade',
       'Add Friend', 'Settings', 'Submit Upgrade', 'About',
       'Submit Downgrade', 'Error', 'Save Settings', 'Cancel',
       'Cancellation Confirmation'], dtype=object)

## feature transformation

In [37]:
# A user whose most recent event is at least this many days older than the
# newest event in the log is labelled as churned through inactivity.
INACTIVITY_THRESHOLD_DAYS = 30

# Most recent event per user.
last_activity = df.groupby('userId')['ts'].max().reset_index()
last_activity.columns = ['userId', 'last_seen']

# The newest event in the whole log serves as the reference "now".
current_date = df['ts'].max()
last_activity['days_since_last_seen'] = (current_date - last_activity['last_seen']).dt.days
last_activity['inactive_churn'] = (
    last_activity['days_since_last_seen'] >= INACTIVITY_THRESHOLD_DAYS
).astype(int)

# Users who reached the cancellation confirmation page churned explicitly.
canceled_users = df.loc[df['page'] == 'Cancellation Confirmation', 'userId'].unique()
last_activity['explicit_churn'] = last_activity['userId'].isin(canceled_users).astype(int)

# Final label: churned if either definition applies.
last_activity['churn'] = (
    (last_activity['explicit_churn'] == 1) | (last_activity['inactive_churn'] == 1)
).astype(int)


* num_sessions ✅
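* avg_songs_listen_per_session = (number of NextSong events in each session, averaged over the user's sessions; total listening time✅ and num_sessions✅ are kept as separate features)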

* gender / location
* avg_session_duration = (sum of per-session durations / total sessions), where a session's duration = (ts of the session's last event - ts of the session's first event)



* days_active_ratio = days_active/life_span(last_activity✅-register_time✅)
* level_paid_ratio
* device_type
* count of NextSong
* count of Logout
* count of Thumbs Up
* count of Thumbs Down
* count of Add to Playlist
* count of Upgrade
* count of Submit Upgrade	
* count of Roll Advert	
* count of Help
* count of Add Friend	
* count of About
* count of Settings
* count of Submit Downgrade	
* count of Error
* count of Save Settings	
* total_skips


In [None]:
# Lower-case substrings whose presence in a user agent marks a tablet.
_TABLET_UA_TOKENS = (
    'tablet', 'ipad', 'playbook', 'silk', 'kindle',
    'gt-p', 'sm-t', 'nexus 7', 'nexus 10',
    'xoom', 'sch-i800', 'playstation portable', 'hpwos',
)

# Lower-case substrings whose presence marks a phone. 'android' is deliberately
# NOT listed here: Android user agents are told apart by the 'mobile' token
# (phones carry it, tablets do not), which is handled in _classify_device.
_MOBILE_UA_TOKENS = (
    'mobile', 'iphone', 'ipod', 'blackberry',
    'windows phone', 'palm', 'symbian', 'nokia', 'opera mini',
    'fennec', 'minimo',
)


def _classify_device(user_agent):
    """Map a raw user-agent string to 'Tablet', 'Mobile', 'Desktop' or 'Unknown'."""
    if pd.isna(user_agent):
        return 'Unknown'

    # Strip the surrounding quotes and normalise case before substring matching.
    ua = str(user_agent).strip('"').lower()

    # Tablets are checked first because their tokens are the more specific ones
    # (an iPad user agent, for example, can also contain 'mobile').
    if any(token in ua for token in _TABLET_UA_TOKENS):
        return 'Tablet'
    if any(token in ua for token in _MOBILE_UA_TOKENS):
        return 'Mobile'

    # Reaching this point means 'mobile' is absent. A 'cpu os' token, or an
    # Android user agent without 'mobile', is treated as a tablet.
    if 'cpu os' in ua or 'android' in ua:
        return 'Tablet'

    # Everything else (Windows, macOS, Linux, unrecognised agents) is Desktop.
    return 'Desktop'


def create_features(df):
    """
    Create comprehensive user features from the customer churn dataset.

    One row is produced per userId with these columns, in order:
    userId, num_sessions, total_listening_time, avg_songs_listen_per_session,
    avg_session_duration (hours), days_active, days_active_ratio,
    level_paid_ratio, gender_encoded (1 where gender == 'M', else 0),
    device_mobile, device_tablet (Desktop/Unknown is the reference category),
    one count_<page> column per tracked page, and total_skips.

    Args:
        df (pd.DataFrame): The processed customer churn dataframe. The columns
            read are userId, sessionId, ts, registration, length, page, gender,
            level and userAgent; ts and registration must be datetimes.

    Returns:
        pd.DataFrame: User-level features for churn prediction. Missing values
        are filled with 0.
    """
    # 1. Basic session and listening features
    session_features = df.groupby('userId').agg(
        num_sessions=('sessionId', 'nunique'),
        first_activity=('ts', 'min'),
        last_activity=('ts', 'max'),
        total_listening_time=('length', 'sum'),
        registration=('registration', 'first'),
    ).reset_index()

    # Average number of songs (NextSong events) per session
    songs_per_session = (
        df[df['page'] == 'NextSong']
        .groupby(['userId', 'sessionId'])
        .size()
        .reset_index(name='songs_in_session')
    )
    avg_songs_per_session = (
        songs_per_session.groupby('userId')['songs_in_session'].mean()
        .rename('avg_songs_listen_per_session')
        .reset_index()
    )
    session_features = session_features.merge(avg_songs_per_session, on='userId', how='left')
    # Users with no NextSong event have no entry above; they listened to 0 songs.
    session_features['avg_songs_listen_per_session'] = (
        session_features['avg_songs_listen_per_session'].fillna(0)
    )

    # Average session duration in hours (last event minus first event per session)
    session_bounds = df.groupby(['userId', 'sessionId'])['ts'].agg(['min', 'max']).reset_index()
    session_bounds['session_duration'] = (
        (session_bounds['max'] - session_bounds['min']).dt.total_seconds() / 3600
    )
    avg_session_duration = (
        session_bounds.groupby('userId')['session_duration'].mean()
        .rename('avg_session_duration')
        .reset_index()
    )
    session_features = session_features.merge(avg_session_duration, on='userId', how='left')
    session_features['avg_session_duration'] = session_features['avg_session_duration'].fillna(0)

    # Number of distinct calendar days with at least one event. Only the two
    # needed columns are materialised instead of copying the whole event log.
    activity_dates = pd.DataFrame({'userId': df['userId'], 'date': df['ts'].dt.normalize()})
    days_active = (
        activity_dates.groupby('userId')['date'].nunique()
        .rename('days_active')
        .reset_index()
    )
    session_features = session_features.merge(days_active, on='userId', how='left')
    session_features['days_active'] = session_features['days_active'].fillna(0)

    # days_active_ratio = days_active / days between registration and last activity.
    # A non-positive life span (registration recorded after the last event) would
    # otherwise yield inf or a negative ratio, which fillna(0) does not catch; it
    # is masked to NaN so the ratio falls back to 0 like other undefined values.
    life_span = (session_features['last_activity'] - session_features['registration']).dt.days + 1
    session_features['life_span'] = life_span.where(life_span > 0)
    session_features['days_active_ratio'] = (
        session_features['days_active'] / session_features['life_span']
    ).fillna(0)

    # 2. User demographic features
    demographics = df.groupby('userId').agg(
        gender=('gender', 'first'),
        level_paid_ratio=('level', lambda levels: (levels == 'paid').mean()),
        userAgent=('userAgent', 'first'),
    ).reset_index()

    device_type = demographics['userAgent'].apply(_classify_device)
    demo_features = pd.DataFrame({
        'userId': demographics['userId'],
        'level_paid_ratio': demographics['level_paid_ratio'],
        # Single dummy: 1 where gender == 'M', 0 otherwise
        'gender_encoded': (demographics['gender'] == 'M').astype(int),
        # Desktop is the reference category (both dummies are 0)
        'device_mobile': (device_type == 'Mobile').astype(int),
        'device_tablet': (device_type == 'Tablet').astype(int),
    })

    # 3. Event count features
    required_pages = [
        'NextSong', 'Logout', 'Thumbs Up', 'Thumbs Down', 'Add to Playlist',
        'Upgrade', 'Submit Upgrade', 'Roll Advert', 'Help', 'Add Friend',
        'About', 'Settings', 'Submit Downgrade', 'Error', 'Save Settings'
    ]
    page_counts = df.groupby(['userId', 'page']).size().unstack(fill_value=0)
    # reindex keeps only the tracked pages and adds an all-zero column for any
    # tracked page that never occurs in this log.
    event_features = page_counts.reindex(columns=required_pages, fill_value=0)
    event_features.columns = [f'count_{page.replace(" ", "_").lower()}' for page in required_pages]
    event_features = event_features.reset_index()

    # 4. Total skips feature: a song counts as skipped when the next song in the
    # same session starts before half of the current song's length has elapsed.
    songs_df = df[df['page'] == 'NextSong'].copy()
    songs_df = songs_df.sort_values(by=['userId', 'sessionId', 'ts'])
    songs_df['next_ts'] = songs_df.groupby(['userId', 'sessionId'])['ts'].shift(-1)
    songs_df['play_duration'] = (songs_df['next_ts'] - songs_df['ts']).dt.total_seconds()
    songs_df['song_length'] = songs_df['length']
    songs_df['played_ratio'] = songs_df['play_duration'] / songs_df['song_length']

    songs_df['is_skip'] = (
        (songs_df['played_ratio'] < 0.5) &
        (songs_df['play_duration'].notna()) &   # last song of a session has no successor
        (songs_df['song_length'] > 0)           # guards the division above
    )

    skip_features = songs_df.groupby('userId')['is_skip'].sum().reset_index()
    skip_features.columns = ['userId', 'total_skips']

    # 5. Combine all features
    final_features = session_features[[
        'userId', 'num_sessions', 'total_listening_time',
        'avg_songs_listen_per_session', 'avg_session_duration',
        'days_active', 'days_active_ratio'
    ]].copy()

    final_features = final_features.merge(demo_features, on='userId', how='left')
    final_features = final_features.merge(event_features, on='userId', how='left')
    final_features = final_features.merge(skip_features, on='userId', how='left')
    final_features = final_features.fillna(0)

    return final_features


# Build the user-level feature table from the cleaned event log
user_features = create_features(df)

print("✅ Feature Engineering Complete!")
print(f"Shape: {user_features.shape}")
print(f"Features created: {user_features.shape[1] - 1}")
print()

# Explain how the dummy variables are encoded
print("Feature Summary:")
print("Gender dummy variable: gender_encoded (1=Male, 0=Female)")
print("Device dummy variables: device_mobile (1=Mobile, 0=Other), device_tablet (1=Tablet, 0=Other)")
print("Note: Desktop is the reference category (both device_mobile and device_tablet = 0)")
print()

# Every feature the table is expected to contain
feature_list = [
    "num_sessions", "total_listening_time", "avg_songs_listen_per_session",
    "avg_session_duration", "days_active", "days_active_ratio",
    "level_paid_ratio", "gender_encoded", "device_mobile", "device_tablet",
    "count_nextsong", "count_logout", "count_thumbs_up", "count_thumbs_down",
    "count_add_to_playlist", "count_upgrade", "count_submit_upgrade",
    "count_roll_advert", "count_help", "count_add_friend", "count_about",
    "count_settings", "count_submit_downgrade", "count_error",
    "count_save_settings", "total_skips"
]

# Tick off each expected feature that is present, cross out each one missing
for feature in feature_list:
    mark = "✅" if feature in user_features.columns else "❌"
    print(f"{mark} {feature}")

print()
print("Sample of user features:")
print(user_features.head())

# Class balance of each dummy variable
print("\nDummy Variable Distributions:")
for label, column in (
    ("Gender Male", "gender_encoded"),
    ("Device Mobile", "device_mobile"),
    ("Device Tablet", "device_tablet"),
):
    print(f"{label}: {user_features[column].value_counts()}")

✅ Feature Engineering Complete!
Shape: (225, 27)
Features created: 26

Feature Summary:
Gender dummy variables: ['gender_F', 'gender_M']

✅ num_sessions
✅ total_listening_time
✅ avg_songs_listen_per_session
✅ avg_session_duration
✅ days_active
✅ days_active_ratio
✅ level_paid_ratio
✅ device_type
✅ count_nextsong
✅ count_logout
✅ count_thumbs_up
✅ count_thumbs_down
✅ count_add_to_playlist
✅ count_upgrade
✅ count_submit_upgrade
✅ count_roll_advert
✅ count_help
✅ count_add_friend
✅ count_about
✅ count_settings
✅ count_submit_downgrade
✅ count_error
✅ count_save_settings
✅ total_skips
✅ gender_F
✅ gender_M

Sample of user features:
   userId  num_sessions  total_listening_time  avg_songs_listen_per_session  \
0      10             6          166866.37251                    112.166667   
1     100            35          672877.85659                     78.882353   
2  100001             4           35073.74215                     33.250000   
3  100002             4           49559.91810   

In [33]:
# create_features() drops the raw device_type label and returns only the
# device_mobile / device_tablet dummies, so user_features['device_type'] raises
# KeyError on a fresh run. Rebuild the label from the dummies instead; rows with
# both dummies at 0 are the reference category (Desktop, including agents that
# could not be classified).
device_type = pd.Series('Desktop', index=user_features.index, name='device_type')
device_type = device_type.mask(user_features['device_mobile'] == 1, 'Mobile')
device_type = device_type.mask(user_features['device_tablet'] == 1, 'Tablet')
device_type.value_counts()

device_type
Desktop    209
Mobile      13
Tablet       3
Name: count, dtype: int64

In [None]:
# Attach the churn labels held in last_activity to every user's feature row
label_columns = ['churn', 'explicit_churn', 'inactive_churn']
final_dataset = pd.merge(
    user_features,
    last_activity[['userId'] + label_columns],
    how='left',
    on='userId',
)

print("✅ Merged dataset created!")
print(f"Final dataset shape: {final_dataset.shape}")
print(f"Number of users: {final_dataset['userId'].nunique()}")
print()

# A left join leaves NaN labels for users absent from last_activity; count them
missing_churn = final_dataset['churn'].isna().sum()
print(f"Missing churn labels: {missing_churn}")

# Class balance of the target
print("\nChurn distribution:")
print(final_dataset['churn'].value_counts())
print()

# Preview a handful of features next to the three label columns
preview_columns = [
    'userId', 'num_sessions', 'avg_songs_listen_per_session',
    'level_paid_ratio', 'total_skips',
] + label_columns
print("Sample of final merged dataset:")
print(final_dataset[preview_columns].head())

In [None]:
 # Split data into train/test sets and X/y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

# Prepare features and target.
# Everything except the user id and the three label columns is a model input.
feature_cols = [
    col for col in final_dataset.columns
    if col not in ['userId', 'churn', 'explicit_churn', 'inactive_churn']
]

# Work on a copy so the merged dataset itself stays untouched.
df_model = final_dataset.copy()

# Integer-encode any remaining string-valued feature columns. Each column is
# only encoded when it is actually present in the dataset.
categorical_cols = ['location', 'device_type']
for col in categorical_cols:
    if col in df_model.columns:
        le = LabelEncoder()
        # The assignment must stay on one line: a line break directly after
        # '=' is a syntax error.
        df_model[col] = le.fit_transform(df_model[col].astype(str))

# Prepare X and y
X = df_model[feature_cols]
y = df_model['churn']

print("Features used in model:")
print(X.columns.tolist())
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nChurn distribution in y:")
print(y.value_counts())

# Hold out 20% for testing; stratify so both splits keep the churn class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Each message is a single-line string literal; a literal split across two
# source lines does not parse.
print(f"\nTrain set shape: X_train {X_train.shape}, y_train {y_train.shape}")
print(f"Test set shape: X_test {X_test.shape}, y_test {y_test.shape}")

# Scale the features: fit the scaler on the training split only and reuse its
# statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Data split and preprocessing complete!")

In [None]:
# Fit a logistic-regression churn classifier on the standardized training features
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train_scaled, y_train)

print("✅ Logistic Regression Model Training Complete!")

# Predicted classes for both splits, plus the positive-class probability on the test split
y_train_pred = lr_model.predict(X_train_scaled)
y_test_pred = lr_model.predict(X_test_scaled)
y_test_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

# Accuracy on the data the model was fitted on versus the held-out data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nModel Performance:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1 on the test split
print(f"\nDetailed Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

# Confusion matrix on the test split
print(f"\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

# Rank features by the magnitude of their fitted coefficient
coefficients = lr_model.coef_[0]
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': coefficients,
    'abs_coefficient': abs(coefficients),
})
feature_importance = feature_importance.sort_values('abs_coefficient', ascending=False)

print(f"\nTop 10 Most Important Features:")
print(feature_importance.head(10))