In [1]:
# Final Project: Predicting Newsletter Subscription from Game Data


# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# ## Introduction
# In this project, we aim to predict whether a player will subscribe to the newsletter based on their in-game behavior and characteristics.

# ## Data Download and Loading
# Read the player table and the session log from the mounted data directory
players, sessions = (
    pd.read_csv(csv_path)
    for csv_path in ('/mnt/data/players.csv', '/mnt/data/sessions.csv')
)

# ## Data Cleaning and Wrangling
# Clean 'players.csv': fill missing Age values with the column median
players['Age'] = players['Age'].fillna(players['Age'].median())

# Clean 'sessions.csv': keep only sessions that have both 'end_time' and
# 'original_end_time'. The explicit .copy() makes sessions_clean an independent
# DataFrame, so the column assignments below modify it directly instead of
# triggering pandas' SettingWithCopyWarning on a derived frame.
sessions_clean = sessions.dropna(subset=['end_time', 'original_end_time']).copy()

# Parse the start/end timestamp strings (day/month/year hour:minute) as datetimes
sessions_clean['start_time'] = pd.to_datetime(sessions_clean['start_time'], format='%d/%m/%Y %H:%M')
sessions_clean['end_time'] = pd.to_datetime(sessions_clean['end_time'], format='%d/%m/%Y %H:%M')

# Session duration in minutes (timedelta -> seconds -> minutes)
sessions_clean['session_duration'] = (
    sessions_clean['end_time'] - sessions_clean['start_time']
).dt.total_seconds() / 60

# Aggregate per player: number of sessions and mean session length
session_summary = sessions_clean.groupby('hashedEmail').agg(
    total_sessions=('session_duration', 'count'),
    average_session_duration=('session_duration', 'mean')
).reset_index()

# Left-merge so that every player is kept, including those without any session
players_merged = pd.merge(players, session_summary, on='hashedEmail', how='left')

# Players with no (valid) session records get NaN from the left merge;
# treat them as having zero sessions and zero average duration
session_features = ['total_sessions', 'average_session_duration']
players_merged[session_features] = players_merged[session_features].fillna(0)

# ## Exploratory Data Analysis (EDA)

def _label_and_show(title, xlabel, ylabel):
    """Apply the title and axis labels to the current figure, then render it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def _boxplot_by_subscription(column, title, ylabel):
    """Draw one boxplot of `column` from players_merged, split by 'subscribe'."""
    plt.figure(figsize=(8, 4))
    sns.boxplot(x='subscribe', y=column, data=players_merged)
    _label_and_show(title, 'Subscribed', ylabel)


# Count of subscribers vs. non-subscribers
plt.figure(figsize=(6, 4))
sns.countplot(x='subscribe', data=players_merged)
_label_and_show('Newsletter Subscription Distribution', 'Subscribed', 'Count')

# Subscription counts within each experience level
plt.figure(figsize=(8, 4))
sns.countplot(x='experience', hue='subscribe', data=players_merged)
_label_and_show('Experience vs. Subscription', 'Experience Level', 'Count')

# Numeric features compared across subscription status, one boxplot per feature
_boxplot_by_subscription(
    'total_sessions',
    'Total Sessions by Subscription Status',
    'Total Sessions',
)
_boxplot_by_subscription(
    'average_session_duration',
    'Average Session Duration by Subscription Status',
    'Average Session Duration (minutes)',
)
_boxplot_by_subscription(
    'played_hours',
    'Played Hours by Subscription Status',
    'Played Hours',
)
_boxplot_by_subscription(
    'Age',
    'Age by Subscription Status',
    'Age',
)

# ## Model Building

# Prepare the data for modeling.
# Categorical predictors are one-hot encoded (dropping the first level of each
# to avoid redundant columns) and placed ahead of the numeric predictors.
numeric_features = players_merged[['played_hours', 'Age', 'total_sessions', 'average_session_duration']]
categorical_dummies = pd.get_dummies(players_merged[['experience', 'gender']], drop_first=True)
X = categorical_dummies.join(numeric_features)
y = players_merged['subscribe']

# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the subscribed / not-subscribed proportion the same in both
# splits, so the test metrics are not distorted by an unlucky split;
# random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Build and train the logistic regression model
# (max_iter raised above the default to give the solver room to converge)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model on the test set
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

# ## Conclusion
# This logistic regression model estimates how player features (played hours, age, session count, average session duration, experience, and gender) relate to newsletter subscription status.
# Its performance is assessed on the held-out test set using the confusion matrix, per-class precision and recall, and overall accuracy.
# The project covers data cleaning, per-player session aggregation, EDA, and predictive modeling to answer the research question.

ERROR: Error in parse(text = x, srcfile = src): <text>:4:8: unexpected symbol
3: # Import necessary libraries
4: import pandas
          ^
