In [None]:
# ============================================================
# PYSPARK PLAYER BEHAVIOR CLUSTERING & ANALYSIS
# ============================================================

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, mean

In [None]:
# ============================================================
# 1. INITIALIZE SPARK
# ============================================================

# getOrCreate() returns the already-active session when there is one,
# otherwise it starts a new session under the application name given here.
spark = (
    SparkSession.builder
    .appName("PlayerBehaviorClustering")
    .getOrCreate()
)

In [None]:
# ============================================================
# 2. LOAD DATA
# ============================================================

# The first CSV row is the header; inferSchema asks Spark to derive column
# types from the data instead of reading every column as a string.
df = spark.read.csv("data/online_gaming_behavior_dataset.csv", header=True, inferSchema=True)

# Preview the first rows and the inferred schema.
print("✅ Dữ liệu gốc:")
df.show(5)
df.printSchema()

In [None]:
# ============================================================
# 3. EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================

# Summary statistics for the columns of the loaded DataFrame.
print("🔹 Thống kê tổng quan:")
df.describe().show()

# Row counts per gender.
print("🔹 Phân bố giới tính:")
df.groupBy("Gender").count().show()

# Row counts per location.
print("🔹 Phân bố theo khu vực:")
df.groupBy("Location").count().show()

# Row counts per game genre.
print("🔹 Phân bố thể loại game:")
df.groupBy("GameGenre").count().show()


In [None]:
# ============================================================
# 4. DATA PREPROCESSING
# ============================================================

# Categorical columns; each one gets a numeric "<name>_idx" companion column.
categorical_cols = ["Gender", "Location", "GameGenre", "GameDifficulty"]

# One StringIndexer per categorical column. The loop variable is called
# `name` so it does not shadow the `col` function imported from
# pyspark.sql.functions.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_idx")
    for name in categorical_cols
]

# Fit every indexer in a single pipeline and append the index columns to df.
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df).transform(df)

In [None]:
# ============================================================
# 5. PREPARE DATA FOR CLUSTERING
# ============================================================

# Numeric behaviour metrics followed by the "<name>_idx" columns produced by
# the StringIndexer stage.
feature_cols = [
    "Age", "PlayTimeHours", "SessionsPerWeek", "AvgSessionDurationMinutes",
    "PlayerLevel", "AchievementsUnlocked",
] + [name + "_idx" for name in categorical_cols]

# Pack the selected columns into one vector column. It is kept under a
# separate name so the standardised vector below can take the "features"
# name that the K-Means step reads.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")
df_assembled = assembler.transform(df_indexed)

# K-Means compares rows by Euclidean distance, so features measured on larger
# numeric scales would otherwise dominate the clustering. Standardise every
# feature to zero mean and unit variance before clustering.
# NOTE(review): the "<name>_idx" columns are label indices of nominal
# categories; treating them as numeric magnitudes is kept from the original
# design — consider one-hot encoding if that ordering is not intended.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True,
)
df_features = scaler.fit(df_assembled).transform(df_assembled)


In [None]:
# ============================================================
# 6. CLUSTERING (K-MEANS)
# ============================================================

# Three clusters, read from the "features" vector column; the fixed seed is
# passed so repeated runs start from the same random state.
kmeans = KMeans(k=3, seed=42, featuresCol="features", predictionCol="cluster")
model = kmeans.fit(df_features)

# Append the predicted cluster id to every row as the "cluster" column.
df_clustered = model.transform(df_features)

# Preview the cluster assigned to the first few players.
print("✅ Gán cụm cho người chơi:")
df_clustered.select("PlayerID", "cluster").show(10)

In [None]:
# ============================================================
# 7. CLUSTER ANALYSIS
# ============================================================

# Per-cluster mean of each numeric metric, ordered by cluster id so the table
# reads the same way on every run.
print("🔹 Thống kê trung bình các chỉ số theo cụm:")
(
    df_clustered
    .groupBy("cluster")
    .mean("Age", "PlayTimeHours", "SessionsPerWeek",
          "AvgSessionDurationMinutes", "PlayerLevel", "AchievementsUnlocked")
    .orderBy("cluster")
    .show()
)

# Per-cluster row counts for every categorical column.
print("🔹 Phân bố categorical theo cụm:")
for col_name in ["Gender", "Location", "GameGenre", "GameDifficulty"]:
    print(f"\n📊 Phân bố {col_name} theo cụm:")
    # Sort by cluster and then by category value: ordering on "cluster" alone
    # leaves the row order inside each cluster unspecified.
    (
        df_clustered
        .groupBy("cluster", col_name)
        .count()
        .orderBy("cluster", col_name)
        .show()
    )


In [None]:
# ============================================================
# 8. EXPORT RESULTS FOR THE REPORT
# ============================================================

# Export the player attributes and their cluster id to CSV for slides/report.
(
    df_clustered
    .select("PlayerID", "Age", "Gender", "Location", "GameGenre", "PlayTimeHours",
            "SessionsPerWeek", "AvgSessionDurationMinutes", "PlayerLevel",
            "AchievementsUnlocked", "cluster")
    # Collapse to a single partition before writing.
    .coalesce(1)
    # mode="overwrite" replaces any existing "clustered_output" directory.
    .write.csv("clustered_output", header=True, mode="overwrite")
)

print("✅ Kết quả đã được xuất ra thư mục: clustered_output/")


In [None]:
# ============================================================
# 9. SHUT DOWN
# ============================================================

# Stop the Spark session created at the top of the notebook.
spark.stop()
