In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, rand, col
from pyspark.sql.types import StringType, IntegerType
import random
import pandas as pd
import pyspark
from datetime import datetime
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler

# SparkConfを作成し、メモリ設定を行う
conf = SparkConf() \
    .setAppName("MyApp") \
    .set("spark.driver.memory", "8g") \
    .set("spark.executor.memory", "8g")

# SparkContextを作成
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=MyApp, master=local[*]) created by __init__ at /var/folders/w_/hg905l611296df772h1275kr0000gn/T/ipykernel_3538/2170893724.py:18 

In [21]:
# ランダムデータ生成

def random_value1():
    return random.randint(0, 100)

def random_value2():
    return random.randint(0, 100)

# UDF（ユーザー定義関数）の登録
random_value1_udf = udf(random_value1, IntegerType())
random_value2_udf = udf(random_value2, IntegerType())

# 空のDataFrameを作成
df = spark.range(10000000).withColumn("id", col("id"))

# 各カラムをランダム値で埋める
df = df.withColumn("value1", random_value1_udf())
df = df.withColumn("value2", random_value2_udf())

# "id"カラムを削除して最終DataFrameにする
df = df.drop("id")

assembler = VectorAssembler(inputCols=["value1", "value2"], outputCol="features")

# VectorAssemblerを使って新しいデータフレームを変換
df = assembler.transform(df)

df = df.select("features")

# 結果を表示
df.show(10)

[Stage 9:>                                                          (0 + 1) / 1]

+-----------+
|   features|
+-----------+
|[37.0,33.0]|
|[38.0,57.0]|
|[40.0,17.0]|
|[95.0,87.0]|
|[21.0,37.0]|
|[52.0,79.0]|
| [1.0,77.0]|
|[42.0,24.0]|
|[75.0,89.0]|
|[52.0,16.0]|
+-----------+
only showing top 10 rows



24/07/16 02:30:46 WARN PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 9 (TID 37): Attempting to kill Python Worker
                                                                                

In [22]:
type(df)

pyspark.sql.dataframe.DataFrame

In [27]:
df_sample = df.sample(fraction=0.001, seed=1).cache()

24/07/16 02:32:40 WARN CacheManager: Asked to cache already cached data.


In [42]:
# df_sample = transformed_data.sample(fraction=0.001, seed=1).cache()

# Pandas Corr

In [31]:
pdf_sample = df_sample.toPandas()
pdf_sample[['value1', 'value2']] = pd.DataFrame(pdf_sample['features'].tolist(), index=pdf_sample.index)
pdf_sample.corr()

  pdf_sample.corr()


Unnamed: 0,value1,value2
value1,1.0,0.011023
value2,0.011023,1.0


In [41]:
# pdf_sample = df_sample.toPandas()
# pdf_sample[feature_attrs_flat] = pd.DataFrame(pdf_sample['features-dep'].tolist(), index=pdf_sample.index)
# pdf_sample.corr()

# PySpark Corr

In [32]:
from pyspark.ml.stat import Correlation

In [38]:
Correlation.corr(df_sample, "features").head()[0]

DenseMatrix(2, 2, [1.0, 0.011, 0.011, 1.0], False)

# Corr test 1

In [59]:
# 色の数を設定
num_colors = 20

In [61]:
import pandas as pd
import numpy as np

# データ数の設定
num_records = 100000

colors = [f'color{i+1}' for i in range(num_colors)]

# 各色の割合
ratios = {
    'color1': 0.77,
    'color2': 0.18,
    'color3': 0.02,
    'others': 0.03,
}

# データの初期化
data = {color: [0] * num_records for color in colors}

# 色の割り当て
np.random.seed(42)  # 再現性のためにシードを固定
assignments = (
    ['color1'] * int(num_records * ratios['color1']) +
    ['color2'] * int(num_records * ratios['color2']) +
    ['color3'] * int(num_records * ratios['color3']) +
    np.random.choice([color for color in colors if color not in ['color1', 'color2', 'color3']],
                     int(num_records * ratios['others']), replace=True).tolist()
)
np.random.shuffle(assignments)

for i, color in enumerate(assignments):
    data[color][i] = 1

# DataFrameの作成
df = pd.DataFrame(data)

# 相関係数の計算
correlation_matrix = df.corr()
correlation_matrix

Unnamed: 0,color1,color2,color3,color4,color5,color6,color7,color8,color9,color10,color11,color12,color13,color14,color15,color16,color17,color18,color19,color20
color1,1.0,-0.857256,-0.261387,-0.077481,-0.079197,-0.080251,-0.078344,-0.079197,-0.075505,-0.071624,-0.08046,-0.076169,-0.071389,-0.073931,-0.079197,-0.079409,-0.073704,-0.080041,-0.068509,-0.081909
color2,-0.857256,1.0,-0.066932,-0.01984,-0.020279,-0.020549,-0.020061,-0.020279,-0.019334,-0.01834,-0.020603,-0.019504,-0.01828,-0.018931,-0.020279,-0.020334,-0.018873,-0.020496,-0.017543,-0.020974
color3,-0.261387,-0.066932,1.0,-0.006049,-0.006183,-0.006266,-0.006117,-0.006183,-0.005895,-0.005592,-0.006282,-0.005947,-0.005574,-0.005772,-0.006183,-0.0062,-0.005755,-0.006249,-0.005349,-0.006395
color4,-0.077481,-0.01984,-0.006049,1.0,-0.001833,-0.001857,-0.001813,-0.001833,-0.001747,-0.001658,-0.001862,-0.001763,-0.001652,-0.001711,-0.001833,-0.001838,-0.001706,-0.001852,-0.001586,-0.001896
color5,-0.079197,-0.020279,-0.006183,-0.001833,1.0,-0.001898,-0.001853,-0.001874,-0.001786,-0.001694,-0.001903,-0.001802,-0.001689,-0.001749,-0.001874,-0.001879,-0.001744,-0.001893,-0.001621,-0.001938
color6,-0.080251,-0.020549,-0.006266,-0.001857,-0.001898,1.0,-0.001878,-0.001898,-0.00181,-0.001717,-0.001929,-0.001826,-0.001711,-0.001772,-0.001898,-0.001904,-0.001767,-0.001919,-0.001642,-0.001963
color7,-0.078344,-0.020061,-0.006117,-0.001813,-0.001853,-0.001878,1.0,-0.001853,-0.001767,-0.001676,-0.001883,-0.001782,-0.001671,-0.00173,-0.001853,-0.001858,-0.001725,-0.001873,-0.001603,-0.001917
color8,-0.079197,-0.020279,-0.006183,-0.001833,-0.001874,-0.001898,-0.001853,1.0,-0.001786,-0.001694,-0.001903,-0.001802,-0.001689,-0.001749,-0.001874,-0.001879,-0.001744,-0.001893,-0.001621,-0.001938
color9,-0.075505,-0.019334,-0.005895,-0.001747,-0.001786,-0.00181,-0.001767,-0.001786,1.0,-0.001615,-0.001815,-0.001718,-0.00161,-0.001667,-0.001786,-0.001791,-0.001662,-0.001805,-0.001545,-0.001847
color10,-0.071624,-0.01834,-0.005592,-0.001658,-0.001694,-0.001717,-0.001676,-0.001694,-0.001615,1.0,-0.001721,-0.00163,-0.001527,-0.001582,-0.001694,-0.001699,-0.001577,-0.001712,-0.001466,-0.001752


# Corr test 2

In [83]:
# corr = covariance(XY) / (std(X) * std(Y))

# covariance 減る -> corr 減る
# std(X) and/or std(Y) 増える -> corr 減る

# => 全レコード数に対する X(color1), Y(color2) の1の割合が減るほど corr は下がる

In [82]:
# 2カテゴリだけのとき
color1 = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
color2 = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
df = pd.DataFrame({"color1": color1, "color2": color2})
df.corr()

Unnamed: 0,color1,color2
color1,1.0,-1.0
color2,-1.0,1.0


In [73]:
print(f"Covariance': {df.cov().iloc[0, 1]}")
print(f"Std color1: {df['color1'].std()}")
print(f"Std color2: {df['color2'].std()}")

Covariance': -0.17777777777777778
Std color1: 0.4216370213557839
Std color2: 0.42163702135578396


In [74]:
# 3カテゴリ以上あるとき（color1=color2=0があるとき）
color1 = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
color2 = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
df = pd.DataFrame({"color1": color1, "color2": color2})
df.corr()

Unnamed: 0,color1,color2
color1,1.0,-0.272166
color2,-0.272166,1.0


In [75]:
print(f"Covariance': {df.cov().iloc[0, 1]}")
print(f"Std color1: {df['color1'].std()}")
print(f"Std color2: {df['color2'].std()}")

Covariance': -0.042105263157894715
Std color1: 0.5026246899500346
Std color2: 0.30779350562554625


In [76]:
# 3カテゴリ以上あるとき（color1=color2=0があるとき）
# さっきより0の割合を増やすとcorrが下がる
color1 = [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
color2 = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
df = pd.DataFrame({"color1": color1, "color2": color2})
df.corr()

Unnamed: 0,color1,color2
color1,1.0,-0.161165
color2,-0.161165,1.0


In [77]:
print(f"Covariance': {df.cov().iloc[0, 1]}")
print(f"Std color1: {df['color1'].std()}")
print(f"Std color2: {df['color2'].std()}")

Covariance': -0.01839080459770115
Std color1: 0.44977644510880355
Std color2: 0.2537081317024625
