# Imports 

## Application-specific imports 

In [None]:
import sys

In [None]:
sys.path.append("../config/")
import config

In [None]:
sys.path.append("../metaflow/")
import preprocess_fn

## General 

In [None]:
import pickle
import itertools

In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing

In [None]:
import pyspark
import pyspark.sql.functions as fn
import pyspark.sql.types as t
from pyspark.ml import Pipeline
from pyspark.ml.feature import (Normalizer, Tokenizer, HashingTF, IDF)
from pyspark.mllib.linalg.distributed import (IndexedRow, IndexedRowMatrix)

# Load data from parquet

In [None]:
# Read the M20 cards dataset from the parquet file under config.ARTIFACTS.
# NOTE(review): `spark` is not defined in the visible cells — presumably the
# SparkSession provided by the notebook kernel; confirm.
df = spark.read.parquet(f'{config.ARTIFACTS}/dataset/M20_cards.parquet')

# Preprocess 

In [None]:
# Start the preprocessing chain from the full frame; no rows are filtered out
# at this point, the name is just the handle used by the following cells.
df_filtered = df

## Fill in NA 

In [None]:
# Replace missing values in the "filteredText" column with empty strings so
# every row has a string to tokenize.
df_filtered = df_filtered.na.fill(value="", subset=["filteredText"])

## NLP with Spark

In [None]:
# Split each card's "filteredText" into a list of tokens stored in "words".
tokenizer = Tokenizer(inputCol="filteredText", outputCol="words")
wordsData = tokenizer.transform(df_filtered)

In [None]:
# Hash the tokens into a fixed-width term-frequency vector ("rawFeatures").
# numFeatures=1000 sets the vector length, and therefore the column count of
# any dense matrix built from these features later on.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
featurizedData = hashingTF.transform(wordsData)

In [None]:
# Fit inverse-document-frequency weights on the hashed term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

In [None]:
# Apply the fitted IDF weights; adds the TF-IDF "features" column.
rescaledData = idfModel.transform(featurizedData)

In [None]:
# Preview the card number, name and TF-IDF vector for the first rows.
rescaledData.select("number", "name", "features").show()

In [None]:
# Collect card numbers and names to pandas; used later to label the heatmap
# and to print community members.
# NOTE(review): the feature vectors are collected by a separate toPandas()
# call further down, and downstream code matches the two frames by row
# position — this assumes both collects return rows in the same order; confirm.
pd_number_names = rescaledData.select("number", "name").toPandas()

In [None]:
# Narrow the Spark frame to the card number and its TF-IDF vector; this
# rebinds rescaledData, so the "name" column is no longer available on it.
rescaledData = rescaledData.select("number", "features")

In [None]:
# Collect the TF-IDF vectors to the driver as a one-column pandas DataFrame.
pd_rescaledData = rescaledData.select("features").toPandas()

In [None]:
# Number of collected rows (one per card).
len(pd_rescaledData)

In [None]:
# Allocate the dense (rows x features) matrix. The row count comes from the
# collected frame and the column count from the HashingTF stage, so the shape
# follows the dataset and the numFeatures setting.
dense_features = np.zeros((len(pd_rescaledData), hashingTF.getNumFeatures()))

In [None]:
# Densify each Spark feature vector into the matching row of the matrix.
# Iterating the column itself keeps the loop tied to the number of collected
# rows rather than to a fixed card count.
for i, feature_vect in enumerate(pd_rescaledData['features']):
    dense_vect = feature_vect.toArray()
    dense_features[i, :] = dense_vect

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of dense_features.
# NOTE(review): the next cell rebinds cos_similarity to np.corrcoef(...), so
# when the notebook runs top to bottom this result is discarded.
cos_similarity = cosine_similarity(dense_features)

In [None]:
# Correlation-coefficient matrix between the rows of dense_features
# (np.corrcoef treats each row as one variable).
# NOTE(review): this overwrites the cosine similarity from the previous cell,
# so despite its name cos_similarity holds correlation coefficients from here
# on — confirm which measure is intended.
cos_similarity = np.corrcoef(dense_features)

In [None]:
# normalizer = Normalizer(inputCol="features", outputCol="normed_features")
# data = normalizer.transform(rescaledData)

In [None]:
# mat = IndexedRowMatrix(data.select("number", "normed_features").rdd.map(lambda row: IndexedRow(row.number, row.normed_features.toArray()))).toBlockMatrix()

In [None]:
# dot = mat.multiply(mat.transpose())

In [None]:
# dot.numCols(), dot.numRows()

In [None]:
# cos_similarity = dot.toLocalMatrix().toArray()

In [None]:
# Display the similarity matrix currently bound to cos_similarity.
cos_similarity

In [None]:
# Number of rows in the number/name frame; should match the feature row count.
len(pd_number_names)

In [None]:
# Character length of every card name, computed element by element.
card_name_column = pd_number_names['name']
names_lens = card_name_column.map(len)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Heatmap of the similarity matrix with card names as x-axis tick labels.
f, axes = plt.subplots(figsize=(25, 25), dpi=300)
c = axes.matshow(cos_similarity)
# One tick per label: derive the tick positions from the label list so the two
# always have the same length.
card_names = pd_number_names['name'].tolist()
plt.xticks(ticks=range(len(card_names)), labels=card_names, rotation=90, fontsize=4)
plt.colorbar(c)

In [None]:
import bct

In [None]:
# Take absolute values so every entry of the matrix is non-negative before it
# is handed to community detection.
# NOTE(review): presumably needed because the Louvain modularity objective
# does not accept negative weights — confirm against the bctpy docs.
cos_similarity = np.abs(cos_similarity)

In [None]:
# Louvain community detection over the card-similarity graph.
# community_louvain interprets the matrix as connection strength (larger
# weight = more strongly connected), so the similarity matrix is passed
# directly. Feeding it the reciprocal would weight dissimilar cards most
# heavily and would yield inf for any zero similarity.
# Returns ca (community label per card) and Q (quality score of the partition).
ca, Q = bct.community_louvain(cos_similarity)

In [None]:
# Display the second value returned by community_louvain
# (presumably the modularity score of the partition — confirm).
Q

In [None]:
# Largest community label present in the assignment vector.
m = ca.max()

In [None]:
# Display the largest community label.
m

In [None]:
# Length of the community-assignment vector (one entry per matrix row).
len(ca)

In [None]:
# Print the members of each community, sorted by card name, with a separator
# line between communities. Labels are walked from 1 up to m inclusive.
for i in range(1, m + 1):
    # np.where yields row positions in ca, so select rows by position (iloc)
    # rather than by index label.
    indices = np.where(ca == i)[0]
    group = pd_number_names.iloc[indices]
    print(group.sort_values('name'))
    print("*" * 10)