In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
 
# if __name__ == "__main__":
# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("TfIdf Example")
    .getOrCreate()
)

# Load the corpus. The file is read without a header row, so Spark names the
# columns _c0, _c1, _c2; the text that gets vectorised is in _c2.
sentenceData = (
    spark.read.format("csv")
    .option("header", "false")
    .load("pw_csmall.csv")
)

sentenceData.show()

# Tokenise the text column into a `words` column.
tokenizer = Tokenizer(inputCol="_c2", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Hash the tokens into term-frequency vectors with 2**12 buckets.
# (CountVectorizer can alternatively be used to get term-frequency vectors.)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**12)
featurizedData = hashingTF.transform(wordsData)

# Fit the IDF weights on the corpus and rescale the term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Persist the fitted model for the later cell that loads "temp/idf-model".
# overwrite() keeps this cell re-runnable: a plain save() raises once the
# target path already exists.
idfModel.write().overwrite().save('temp/idf-model')

rescaledData.select("_c0", "words", "rawFeatures","features").show()
 
# spark.stop() 

+----+---+--------------------+
| _c0|_c1|                 _c2|
+----+---+--------------------+
|null|  0|<p> i am trying t...|
|   0|  1|<p> i understand ...|
|   1|  2|"<p> i am making ...|
|   2|  3|<p> i am trying t...|
|   3|  4|"<p> when i am re...|
|   4|  5|<p> i am trying t...|
|   5|  6|<p> wanted to get...|
|   6|  7|"<p> i'm wonderin...|
|   7|  8|"<p> so i'm tryin...|
|   8|  9|"<p> 1) i created...|
|   9| 10|"<p> does anyone ...|
|  10| 11|"<p> i'm using xa...|
|  11| 12|<p> in sql server...|
|  12| 13|"<p> when i use p...|
|  13| 14|"<p> i need to sc...|
|  14| 15|"<p> i have added...|
|  15| 16|<p> i have a mong...|
|  16| 17|"<p> okay so i'm ...|
|  17| 18|"<p> i'm fetching...|
|  18| 19|"<p> when i'm pla...|
+----+---+--------------------+
only showing top 20 rows

+----+--------------------+--------------------+--------------------+
| _c0|               words|         rawFeatures|            features|
+----+--------------------+--------------------+------------------

In [2]:
# Display the version reported by the running Spark session.
spark.version

'2.3.1'

In [5]:
# Pull the first row back to the driver and print its raw text column.
first_row = rescaledData.take(1)[0]
sample = first_row['_c2']
print(sample)


<p> i am trying to create a report to display a summary of the values of the columns for each row.   a basic analogy would an inventory listing.  say i have about 15 locations like 2a 2b 2c 3a 3b 3c etc.   each location has a variety of items and the items each have a specific set of common descriptions i.e. a rating of 1-9 boolean y or n another boolean y or n.  it looks something like this:</p>   <pre> <code> 2a   4       y       n 2a   5       y       y 2a   5       n       y 2a   6       n       n       ... 2b   4       n       y   2b   4       y       y       ...etc. </code> </pre>   <p> what i would like to produce is a list of locations and summary counts of each attribute:</p>   <pre> <code> location    1 2 3 4 5 6 7 8 9      y  n        y n      total 2a                1 2 1            2  2        2 2        4 2b                2                1  1        2          2 ... ___________________________________________________________ totals            3 2 1            3  3      

In [4]:
# similarities = rescaledData.select("features").rdd.map(lambda v: v)#/(v[0].norm(2)*candidate[0].norm(2)))
# s = similarities.collect()
def cos(a, b):
    """Cosine similarity between the vector stored in ``a[0]`` and ``b``.

    Args:
        a: An indexable record (e.g. a one-column Row) whose first element is
            a vector exposing ``dot(other)`` and ``norm(p)``.
        b: A vector exposing ``norm(p)`` that ``a[0].dot`` accepts.

    Returns:
        ``a[0].dot(b) / (|a[0]| * |b|)`` using the 2-norm, or ``0.0`` when
        either vector has zero norm (the ratio is undefined in that case and
        would otherwise raise or yield NaN, which breaks sorting by score).
    """
    vec = a[0]
    # Compute each norm once; no per-row debug printing inside the RDD map.
    denom = vec.norm(2) * b.norm(2)
    if denom == 0:
        return 0.0
    return vec.dot(b) / denom
# def cos(a,b):
#     print(a[0].dot(b))
#     return a

# Rank every document against the first document's TF-IDF vector and keep the
# five best scores. The query vector is fetched here instead of being read
# from `sample`: the cell above binds `sample` to the raw `_c2` text, and a
# str has no `norm`, so a top-to-bottom run would fail inside `cos`.
query_features = rescaledData.take(1)[0]['features']
sim = (
    rescaledData.select("features").rdd
    .map(lambda row: cos(row, query_features))
    .sortBy(lambda score: -score)
    .take(5)
)
print(sim)

[1.0, 0.19199301929721657, 0.18590004487687317, 0.18255778146344773, 0.1746916387774224]


In [21]:
# Persist the fitted IDF model under 'idf/', replacing any previous copy.
idf_writer = idfModel.write().overwrite()
idf_writer.save('idf/')


In [25]:
# Cache the TF-IDF features on disk for the later cell that reads them back.
# mode("overwrite") keeps the cell re-runnable; the default save mode raises
# when "test2.parquet" already exists.
rescaledData.write.mode("overwrite").parquet("test2.parquet")

In [15]:
# read in file and throw it in
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, IDFModel
from pyspark.sql import SparkSession
 
    
# Query document that the cell below runs through the tokenizer, HashingTF and
# the loaded IDF model.
# NOTE(review): appears to be the same text as the first `_c2` row printed
# earlier in the notebook — confirm against the source CSV.
text = "<p> i am trying to create a report to display a summary of the values of the columns for each row.   a basic analogy would an inventory listing.  say i have about 15 locations like 2a 2b 2c 3a 3b 3c etc.   each location has a variety of items and the items each have a specific set of common descriptions i.e. a rating of 1-9 boolean y or n another boolean y or n.  it looks something like this:</p>   <pre> <code> 2a   4       y       n 2a   5       y       y 2a   5       n       y 2a   6       n       n       ... 2b   4       n       y   2b   4       y       y       ...etc. </code> </pre>   <p> what i would like to produce is a list of locations and summary counts of each attribute:</p>   <pre> <code> location    1 2 3 4 5 6 7 8 9      y  n        y n      total 2a                1 2 1            2  2        2 2        4 2b                2                1  1        2          2 ... ___________________________________________________________ totals            3 2 1            3  3        4 2        6 </code> </pre>   <p> the query returns fields:  </p>   <pre> <code> location_cd string   desc_cd int  y_n_1 string  y_n_2 string </code> </pre>   <p> i have tried grouping by location but cannot get the summaries to work.   i tried putting it in a table but that would only take the original query.  i tried to create datasets for each unit and create variables in each one for each of the criteria but that hasn't worked yet either.  but maybe i am way off track and crosstabs would work better?  i tried that and got a total mess the first time.  maybe a bunch of subreports?</p>   <p> can someone point me in the correct direction please?    it seemed easy when i started out but now i am getting nowhere.  i can get the report to print out the raw data but all i need are totals for each column broken down out by location.  </p> "
# if __name__ == "__main__":
# spark = SparkSession\
#     .builder\
#     .appName("TfIdf Example")\
#     .getOrCreate()

# Wrap the query text in a one-row DataFrame with a dummy label.
query_rows = [(0.0, text)]
sentenceData = spark.createDataFrame(query_rows, ['label', 'sentence'])

# Same tokenise -> hash stages as the fitting cell, applied to the query row.
# (CountVectorizer can alternatively be used to get term-frequency vectors.)
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2 ** 12)
wordsData = tokenizer.transform(sentenceData)
featurizedData = hashingTF.transform(wordsData)

idfPath = 'idf/'
modelPath = "temp/idf-model"

# Re-weight the query's term frequencies with the previously saved IDF model.
loadedModel = IDFModel.load(modelPath)
sample = loadedModel.transform(featurizedData)

# Corpus features written out earlier as parquet.
data = spark.read.parquet("test2.parquet")
# loadedModel.transform(df).head().idf == model.transform(df).head().idf

# loadedIdf = IDF.load(idfPath)
# output = idfModel.transform(featurizedData)
# output.show()
# idf = IDF(inputCol="rawFeatures", outputCol="features")
# idfModel = idf.fit(featurizedData)
# rescaledData = idfModel.transform(featurizedData)

# rescaledData.select("_c0", "words", "rawFeatures","features").show()

#     (0.0, "Welcome to TutorialKart."),
#     (0.0, "Learn Spark at TutorialKart."),
#     (1.0, "Spark Mllib has TF-IDF.")
# ], ["label", "sentence"])

# sentenceData = spark.read.format("csv").option("header","false")\
# .load("pw_csmall.csv")

In [22]:
# Show the transformed query row, then pull its TF-IDF vector to the driver.
sample.show()
first_sample_row = sample.take(1)[0]
sample2 = first_sample_row['features']
print(sample2)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|<p> i am trying t...|[<p>, i, am, tryi...|(4096,[5,11,31,32...|(4096,[5,11,31,32...|
+-----+--------------------+--------------------+--------------------+--------------------+

(4096,[5,11,31,32,57,101,159,179,189,191,192,240,244,293,365,382,392,403,404,406,500,566,630,641,658,672,695,721,829,835,877,933,950,987,991,1004,1025,1075,1089,1126,1186,1241,1265,1267,1269,1309,1310,1329,1347,1372,1377,1411,1414,1432,1438,1447,1455,1531,1565,1575,1591,1682,1737,1775,1804,1834,1877,1902,1919,2024,2071,2072,2081,2088,2130,2135,2151,2189,2213,2227,2248,2275,2312,2380,2391,2423,2439,2485,2488,2489,2492,2520,2526,2561,2562,2575,2618,2623,2647,2651,2666,2702,2711,2750,2760,2775,2776,2789,2791,2818,2832,2833,2855,2899,303

In [24]:
def cos(a, b):
    """Cosine similarity between the vector stored in ``a[0]`` and ``b``.

    Args:
        a: An indexable record (e.g. a one-column Row) whose first element is
            a vector exposing ``dot(other)`` and ``norm(p)``.
        b: A vector exposing ``norm(p)`` that ``a[0].dot`` accepts.

    Returns:
        ``a[0].dot(b) / (|a[0]| * |b|)`` using the 2-norm, or ``0.0`` when
        either vector has zero norm (the ratio is undefined in that case and
        would otherwise raise or yield NaN, which breaks sorting by score).
    """
    vec = a[0]
    # Compute each norm once; no per-row debug printing inside the RDD map.
    denom = vec.norm(2) * b.norm(2)
    if denom == 0:
        return 0.0
    return vec.dot(b) / denom

# Score every stored feature vector against the query vector and keep the
# five highest similarities.
feature_rows = data.select("features").rdd
scored = feature_rows.map(lambda row: cos(row, sample2))
sim = scored.sortBy(lambda score: -score).take(5)
print(sim)

[1.0, 0.19199301929721657, 0.18590004487687317, 0.18255778146344773, 0.1746916387774224]


In [48]:
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# UDF that renders an iterable column value as comma-separated text.
# StringType() is the UDF's return type and therefore belongs to udf(), not to
# str.join(): join() accepts exactly one argument, so the previous placement
# raised TypeError as soon as the UDF was evaluated.
sparse_format_udf = udf(lambda x: ','.join([str(elem) for elem in x]), StringType())

def array_to_string(my_list):
    """Format the items of ``my_list`` as ``[a,b,c]``, applying ``str`` to each item."""
    joined = ','.join(map(str, my_list))
    return '[{}]'.format(joined)

# Spark UDF wrappers; both declare a string return type.
array_to_string_udf = udf(f=array_to_string, returnType=StringType())

dense_format_udf = udf(f=lambda value: str(value), returnType=StringType())



# df.show()
# df = rescaledData.withColumn('features', dense_format_udf(col('features').cast("string")))
# df2 = df.withColumn('strFeatures', col('features').cast("string"))
# df2.show()
# df.write.option("delimiter",'\t').csv('test.txt')
# df.write.option("delimiter", "\t").csv('test.csv')


# df = rescaledData.withColumn('features2', sparse_format_udf(col('features')))
# df.show()
# df2 = df.withColumn('features', lit("features").cast("string"))
# df = df.withColumn('features',array_to_string_udf(d["features"]))

# df.head()

# df2.write.option("delimiter", "\t").csv('test.csv')

AttributeError: 'DataFrameWriter' object has no attribute 'txt'

In [56]:
# Time the load of "test.parquet" into `d`.
# NOTE(review): the visible cells only write "test2.parquet" — confirm where
# "test.parquet" comes from.
%time d = spark.read.parquet("test.parquet")

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 153 ms


In [57]:
# Print a tabular preview of the DataFrame loaded from "test.parquet".
d.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Learn Spark at Tu...|[learn, spark, at...|(20,[4,5,15,16],[...|(20,[4,5,15,16],[...|
|  0.0|Welcome to Tutori...|[welcome, to, tut...|(20,[4,8,9],[1.0,...|(20,[4,8,9],[0.28...|
|  1.0|Spark Mllib has T...|[spark, mllib, ha...|(20,[0,1,5,14],[1...|(20,[0,1,5,14],[0...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [16]:
from pyspark.ml.feature import IDF, IDFModel
from pyspark.ml.linalg import DenseVector
# Minimal IDF example: three dense term-frequency vectors in a "tf" column.
# Only the final expression of the cell is displayed; the bare expressions in
# between are kept with their expected values noted in the comment below each.
df = spark.createDataFrame([(DenseVector([1.0, 2.0]),),
     (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df)
model.idf
# DenseVector([0.0, 0.0])
model.transform(df).head().idf
# DenseVector([0.0, 0.0])
# setParams mutates `idf` in place: its output column is "freqs" from here on.
idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs
# DenseVector([0.0, 0.0])
# Per-call parameter overrides, passed to fit() without changing `idf`.
params = {idf.minDocFreq: 1, idf.outputCol: "vector"}
idf.fit(df, params).transform(df).head().vector
# DenseVector([0.2877, 0.0])

# Round-trip the estimator. overwrite() keeps the cell re-runnable: a plain
# save() raises once the target path already exists.
idfPath = "tp/idf"
idf.write().overwrite().save(idfPath)
loadedIdf = IDF.load(idfPath)
loadedIdf.getMinDocFreq() == idf.getMinDocFreq()

# Round-trip the fitted model and check that it transforms identically.
modelPath = "tp/idf-model"
model.write().overwrite().save(modelPath)
loadedModel = IDFModel.load(modelPath)
loadedModel.transform(df).head().idf == model.transform(df).head().idf

True

In [None]:
# hashingTF.save(hashingTFPath)
# >>> loadedHashingTF = HashingTF.load(hashingTFPath)
# >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()

In [None]:
# postings list redis cassandra
