In [1]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName("Data Clean")\
    .getOrCreate()

# dirtyData.select('cleaned_body').show()

In [5]:
# CUSTOM TRANSFORMER ----------------------------------------------------------------
class TextCleaner(Transformer):
    """
    A custom Transformer which drops all columns that have at least one of the
    words from the banned_list in the name.
    """

    def __init__(self, inputCol='body', outputCol='cleaned_body'):
        super(TextCleaner, self).__init__()
#         self.banned_list = banned_list
    def clean(line):
        print(line)
        line = line.lower().replace("\n"," ").replace("\r","").replace(',',"").replace(">","> ").replace("<", " <")
        return line
    clean_udf = udf(lambda r: clean(r), StringType())

    def _transform(self, df: DataFrame) -> DataFrame:
        df = df.withColumn('cleaned_body', clean_udf(df['body']))
        df = df.drop('body')
    #         df = df.drop(*[x for x in df.columns if any(y in x for y in self.banned_list)])
        return df

In [15]:
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, IDFModel
from pyspark.sql import DataFrame

def clean(line):
    line = line.lower().replace("\n"," ").replace("\r","").replace(',',"").replace(">","> ").replace("<", " <")
    return line
clean_udf = udf(lambda r: clean(r), StringType())

dirtyData = spark.read.csv("sDirtyc.csv.gz", header=True, multiLine=True, escape='"')

# cleaner = TextCleaner(inputCol='body', outputCol='cleaned_body')
# dirtyData = cleaner.transform(dirtyData)
dirtyData = dirtyData.withColumn('cleaned_body', clean_udf(dirtyData['body']))

tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**12)
idf = IDF(inputCol="rawFeatures", outputCol="features")
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])

# Fit the pipeline to training documents.
model = pipeline.fit(dirtyData)

dirtyData = model.transform(dirtyData)

# modelPath = "temp/idf-model"
# model.save(modelPath)
# loadedModel = IDFModel.load(modelPath)
# sample = loadedModel.transform(featurizedData)

# data = spark.read.format("parquet").load("test2.parquet")

In [16]:
dirtyData.select('features').show()

+--------------------+
|            features|
+--------------------+
|(4096,[33,45,53,1...|
|(4096,[203,237,24...|
|(4096,[16,35,45,5...|
|(4096,[56,87,244,...|
|(4096,[8,12,19,30...|
|(4096,[5,8,32,77,...|
|(4096,[12,28,30,5...|
|(4096,[12,123,131...|
|(4096,[8,22,26,32...|
|(4096,[16,56,141,...|
|(4096,[19,87,101,...|
|(4096,[56,159,244...|
|(4096,[15,56,123,...|
|(4096,[8,33,45,12...|
|(4096,[2,8,32,99,...|
|(4096,[12,56,123,...|
|(4096,[244,513,61...|
|(4096,[25,244,325...|
|(4096,[1,56,140,1...|
|(4096,[56,77,159,...|
+--------------------+
only showing top 20 rows



In [120]:
model.save('model2')
# idf = model.stages[2]
# idf.save('')

AttributeError: 'list' object has no attribute 'save'

In [93]:
test = dirtyData.take(3)
test = [x['cleaned_body'] for x in test]
print(test[0])
# test = spark.createDataFrame(dirtyData.take(3))

 <p> it seems openstreetmap has changed their licensing scheme as a result lots of data were deleted as shown in the attached picture which is grafton nsw 2460 australia almost all streets are gone. </p>    <p> my question is: is there any way to download the old data somewhere by providing lat/lng's? (i understand that there could be some old archives for world or some countries but that doesn't work for me because at the moment my application is not capable to process those massive data files) </p>    <p> if there's no way to download the old data is there any other good free map data (not images) available? </p>    <p> also i've noticed that there're 4 options at the top right corner the other three except standard seem to be showing all streets. they are (at least mapquest) based on osm data but not the one we get from the "export" section of openstreetmap.org is that correct?  </p>    <p> edit: oops as a new user i cannot post images.. the below link may work (or may not): </p>   

In [94]:
from pyspark.sql.types import StringType
stest = spark.createDataFrame([(x,) for x in test], ['cleaned_body'])

In [95]:
prediction = model.transform(stest)
selected = prediction.select("features")
for row in selected.collect():
    print(row)
#     rid, text, prob, prediction = row
#     print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))

Row(features=SparseVector(4096, {33: 2.0502, 45: 1.8425, 53: 7.8439, 104: 9.6865, 131: 2.5357, 146: 3.5165, 149: 3.5165, 159: 0.5208, 179: 2.8234, 193: 3.2288, 244: 0.2031, 309: 2.1302, 311: 3.4482, 328: 2.5357, 331: 2.4278, 338: 1.5241, 392: 2.9593, 406: 2.2172, 408: 3.2288, 429: 3.5165, 541: 2.5357, 542: 3.2288, 602: 1.7247, 618: 3.0057, 621: 3.922, 744: 3.2288, 790: 3.2288, 836: 3.0482, 843: 2.6692, 892: 3.2288, 901: 3.2288, 911: 3.5165, 987: 2.6692, 988: 1.573, 991: 12.301, 993: 1.3193, 1042: 3.8141, 1141: 3.922, 1155: 3.922, 1267: 1.2897, 1269: 0.3111, 1273: 1.3962, 1278: 3.922, 1280: 2.3125, 1322: 12.0227, 1347: 1.357, 1354: 3.922, 1380: 5.0714, 1411: 1.7247, 1429: 3.5165, 1432: 3.0057, 1438: 1.4921, 1447: 3.5155, 1480: 3.922, 1502: 3.0057, 1575: 5.0714, 1591: 1.2478, 1593: 2.2172, 1689: 3.0057, 1706: 1.7247, 1769: 3.922, 1788: 3.4482, 1900: 3.922, 1911: 7.033, 1934: 1.9071, 1954: 0.8085, 1994: 1.0316, 2041: 1.7819, 2071: 1.6194, 2087: 7.033, 2130: 0.4168, 2179: 3.2288, 2227: 3.5

In [97]:
# now read all the data from the text file and map a function to write it to redis
dd = model.transform(dirtyData).select('id','title','cleaned_body','accepted_answer_id','creation_date','features', 'tags')
# %time dd.show()
# test.show()

In [29]:
# write a simple O(n) function to pull out the tags and also write the IDs
def save_to_redis(row):
    
#     print(row['tags'])
    tags = row['tags'].split('|')
    idd = row['id']
    title = row['title']
    body = row['cleaned_body']
    creation = row['creation_date']
    acc = row['accepted_answer_id']
    embed = row['features']
    print("|".join([str(embed.size), str(list(embed.indices)),str(list(embed.values))]))
#     idd, title, body, acc, creation, embed, tags = row[0]
#     print(tags, idd)
#     print(idd, title, 'b', int(float(acc)), creation, embed)
    for tag in tags:
        try:
            curr = r.get(tag)
        except:
            curr = ""
            print('%s not found' %tag)
        r.set(tag, curr+","+idd)
    return row
    #for item in row[0]:
        
#         print(type(item))
    # want to save two simple 
#     for row in df:
#         print(row)

In [34]:
temp = dirtyData.take(10)
save_to_redis(temp[5])

4096|[5, 8, 32, 77, 99, 122, 123, 167, 191, 240, 244, 331, 347, 353, 365, 392, 400, 424, 540, 589, 606, 610, 611, 658, 668, 687, 754, 755, 823, 836, 862, 876, 932, 991, 1004, 1017, 1042, 1069, 1102, 1167, 1186, 1265, 1267, 1269, 1281, 1376, 1378, 1411, 1417, 1428, 1432, 1438, 1447, 1467, 1499, 1502, 1565, 1591, 1593, 1634, 1673, 1687, 1740, 1757, 1788, 1806, 1826, 1840, 1895, 1920, 1941, 1976, 1994, 2045, 2071, 2088, 2130, 2135, 2163, 2213, 2246, 2267, 2274, 2376, 2395, 2408, 2579, 2583, 2605, 2621, 2638, 2647, 2651, 2660, 2695, 2733, 2789, 2818, 2833, 2853, 2859, 2886, 2899, 2907, 2912, 2946, 3051, 3053, 3152, 3216, 3236, 3270, 3305, 3312, 3331, 3389, 3392, 3411, 3420, 3450, 3458, 3483, 3565, 3586, 3599, 3601, 3619, 3632, 3662, 3691, 3802, 3874, 3921, 3924, 3930, 3937, 3959, 4044, 4078, 4084]|[3.5165082281731497, 1.0316015783851493, 5.6467220952264086, 1.7247487589450947, 2.6692103677859462, 2.8233610476132043, 2.0501711593797225, 3.228826155721369, 1.8425317946014783, 2.1302138670532

NameError: name 'r' is not defined

In [None]:
import redis

redis_host = '10.0.0.7'
redis_port = 6379
redis_password = 'AhrIykRVjO9GHA52kmYou7iUrsDbzJL+/7vjeTYhsLmpskyAY8tnucf4QJ7FpvVzFNNKuIZVVkh1LRxF'

def hello_redis(n):
    """Example Hello Redis Program"""
   
    # step 3: create the Redis Connection object
    try:
   
        # The decode_repsonses flag here directs the client to convert the responses from Redis into Python strings
        # using the default encoding utf-8.  This is client specific.
        #r = redis.StrictRedis(host=redis_host, port=redis_port, password=redis_password, decode_responses=True)
        r = redis.Redis(host=redis_host, port=redis_port, password=redis_pasword)
        # step 4: Set the hello message in Redis
        r.set("t1", "1")

        # step 5: Retrieve the hello message from Redis
        msg = r.get("t1")
        for i in range(n):
            r.set("t1", msg+str(i))
            msg = r.get("t1")
        print(msg)        
   
    except Exception as e:
        print(e)

if __name__ == '__main__':
    hello_redis(10)



In [None]:
    for tag in tags:
        curr = r.get(tag)
        if curr is None:
            r.set(tag, idd)
        else:
            r.set(tag, curr+","+idd)

In [31]:
a = "blahblah blah|(4096|[1,2,5,6,8,10]|[15,0,12,15.5,20])"

In [33]:
import numpy as np
# b = np.fromstring(a)
def string_to_sparse(string):
    string = string.split('|')
    print(string)
    title = string[0]
    size = int(string[1][1:])
    inds = np.fromstring(string[2][1:-1], sep=',')
    vals = np.fromstring(string[3][1:-2], sep=',')
    print(size, inds, vals)
    return

# def string_to_sparse(string):
#     escape = ["[","]"]
#     ins = False
#     inds = []
#     for i, char in enumerate(string[1:-1]):
#         if char in escape:
#             if ins == True:
#                 ins = False
#             else:
#                 ins = True
# #             print(ins)
#         elif char == ',' and ins == False:
#             inds.append(i)
# #     print(inds)
#     return
string_to_sparse(a)

['blahblah blah', '(4096', '[1,2,5,6,8,10]', '[15,0,12,15.5,20])']
4096 [ 1.  2.  5.  6.  8. 10.] [15.   0.  12.  15.5 20. ]


In [19]:
# prin
from scipy.sparse import csr_matrix

In [23]:
data = [15,0,12,15.5,20]
row = [1,2,5,6,8,10]
size = 4096
aa = csr_matrix((data, (row,)), shape=(size,))

TypeError: invalid input format