In [1]:
import findspark
findspark.init('C:/spark')

In [2]:
from pyspark import SparkContext

# Context used by the RDD cells: master "local", application name "pysk_midsem".
sc = SparkContext(master="local", appName="pysk_midsem")

In [3]:
from pyspark.sql import SparkSession
# Session used by every DataFrame cell below; getOrCreate() returns the
# already-active session when there is one instead of building a second.
spark= SparkSession.builder.getOrCreate()

In [4]:
from pyspark.sql.functions import *

In [47]:
# Load the customer records: the first line is the header and column types
# are inferred from the data.
customer_reader = spark.read.option('header', True).option('InferSchema', True)
df = customer_reader.csv('customer(1).csv')
df.show(3)

+---+-----------+-----------+
| id|       name|       city|
+---+-----------+-----------+
|  1|  John Doe |   New York|
|  2|    Jon Doe|   New York|
|  3|Johnny Doe |Los Angeles|
+---+-----------+-----------+
only showing top 3 rows



In [48]:
# cleaning, normalization

# Each character outside the allowed set becomes a single space; the result is
# then stripped at both ends and lower-cased. Interior runs of spaces are kept.
name_normalized = lower(trim(regexp_replace(col('name'), '[^A-Za-z]', " ")))
city_normalized = lower(trim(regexp_replace(col('city'), '[^A-Za-z0-9]', " ")))

df_cleaned = (
    df.withColumn('name_cleaned', name_normalized)
      .withColumn('city_cleaned', city_normalized)
)
              

In [49]:
df_cleaned.show(3)

+---+-----------+-----------+------------+------------+
| id|       name|       city|name_cleaned|city_cleaned|
+---+-----------+-----------+------------+------------+
|  1|  John Doe |   New York|    john doe|    new york|
|  2|    Jon Doe|   New York|     jon doe|    new york|
|  3|Johnny Doe |Los Angeles|  johnny doe| los angeles|
+---+-----------+-----------+------------+------------+
only showing top 3 rows



In [50]:
# blocking
# The block key is the first four characters of the whitespace-free name
# followed by the first four characters of the whitespace-free city.
name_prefix = trim(regexp_replace(col("name_cleaned"), "\\s+", "")).substr(1, 4)
city_prefix = trim(regexp_replace(col("city_cleaned"), '\\s+', "")).substr(1, 4)

df_blocked = df_cleaned.withColumn("block_key", concat_ws("", name_prefix, city_prefix))

In [51]:
df_blocked.show(3)

+---+-----------+-----------+------------+------------+---------+
| id|       name|       city|name_cleaned|city_cleaned|block_key|
+---+-----------+-----------+------------+------------+---------+
|  1|  John Doe |   New York|    john doe|    new york| johnnewy|
|  2|    Jon Doe|   New York|     jon doe|    new york| jondnewy|
|  3|Johnny Doe |Los Angeles|  johnny doe| los angeles| johnlosa|
+---+-----------+-----------+------------+------------+---------+
only showing top 3 rows



In [52]:
# generate candidates
# Two aliases of the same blocked frame, so a self-join can refer to each
# side's columns unambiguously ("a.<col>" / "b.<col>").
a = df_blocked.alias('a')
b = df_blocked.alias('b')
a.show(3)
b.show(3)

+---+-----------+-----------+------------+------------+---------+
| id|       name|       city|name_cleaned|city_cleaned|block_key|
+---+-----------+-----------+------------+------------+---------+
|  1|  John Doe |   New York|    john doe|    new york| johnnewy|
|  2|    Jon Doe|   New York|     jon doe|    new york| jondnewy|
|  3|Johnny Doe |Los Angeles|  johnny doe| los angeles| johnlosa|
+---+-----------+-----------+------------+------------+---------+
only showing top 3 rows

+---+-----------+-----------+------------+------------+---------+
| id|       name|       city|name_cleaned|city_cleaned|block_key|
+---+-----------+-----------+------------+------------+---------+
|  1|  John Doe |   New York|    john doe|    new york| johnnewy|
|  2|    Jon Doe|   New York|     jon doe|    new york| jondnewy|
|  3|Johnny Doe |Los Angeles|  johnny doe| los angeles| johnlosa|
+---+-----------+-----------+------------+------------+---------+
only showing top 3 rows



In [53]:
# Pair up records that share a block key. Keeping only a.id < b.id drops
# self-pairs and keeps each unordered pair exactly once.
same_block = col("a.block_key") == col("b.block_key")
left_id_is_smaller = col("a.id") < col('b.id')
candidatePairs = a.join(b, same_block).filter(left_id_is_smaller)

In [54]:
candidatePairs.show(3)

+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+
| id|      name|    city|name_cleaned|city_cleaned|block_key| id|        name|    city|name_cleaned|city_cleaned|block_key|
+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+
|  1| John Doe |New York|    john doe|    new york| johnnewy| 41|   John Doe!|New York|    john doe|    new york| johnnewy|
|  1| John Doe |New York|    john doe|    new york| johnnewy| 40|John A. D$e |New York| john a  d e|    new york| johnnewy|
|  1| John Doe |New York|    john doe|    new york| johnnewy| 37|    John Dœ |New York|      john d|    new york| johnnewy|
+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+
only showing top 3 rows



In [55]:
# similarity
# Edit distance between the two cleaned names of every candidate pair.
name_distance = levenshtein(col("a.name_cleaned"), col("b.name_cleaned"))
df_similars = candidatePairs.withColumn("levenshtein", name_distance)
df_similars.show(3)

+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+-----------+
| id|      name|    city|name_cleaned|city_cleaned|block_key| id|        name|    city|name_cleaned|city_cleaned|block_key|levenshtein|
+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+-----------+
|  1| John Doe |New York|    john doe|    new york| johnnewy| 41|   John Doe!|New York|    john doe|    new york| johnnewy|          0|
|  1| John Doe |New York|    john doe|    new york| johnnewy| 40|John A. D$e |New York| john a  d e|    new york| johnnewy|          4|
|  1| John Doe |New York|    john doe|    new york| johnnewy| 37|    John Dœ |New York|      john d|    new york| johnnewy|          2|
+---+----------+--------+------------+------------+---------+---+------------+--------+------------+------------+---------+-----------+
only showing top 3 rows



In [56]:
# Flatten the joined pair into one row with explicit *_a / *_b column names.
# The block key is identical on both sides, so only side "a" is kept.
pair_columns = [
    col("a.id").alias('id_a'),
    col('b.id').alias('id_b'),
    col('a.name_cleaned').alias('name_a'),
    col('b.name_cleaned').alias('name_b'),
    col('a.city_cleaned').alias('city_a'),
    col('b.city_cleaned').alias('city_b'),
    col('a.block_key'),
    col('levenshtein').alias('lev'),
]
df_similars = df_similars.select(*pair_columns)


In [57]:
df_similars.show(3)

+----+----+--------+-----------+--------+--------+---------+---+
|id_a|id_b|  name_a|     name_b|  city_a|  city_b|block_key|lev|
+----+----+--------+-----------+--------+--------+---------+---+
|   1|  41|john doe|   john doe|new york|new york| johnnewy|  0|
|   1|  40|john doe|john a  d e|new york|new york| johnnewy|  4|
|   1|  37|john doe|     john d|new york|new york| johnnewy|  2|
+----+----+--------+-----------+--------+--------+---------+---+
only showing top 3 rows



In [58]:
# Normalised similarity: 1 - (edit distance / length of the longer name).
longer_name_len = greatest(length(col('name_a')), length(col('name_b')))
df_similars = df_similars.withColumn('similarity', 1 - col('lev') / longer_name_len)

In [59]:
df_similars.show(3)

+----+----+--------+-----------+--------+--------+---------+---+------------------+
|id_a|id_b|  name_a|     name_b|  city_a|  city_b|block_key|lev|        similarity|
+----+----+--------+-----------+--------+--------+---------+---+------------------+
|   1|  41|john doe|   john doe|new york|new york| johnnewy|  0|               1.0|
|   1|  40|john doe|john a  d e|new york|new york| johnnewy|  4|0.6363636363636364|
|   1|  37|john doe|     john d|new york|new york| johnnewy|  2|              0.75|
+----+----+--------+-----------+--------+--------+---------+---+------------------+
only showing top 3 rows



In [60]:
# matching
# A pair is flagged as a match (1) when its similarity is strictly above 0.7,
# otherwise 0.
is_similar_enough = col("similarity") > 0.7
df_matched = df_similars.withColumn("ismatch", when(is_similar_enough, 1).otherwise(0))

In [65]:
df_matched.count()

80

In [69]:
df_matched.show(3)

+----+----+--------+-----------+--------+--------+---------+---+------------------+-------+
|id_a|id_b|  name_a|     name_b|  city_a|  city_b|block_key|lev|        similarity|ismatch|
+----+----+--------+-----------+--------+--------+---------+---+------------------+-------+
|   1|  41|john doe|   john doe|new york|new york| johnnewy|  0|               1.0|      1|
|   1|  40|john doe|john a  d e|new york|new york| johnnewy|  4|0.6363636363636364|      0|
|   1|  37|john doe|     john d|new york|new york| johnnewy|  2|              0.75|      1|
+----+----+--------+-----------+--------+--------+---------+---+------------------+-------+
only showing top 3 rows



In [62]:
# Bring the matched id pairs back to the driver as a list of Rows.
matched_pairs = df_matched.filter(col('ismatch') == 1).select('id_a', 'id_b')
linked = matched_pairs.collect()

In [63]:
# Disjoint-set forest: maps a node to its parent. A node that is absent from
# the dict is its own root.
parent= {}

def find(x):
    """Return the root (set representative) of ``x``, compressing the path.

    Implemented iteratively: ``union`` only ever links roots, so parent
    chains can grow as long as the number of linked ids, and a recursive walk
    would hit Python's recursion limit on such a chain.
    """
    # First pass: climb to the root.
    root = x
    while parent.get(root, root) != root:
        root = parent[root]

    # Second pass: point every node on the walked path straight at the root.
    while parent.get(x, x) != x:
        parent[x], x = root, parent[x]

    return root

def union(x,y):
    """Merge the sets containing ``x`` and ``y``; the root of ``x`` stays the root."""
    px,py= find(x),find(y)

    if px!=py:
        parent[py]=px

# Merge the two ids of every matched pair into the same set.
for pair in linked:
    union(pair['id_a'], pair['id_b'])

# Group ids by their set representative. Only ids stored as keys of `parent`
# are visited, so a representative shows up as a dict key but not inside its
# own member list.
cluster = {}
for member in parent:
    representative = find(member)
    if representative not in cluster:
        cluster[representative] = []
    cluster[representative].append(member)

cluster

{1: [41, 37, 36, 19, 18, 9, 6, 40],
 2: [32, 8],
 3: [33, 15, 10],
 4: [35, 13, 11, 5],
 17: [39],
 20: [50, 42, 28],
 21: [43],
 22: [44],
 24: [46],
 25: [47],
 26: [48],
 27: [49]}

## ALS

In [21]:
from pyspark.ml.recommendation import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *

In [22]:
# Load the review records from 'movies.json'; the schema is inferred by the reader.
dff= spark.read.json('movies.json')


In [23]:
dff.printSchema()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [24]:
dff.select('product_id','user_id','score').show(3)

+----------+--------------+-----+
|product_id|       user_id|score|
+----------+--------------+-----+
|B003AI2VGA|A141HP4LYPWMSR|  3.0|
|B003AI2VGA|A328S9RN3U5M68|  3.0|
|B003AI2VGA|A1I7QGUDP043DG|  5.0|
+----------+--------------+-----+
only showing top 3 rows



In [25]:
def _hex_digest_to_id(hex_digest):
    """Parse a hexadecimal string and reduce it modulo 10**8.

    NOTE(review): the modulo can map two different digests to the same id,
    which would merge distinct users/products downstream. Confirm this is
    acceptable for the dataset size.
    """
    return int(hex_digest, 16) % (10**8)

hex_to_bigint = udf(_hex_digest_to_id, LongType())

In [27]:
# Derive integer ids from the SHA-1 hex digest of each string id.
user_id_int = hex_to_bigint(sha1(col('user_id')))
product_id_int = hex_to_bigint(sha1(col('product_id')))

dff_hsh = (
    dff.withColumn("usr_hasint", user_id_int)
       .withColumn("prod_hasint", product_id_int)
)

In [28]:
dff_hsh.select("product_id","prod_hasint","user_id","usr_hasint","score").show(3)

+----------+-----------+--------------+----------+-----+
|product_id|prod_hasint|       user_id|usr_hasint|score|
+----------+-----------+--------------+----------+-----+
|B003AI2VGA|   51259877|A141HP4LYPWMSR|   5460385|  3.0|
|B003AI2VGA|   51259877|A328S9RN3U5M68|  64843361|  3.0|
|B003AI2VGA|   51259877|A1I7QGUDP043DG|   1480848|  5.0|
+----------+-----------+--------------+----------+-----+
only showing top 3 rows



In [29]:
# 80/20 train/test split. The seed is pinned so the split, and every metric
# computed from it, can be reproduced on a re-run.
(train,test)= dff_hsh.randomSplit([0.8,0.2], seed=42)

In [30]:
# ALS estimator on the integer id columns. coldStartStrategy='drop' removes
# rows whose prediction would be NaN, so the evaluator is not poisoned by them.
# The seed is set explicitly so the fitted factors can be reproduced between runs.
als=ALS(
    userCol= 'usr_hasint',
    itemCol= 'prod_hasint',
    ratingCol='score',
    maxIter=10, rank=10, regParam=0.1,
    coldStartStrategy='drop',
    seed=42
)

In [31]:
# Fit the ALS estimator on the training split.
model= als.fit(train)

In [32]:
# Score the held-out split; adds a 'prediction' column to the test rows.
preds= model.transform(test)


In [33]:
preds.show(3)

+-----------+----------+-------------------+--------------------+-----+--------------------+----------+--------------+----------+-----------+----------+
|helpfulness|product_id|       profile_name|              review|score|             summary|      time|       user_id|usr_hasint|prod_hasint|prediction|
+-----------+----------+-------------------+--------------------+-----+--------------------+----------+--------------+----------+-----------+----------+
|        0/0|B00005O438|sexylove "sexylove"|Cujo is one of th...|  5.0|CUJO SCARES ME AN...|1132012800|A2EIK4L9730WWJ|  46734671|   95464525| 2.4778795|
|        0/0|B00015HX90|     MSWenger "MSW"|One of the most w...|  5.0|    Timeless Classic|1226448000|A10FH6E78NLZXW|  95104474|   46954731| 4.8464494|
|        0/0|B0002EXFNI|      David Baldwin|At it's best "The...|  4.0|Better Than Avera...|1305244800|A2B73CL3QSYWLB|  80209220|   36602589| 3.8921976|
+-----------+----------+-------------------+--------------------+-----+-----------

In [34]:
# RMSE between the observed 'score' and the model's 'prediction'.
evl = RegressionEvaluator(
    metricName='rmse',
    labelCol='score',
    predictionCol='prediction',
)

In [35]:
# RMSE of the baseline ALS model on the test predictions.
rms= evl.evaluate(preds)


In [36]:
rms

1.8336504933191677

In [37]:
# hyper param tuning 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [38]:
# Top-3 item recommendations for every user known to the model.
rcd=model.recommendForAllUsers(3)
rcd.show(3)
# Top-3 user recommendations for every item; truncate=False prints the full arrays.
rcd2=model.recommendForAllItems(3)
rcd2.show(3,truncate=False)

+----------+--------------------+
|usr_hasint|     recommendations|
+----------+--------------------+
|      5652|[{96052693, 6.602...|
|     12260|[{34166803, 6.856...|
|     26780|[{34166803, 3.412...|
+----------+--------------------+
only showing top 3 rows

+-----------+--------------------------------------------------------------------+
|prod_hasint|recommendations                                                     |
+-----------+--------------------------------------------------------------------+
|72430      |[{8434038, 5.5753965}, {51493655, 5.146796}, {98722846, 5.072886}]  |
|109252     |[{50187053, 5.671388}, {46555864, 5.4793205}, {63772730, 5.4200892}]|
|152962     |[{25534919, 7.1694813}, {70060750, 7.048391}, {91830145, 6.5907264}]|
+-----------+--------------------------------------------------------------------+
only showing top 3 rows



In [40]:
# Base estimator for the grid search. rank / maxIter / regParam given here are
# only starting values; the parameter grid overrides them per candidate.
# The seed is set explicitly so each candidate's fit can be reproduced.
als_tuned= ALS(
    userCol='usr_hasint', itemCol='prod_hasint', ratingCol='score',coldStartStrategy='drop',
    rank=10,
    maxIter=10,
    regParam=0.1,
    seed=42,
)

In [41]:
# 2 x 2 x 2 = 8 candidate settings over rank, iteration count and regularisation.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(als_tuned.rank, [8, 10])
grid_builder = grid_builder.addGrid(als_tuned.maxIter, [10, 15])
grid_builder = grid_builder.addGrid(als_tuned.regParam, [0.1, 1])
grid = grid_builder.build()

In [42]:
# RMSE evaluator used both to select the best grid candidate and to score it.
evl2 = RegressionEvaluator(
    metricName='rmse',
    labelCol='score',
    predictionCol='prediction',
)

In [43]:
# 3-fold cross-validation over the parameter grid, scored with evl2.
# The seed pins the fold assignment so the selected model can be reproduced.
cross= CrossValidator(
    estimator=als_tuned, estimatorParamMaps=grid, numFolds=3, evaluator=evl2,
    seed=42
)

In [44]:
# Run the grid search on the training split (one fit per grid candidate per fold).
cvmod= cross.fit(train)

In [45]:
# Score the held-out split with the cross-validated model.
preds_tuned= cvmod.transform(test)
preds_tuned.show(3)

+-----------+----------+-------------------+--------------------+-----+--------------------+----------+--------------+----------+-----------+----------+
|helpfulness|product_id|       profile_name|              review|score|             summary|      time|       user_id|usr_hasint|prod_hasint|prediction|
+-----------+----------+-------------------+--------------------+-----+--------------------+----------+--------------+----------+-----------+----------+
|        0/0|B00005O438|sexylove "sexylove"|Cujo is one of th...|  5.0|CUJO SCARES ME AN...|1132012800|A2EIK4L9730WWJ|  46734671|   95464525| 3.5288506|
|        0/0|B00015HX90|     MSWenger "MSW"|One of the most w...|  5.0|    Timeless Classic|1226448000|A10FH6E78NLZXW|  95104474|   46954731| 3.9134817|
|        0/0|B0002EXFNI|      David Baldwin|At it's best "The...|  4.0|Better Than Avera...|1305244800|A2B73CL3QSYWLB|  80209220|   36602589| 2.9984193|
+-----------+----------+-------------------+--------------------+-----+-----------

In [46]:
# RMSE of the tuned model on the test predictions (compare with `rms` above).
rmse= evl2.evaluate(preds_tuned)
rmse

1.587379442437405

## l1,l2

In [5]:
# Sample request log. Each tuple is (client IP, request as "METHOD/path", text).
# NOTE(review): the third field is presumably the request payload; it is not
# used by the cells below.
network_data = [
    ("192.68.1.2", "POST/jahshs/jshsh", "yaha bhi kuch tha"),
    ("190.43.1.2", "GET/....", " "),
    ("10.0.0.1", "POST/data/upload", "some data here"),
    ("192.68.1.2", "GET/index.html", "html content"),
    ("10.0.0.2", "POST/login", "user login"),
    ("10.0.0.2", "POST/logout", "user logout"),
]

In [6]:
# Distribute the sample request log as an RDD of tuples.
rdd=sc.parallelize(network_data)


In [7]:
rdd.collect()

[('192.68.1.2', 'POST/jahshs/jshsh', 'yaha bhi kuch tha'),
 ('190.43.1.2', 'GET/....', ' '),
 ('10.0.0.1', 'POST/data/upload', 'some data here'),
 ('192.68.1.2', 'GET/index.html', 'html content'),
 ('10.0.0.2', 'POST/login', 'user login'),
 ('10.0.0.2', 'POST/logout', 'user logout')]

A. Count the number of HTTP requests of each type (e.g. POST, GET).

B. Print all the IP addresses that made a POST request, without duplicates.

In [10]:
# Task A: the HTTP method is the text before the first "/" of the request field.
m1 = rdd.map(lambda record: record[1].split("/")[0])
# Classic word count: one (method, 1) pair per request, summed per method.
cnting = m1.map(lambda method: (method, 1)).reduceByKey(lambda left, right: left + right)
cnting.collect()

[('POST', 4), ('GET', 2)]

In [11]:
# Task B: build (method, ip) pairs, keep the POST ones, then drop repeated pairs.
m2 = rdd.map(lambda record: (record[1].split("/")[0], record[0]))
post_pairs = m2.filter(lambda method_ip: method_ip[0].startswith("POST"))
pnt = post_pairs.distinct()
pnt.collect()


[('POST', '192.68.1.2'), ('POST', '10.0.0.1'), ('POST', '10.0.0.2')]

In [13]:
# Sample rating records: (record id, doctor id, doctor name, rating).
# NOTE(review): the first field ("p1".."p7") is presumably a patient id; confirm.
data = [
    ("p1", "d1", "Dr. Smith", 4.5),
    ("p2", "d2", "Dr. John", 3.8),
    ("p3", "d1", "Dr. Smith", 4.0),
    ("p4", "d3", "Dr. Alice", 4.9),
    ("p5", "d1", "Dr. Smith", 5.0),
    ("p6", "d2", "Dr. John", 3.5),
    ("p7", "d3", "Dr. Alice", 4.7),
]

In [14]:
# Distribute the rating records as an RDD of tuples.
rdd2= sc.parallelize(data)


In [15]:
rdd2.collect()

[('p1', 'd1', 'Dr. Smith', 4.5),
 ('p2', 'd2', 'Dr. John', 3.8),
 ('p3', 'd1', 'Dr. Smith', 4.0),
 ('p4', 'd3', 'Dr. Alice', 4.9),
 ('p5', 'd1', 'Dr. Smith', 5.0),
 ('p6', 'd2', 'Dr. John', 3.5),
 ('p7', 'd3', 'Dr. Alice', 4.7)]

In [17]:
# Key each rating by its second field (the "d…" id) and carry (rating, 1), so
# one reduce yields both the rating total and the record count per key.
m3 = rdd2.map(lambda record: (record[1], (record[3], 1)))
m4 = m3.reduceByKey(
    lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1])
)

In [18]:

m3.collect()

[('d1', (4.5, 1)),
 ('d2', (3.8, 1)),
 ('d1', (4.0, 1)),
 ('d3', (4.9, 1)),
 ('d1', (5.0, 1)),
 ('d2', (3.5, 1)),
 ('d3', (4.7, 1))]

In [19]:
m4.collect()

[('d1', (13.5, 3)), ('d2', (7.3, 2)), ('d3', (9.600000000000001, 2))]

In [20]:
# Average rating per key = rating total / record count.
m5 = m4.mapValues(lambda total_and_count: total_and_count[0] / total_and_count[1])
m5.collect()

[('d1', 4.5), ('d2', 3.65), ('d3', 4.800000000000001)]

In [5]:
# Sample lines with three comma-separated fields; the third field is a list of
# integer values separated by "|".
# NOTE(review): the fields look like category, product, ratings; confirm.
data= [
    "Electronics,TV,5|4|5", "Electronics,Laptop,4|5", "Furniture,Chair,3|4|5","Furniture,Table,5|4",  
]

In [6]:
# Distribute the raw text lines as an RDD of strings.
rdd=sc.parallelize(data)


In [8]:
rdd.collect()

['Electronics,TV,5|4|5',
 'Electronics,Laptop,4|5',
 'Furniture,Chair,3|4|5',
 'Furniture,Table,5|4']

In [14]:
# Split every line into its comma-separated fields.
m1 = rdd.map(lambda line: line.split(","))
m1.collect()

[['Electronics', 'TV', '5|4|5'],
 ['Electronics', 'Laptop', '4|5'],
 ['Furniture', 'Chair', '3|4|5'],
 ['Furniture', 'Table', '5|4']]

In [17]:
# Keep (first field, list of "|"-separated values from the third field).
m2 = m1.map(lambda fields: (fields[0], fields[2].split("|")))
m2.collect()

[('Electronics', ['5', '4', '5']),
 ('Electronics', ['4', '5']),
 ('Furniture', ['3', '4', '5']),
 ('Furniture', ['5', '4'])]

In [29]:
# One (key, value) pair per list element, with the value converted to int.
flattened = m2.flatMapValues(lambda values: values)
m3 = flattened.map(lambda key_value: (key_value[0], int(key_value[1])))

In [30]:
m3.collect()

[('Electronics', 5),
 ('Electronics', 4),
 ('Electronics', 5),
 ('Electronics', 4),
 ('Electronics', 5),
 ('Furniture', 3),
 ('Furniture', 4),
 ('Furniture', 5),
 ('Furniture', 5),
 ('Furniture', 4)]

In [31]:
# Sum the integer values per key (the name m3 is rebound to the reduced RDD).
m3 = m3.reduceByKey(lambda running_total, value: running_total + value)

In [32]:
m3.collect()

[('Electronics', 23), ('Furniture', 21)]