In [2]:
from pyspark.sql import SparkSession
import getpass
# Resolve the OS user running the notebook so the Hive warehouse path below is
# per-user instead of being pinned to one account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    # Port '0' lets Spark pick a free UI port, avoiding clashes between
    # sessions on a shared gateway node.
    .config('spark.ui.port', '0')
    # The original f-string had no placeholder and hard-coded a single user's
    # directory, leaving `username` unused; interpolate it here.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the SparkSession object bound to `spark`.
spark

In [4]:
# Read every file under the orders directory as an RDD of raw text lines,
# then peek at the first five records.
orders_glob = '/public/trendytech/retail_db/orders/*'
rdd_orders = spark.sparkContext.textFile(orders_glob)
rdd_orders.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [5]:
# Read every file under the customers directory as an RDD of raw text lines
# and preview five of them.
# NOTE(review): the preview shows a double-quoted address field, so a plain
# split(',') on these lines may mis-parse addresses containing commas - confirm
# before parsing this RDD by position.
customers_glob = '/public/trendytech/retail_db/customers/*'
rdd_customers = spark.sparkContext.textFile(customers_glob)
rdd_customers.take(5)

['1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521',
 '2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126',
 '3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725',
 '4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069',
 '5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,"10 Crystal River Mall ",Caguas,PR,00725']

In [6]:
# Read every file under the order_items directory as an RDD of raw text lines
# and preview five of them.
order_items_glob = '/public/trendytech/retail_db/order_items/*'
rdd_items = spark.sparkContext.textFile(order_items_glob)
rdd_items.take(5)

['1,1,957,1,299.98,299.98',
 '2,2,1073,1,199.99,199.99',
 '3,2,502,5,250.0,50.0',
 '4,2,403,1,129.99,129.99',
 '5,4,897,2,49.98,24.99']

# Question 1: Find the top 10 customers who have spent the most (premium customers)

1. Find the top 10 customers who have spent the most (premium customers).
2. Find the top 10 product IDs with the most quantities sold.
3. How many customers are from Caguas city?
4. Find the top 3 states with the most customers.
5. How many customers have spent more than \$1000 in total?
6. Which state has the most orders in CLOSED status?
7. How many customers are active (active customers are the ones who placed at least one order)?
8. What is the revenue generated by each state, in sorted order?

In [7]:
def _order_key_and_customer(record):
    """Return ``(int(field 0), int(field 2))`` of one comma-separated orders line.

    Presumably field 0 is the order id and field 2 the customer id (the
    retail_db files carry no header) - TODO confirm against the dataset docs.
    """
    fields = record.split(',')
    return (int(fields[0]), int(fields[2]))


mapped_orders = rdd_orders.map(_order_key_and_customer)
mapped_orders.take(5)

[(1, 11599), (2, 256), (3, 12111), (4, 8827), (5, 11318)]

In [8]:
def _order_key_and_amount(record):
    """Return ``(int(field 1), float(field 4))`` of one order_items line.

    Presumably field 1 is the parent order id and field 4 the line subtotal
    (the file has no header) - TODO confirm against the dataset docs.
    """
    fields = record.split(',')
    return (int(fields[1]), float(fields[4]))


mapped_order_item = rdd_items.map(_order_key_and_amount)
mapped_order_item.take(5)

[(1, 299.98), (2, 199.99), (2, 250.0), (2, 129.99), (4, 49.98)]

In [9]:
# Inner-join the two pair RDDs on their key (the first element of each pair).
# Each result is (key, (value from mapped_orders, value from mapped_order_item)).
joined = mapped_orders.join(other=mapped_order_item)
joined.take(5)

[(4, (8827, 49.98)),
 (4, (8827, 299.95)),
 (4, (8827, 150.0)),
 (4, (8827, 199.92)),
 (8, (2911, 179.97))]

In [10]:
# Drop the join key. Every joined record is (key, (a, b)), so its value part is
# already the (a, b) pair wanted here.
mapped_join = joined.values()
mapped_join.take(5)

[(8827, 49.98), (8827, 299.95), (8827, 150.0), (8827, 199.92), (2911, 179.97)]

In [11]:
# Sum the amounts per key, then order the totals from largest to smallest.
spend_per_key = mapped_join.reduceByKey(lambda running, amount: running + amount)
reduced = spend_per_key.sortBy(lambda pair: pair[1], ascending=False)


In [12]:
# Show the first ten (key, total) pairs of `reduced`.
reduced.take(10)

[(791, 10524.169999999996),
 (9371, 9299.029999999999),
 (8766, 9296.14),
 (1657, 9223.71),
 (2641, 9130.92),
 (1288, 9019.11),
 (3710, 9019.099999999999),
 (4249, 8918.85),
 (5654, 8904.95),
 (5624, 8761.98)]

# Question 2 : Consider the Covid19 Dataset

## We need to consider the 2 datasets

**Cases:**
`date,state,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,dataQualityGrade,lastUpdateEt,dateModified,checkTimeEt,death,hospitalized,dateChecked,totalTestsViral,positiveTestsViral,negativeTestsViral,positiveCasesViral,deathConfirmed,deathProbable,fips,positiveIncrease,negativeIncrease,total,totalTestResults,totalTestResultsIncrease,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade`

**States:**
`state,notes,covid19Site,covid19SiteSecondary,covid19SiteTertiary,twitter,covid19SiteOld,name,fips,pui,pum`

1. Find the top 10 states with the highest number of positive cases.
2. Find the total count of people currently in ICU.
3. Find the top 15 states with the maximum number of recoveries.
4. Find the top 3 states with the least number of deaths.
5. Find the total number of people currently hospitalized.
6. List the twitter handle and fips code for the top 15 states with the highest number of total cases.

In [13]:
# Load the COVID cases CSV as an RDD of raw text lines.
covid_cases_path = '/public/trendytech/covid19/cases/covid_dataset_cases.csv'
cases_rdd = spark.sparkContext.textFile(covid_cases_path)

In [14]:
# Load the COVID states CSV as an RDD of raw text lines and preview five rows.
covid_states_path = "/public/trendytech/covid19/states/covid_dataset_states.csv"
states_rdd = spark.sparkContext.textFile(covid_states_path)
states_rdd.take(5)

['HP,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HPCovid,https://arcg.is/0brSGj,null,53,,',
 'AS,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@ASCovid,null,null,6,null,null',
 'HR,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@HRCovid,null,null,9,null,null',
 'KA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@KACovid,null,null,53,null,null',
 'WA,null,https://covid19.hp.gov.in/ ,https://covid-19archive.org/,https://www.azdhs.gov,@WACovid,null,null,44,null,null']

In [15]:
# Preview five raw lines of the cases file to check the comma-separated layout.
cases_rdd.take(5)

['20200122,AP,2,0,48,26,15,18,2,38,10,34,B,18,19/05/2022,23,24,29,34,19,45,5,44,42,49,53,0,0,2,2,0,2,0,0,8f8db794931706272489cddd51e917a4a69c8c9b,0,0,0,0,0',
 '20200123,AP,2,0,48,41,2,20,30,40,5,50,B,1,08/11/2022,14,7,33,36,14,18,36,37,45,8,53,0,0,2,2,0,2,0,0,e16af2a6a8f060355ff5ba499a28309a262c0b1e,0,0,0,0,0',
 '20200124,HP,2,0,16,14,5,29,43,22,11,11,D,31,17/05/2022,10,37,11,25,45,25,2,32,30,41,53,0,0,2,2,0,2,0,0,094154f68e74bfc30b977cdee888f9c07be4360e,0,0,0,0,0',
 '20200125,HP,2,0,10,13,41,50,26,19,34,8,D,40,07/10/2022,32,5,33,9,50,31,18,38,7,16,53,0,0,2,2,0,2,0,0,9b52ca94dd2a996822542ea5f17a7363e7ad91cf,0,0,0,0,0',
 '20200126,AS,2,0,15,43,23,45,20,46,15,30,D,31,28/12/2022,22,14,1,29,2,24,15,12,9,10,53,0,0,2,2,0,2,0,0,7acb526e14f20a29cc74a0b32a37328bc6eac6c2,0,0,0,0,0']

In [16]:
def _state_and_positive(row):
    """Return ``(field 1, int(field 2))`` of one cases line.

    In the Cases column listing these positions are `state` and `positive`.
    """
    cols = row.split(',')
    return (cols[1], int(cols[2]))


cases_positive = cases_rdd.map(_state_and_positive)
cases_positive.take(5)

[('AP', 2), ('AP', 2), ('HP', 2), ('HP', 2), ('AS', 2)]

In [17]:
# Sum the positive counts per state, sort the totals in descending order and
# show the ten largest.
positive_per_state = cases_positive.reduceByKey(lambda acc, count: acc + count)
cases_agg = positive_per_state.sortBy(lambda pair: pair[1], ascending=False)
cases_agg.take(10)

[('WA', 1701),
 ('GA', 1017),
 ('MH', 730),
 ('MI', 61),
 ('CA', 53),
 ('GJ', 35),
 ('BR', 23),
 ('JH', 13),
 ('CG', 8),
 ('RI', 6)]

In [18]:
# Pull field 7 of every cases line (`inIcuCurrently` in the Cases column
# listing) and convert it to an integer.
icu_text = cases_rdd.map(lambda row: row.split(',')[7])
cases_icu = icu_text.map(int)
cases_icu.take(5)

[18, 20, 29, 50, 45]

In [19]:
# Total of the per-row ICU counts.
# RDD.sum() replaces reduce(lambda x, y: x + y): it returns 0 for an empty RDD
# instead of raising, and it matches how the hospitalised total (`hos`) is
# computed elsewhere in this notebook.
icu = cases_icu.sum()
icu

1344

In [20]:
def _state_and_deaths(row):
    """Return ``(field 1, int(field 23))`` of one cases line.

    NOTE(review): in the Cases column listing, position 23 is `deathConfirmed`
    while `death` sits at position 16 - confirm which one the question intends.
    """
    cols = row.split(',')
    return (cols[1], int(cols[23]))


deaths = cases_rdd.map(_state_and_deaths)
deaths.take(5)

[('AP', 42), ('AP', 45), ('HP', 30), ('HP', 7), ('AS', 9)]

In [21]:
# Sum the death counts per state, sort ascending and show the three smallest.
deaths_per_state = deaths.reduceByKey(lambda acc, count: acc + count)
deaths_summed = deaths_per_state.sortBy(lambda pair: pair[1], ascending=True)
deaths_summed.take(3)

[('AS', 9), ('JH', 10), ('CG', 31)]

In [22]:
# Add up field 5 of every cases line (`hospitalizedCurrently` in the Cases
# column listing).
hospitalized_text = cases_rdd.map(lambda row: row.split(',')[5])
hos = hospitalized_text.map(int).sum()
hos

1319

In [23]:
def _state_and_twitter(row):
    """Return ``(field 0, field 5)`` of one states line.

    In the States column listing these positions are `state` and `twitter`.
    NOTE(review): the task also asks for the fips code (position 8 in that
    listing), which is not carried here - confirm whether it should be added.
    """
    cols = row.split(',')
    return (cols[0], cols[5])


state_rdd = states_rdd.map(_state_and_twitter)
state_rdd.take(5)

[('HP', '@HPCovid'),
 ('AS', '@ASCovid'),
 ('HR', '@HRCovid'),
 ('KA', '@KACovid'),
 ('WA', '@WACovid')]

In [24]:
def _state_and_total(row):
    """Return ``(field 1, int(field 28))`` of one cases line.

    In the Cases column listing these positions are `state` and `total`.
    """
    cols = row.split(",")
    return (cols[1], int(cols[28]))


rdd1 = cases_rdd.map(_state_and_total)
rdd1.take(5)

[('AP', 2), ('AP', 2), ('HP', 2), ('HP', 2), ('AS', 2)]

In [25]:
# Collapse the (state, count) pairs into one summed count per state.
case = rdd1.reduceByKey(lambda acc, count: acc + count)
case.take(5)

[('AP', 4), ('HP', 4), ('AS', 2), ('CG', 8), ('BR', 23)]

In [27]:
# Attach each state's twitter field to its summed count by joining on the
# state code. Each result is (state, (count, twitter)).
total_cases = case.join(other=state_rdd)
total_cases.take(5)

[('AS', (2, '@ASCovid')),
 ('GJ', (35, '@GJCovid')),
 ('MH', (730, '@MHCovid')),
 ('HR', (2, '@HRCovid')),
 ('KA', (5, '@KACovid'))]

In [29]:
def _joined_count(entry):
    """Sort key: the count inside a ``(state, (count, twitter))`` record."""
    return entry[1][0]


# Order states by count, largest first, and show the first fifteen.
final_rdd = total_cases.sortBy(_joined_count, ascending=False)
final_rdd.take(15)

[('WA', (2100, '@WACovid')),
 ('GA', (1034, '@GACovid')),
 ('MH', (730, '@MHCovid')),
 ('CA', (515, '@CACovid')),
 ('MI', (61, '@MICovid')),
 ('GJ', (35, '@GJCovid')),
 ('AZ', (34, '@AZCovid')),
 ('BR', (23, '@BRCovid')),
 ('RI', (16, '@RICovid')),
 ('JH', (13, '@JHCovid')),
 ('CG', (8, '@CGCovid')),
 ('KA', (5, '@KACovid')),
 ('HP', (4, '@HPCovid')),
 ('AS', (2, '@ASCovid')),
 ('HR', (2, '@HRCovid'))]

# Question 3: Most frequent words in the student reviews (excluding boring words)


In [30]:
# Load the student reviews file as an RDD of raw text lines and preview five.
reviews_path = '/public/trendytech/reviews/trendytech-student-reviews.csv'
reviews_rdd = spark.sparkContext.textFile(reviews_path)
reviews_rdd.take(5)

['I got to know about this course by recommendation from one of my senior and i would say that the course has Excellent lectures(content) that explored my mind.',
 'The Sumit is very much passionate about teaching Big Data and his style of teaching is very unique and engaging the flow of course content.',
 'Any One can learn from the scratch and do wonders if he follows the course regularly with discipline .I highly recommend this course to everyone who is looking to learn and grow in big data domain.',
 "I was working in a SQL support project and wanted to switch my career to a big data domain. I got to know this course from a friend who took it and benifitted immensely in his career. I  highly recommend the Big Data course at Trendytech as the curriculum is well designed as per current industry demands and includes all the needed topics. Sumit Sir's way of explaining the topics is the best part of this course, he has  deep knowledge of the subject which makes the program worthwhile."

In [46]:
# Lower-case each review and break it into space-separated tokens, one token
# per RDD element.
# NOTE(review): splitting on a single space keeps empty strings and tokens with
# punctuation attached (e.g. 'data.'), both visible in the later word counts -
# confirm whether that is acceptable.
reviews_lower = reviews_rdd.flatMap(lambda review: review.lower().split(' '))
reviews_lower.take(5)

['i', 'got', 'to', 'know', 'about']

In [50]:
# Word count: pair each token with 1, sum the ones per token, then sort the
# totals from most to least frequent.
token_ones = reviews_lower.map(lambda token: (token, 1))
token_totals = token_ones.reduceByKey(lambda acc, one: acc + one)
final_reviews = token_totals.sortBy(lambda pair: pair[1], ascending=False)
final_reviews.take(10)

[('the', 427),
 ('and', 326),
 ('to', 319),
 ('is', 223),
 ('course', 217),
 ('i', 215),
 ('big', 213),
 ('data', 201),
 ('of', 182),
 ('in', 167)]

In [41]:
# Load the "boring words" file as an RDD of text lines and preview five entries.
boring_words_path = '/user/itv009033/landing/boringwords.txt'
boring_rdd = spark.sparkContext.textFile(boring_words_path)
boring_rdd.take(5)

['shouldnt', 'worrying', 'simplify', 'tidy', 'shouldnt']

In [43]:
# Ship the boring-word list to every executor once as a broadcast variable.
# It is broadcast as a set rather than the raw collected list, so the
# per-record `not in broadcast_bw.value` test is a hash lookup instead of a
# linear scan; this also drops duplicate entries. Membership results are
# unchanged.
broadcast_bw = spark.sparkContext.broadcast(set(boring_rdd.collect()))

In [51]:
def _is_not_boring(word_count):
    """True when the word of a ``(word, count)`` pair is not a broadcast boring word."""
    # Read .value inside the function so executors use the broadcast copy.
    return word_count[0] not in broadcast_bw.value


filtered = final_reviews.filter(_is_not_boring)
filtered.take(5)

[('data', 201), ('sumit', 109), ('trendytech', 67), ('', 64), ('data.', 34)]

In [52]:
# Sum counts per word again, sort by count descending, and bring the whole
# result back to the driver.
# NOTE(review): `filtered` comes from `final_reviews`, which is already reduced
# per word and sorted, so this second reduce/sort looks redundant - confirm.
word_totals = filtered.reduceByKey(lambda acc, count: acc + count)
final_result = word_totals.sortBy(lambda pair: pair[1], ascending=False)
final_result.collect()

[('data', 201),
 ('sumit', 109),
 ('trendytech', 67),
 ('', 64),
 ('data.', 34),
 ('course.', 33),
 ("sir's", 23),
 ('trendy', 14),
 ('course,', 13),
 ("master's", 13),
 ('domain.', 12),
 ("trendytech's", 12),
 ('sir.', 11),
 ('program.', 9),
 ('concepts.', 9),
 ('field.', 9),
 ('hands-on', 8),
 ('fresher', 8),
 ('amazing.', 8),
 ('career.', 7),
 ('well-structured', 7),
 ('interviews.', 7),
 ('trendytech.', 7),
 ('way.', 7),
 ('.', 7),
 ('overall,', 7),
 ("i'm", 7),
 ('-', 7),
 ('depth.', 7),
 ('passionate', 6),
 ('it.', 6),
 ('slack', 6),
 ('bigdata', 6),
 ('engineering.', 6),
 ('doubts', 6),
 ('journey.', 6),
 ('data,', 6),
 ('awesome.', 5),
 ('confidence.', 5),
 ('knowledge.', 5),
 ('real-world', 5),
 ("don't", 5),
 ('support.', 5),
 ('team.', 5),
 ('tech.', 5),
 ('experience.', 5),
 ('pipeline.', 4),
 ('grabbed', 4),
 ('in-depth', 4),
 ('now.', 4),
 ('*', 4),
 ('easily.', 4),
 ('mock', 4),
 ('good.', 4),
 ('excellently', 4),
 ('enrolling', 4),
 ('structured.', 4),
 ('end-to-end', 4