# Assignment on Map-Reduce

In the following questions, you will solve real problems with the techniques you have learned before. You will be working with data of **google play dataset** which includes the following datasets: 
1. googleplaystore
2. googleplaystore review

*You can find the data files on the archive have attached to this exercise.*

**T** (10pts) Number of applications according to the version of Android(output must be sorted)


> result: 
`<version, count>`

In [68]:
%%writefile nikoo_big_data_hw1.py

from mrjob.job import MRJob,MRStep

class VersionCount(MRJob):
     
    def mapper(self,_, lines):
        for line in lines.split('\n'):
            item = line.split('∑') 
            if item[-1] != "Android Ver":
                yield item[-1],1
            
    def reducer(self, item, counts):
        yield item, sum(counts)
      
        
if __name__ == '__main__':
    VersionCount.run()

Overwriting nikoo_big_data_hw1.py


In [69]:
!python nikoo_big_data_hw1.py googleplaystore.txt -q

"1.0 and up"	2
"1.5 and up"	20
"1.6 and up"	116
"2.0 and up"	32
"2.0.1 and up"	7
"2.1 and up"	134
"2.2 - 7.1.1"	1
"2.2 and up"	244
"2.3 and up"	652
"2.3.3 and up"	281
"3.0 and up"	241
"3.1 and up"	10
"3.2 and up"	36
"4.0 and up"	1375
"4.0.3 - 7.1.1"	2
"4.0.3 and up"	1501
"4.1 - 7.1.1"	1
"4.1 and up"	2451
"4.2 and up"	394
"4.3 and up"	243
"4.4 and up"	980
"4.4W and up"	12
"5.0 - 6.0"	1
"5.0 - 7.1.1"	1
"5.0 - 8.0"	2
"5.0 and up"	600
"5.1 and up"	24
"6.0 and up"	60
"7.0 - 7.1.1"	1
"7.0 and up"	42
"7.1 and up"	3
"8.0 and up"	6
"NaN"	2
"Varies with device"	1362


**T** (10pts) K of the best applications in every category(K should be specified by user)


> result:
`<appname,{other fields} >`

In [72]:
%%writefile nikoo_big_data_hw1.py

from mrjob.job import MRJob,MRStep
#from mrjob.compat import get_jobconf_value
import pandas as pd
from operator import itemgetter

class BestApplications(MRJob):
    k = 3
    def mapper(self,_, lines):
        apps = []        
        for line in lines.split('\n'):
            app = line.split('∑')
            if app[2] == "NaN":
                app[2] = "-1"
            if app[1] != "Category":
                yield app[1],app

            
    def reducer(self, category, apps):        
        sorted_list = sorted(apps, key=itemgetter(2), reverse=True)
        for app in sorted_list[0:self.k]:
            yield app[0],app

    
if __name__ == '__main__':
    BestApplications.run()     

Overwriting nikoo_big_data_hw1.py


In [73]:
!python nikoo_big_data_hw1.py googleplaystore.txt -q

"Spring flowers theme couleurs d t space"	["Spring flowers theme couleurs d t space","ART_AND_DESIGN","5","1","2.9M","100+","Free","0","Everyone","Art & Design","18-Apr-18","1.0.2","4.0 and up"]
"Harley Quinn wallpapers HD"	["Harley Quinn wallpapers HD","ART_AND_DESIGN","4.8","192","6.0M","10,000+","Free","0","Everyone","Art & Design","25-Apr-18","1.5","3.0 and up"]
"Cardi B Wallpaper"	["Cardi B Wallpaper","ART_AND_DESIGN","4.8","253","3.7M","50,000+","Free","0","Everyone","Art & Design","1-Nov-17","1.0.0","4.0 and up"]
"Tickets SDA 2018 and Exam from the State Traffic Safety Inspectorate with Drom.ru"	["Tickets SDA 2018 and Exam from the State Traffic Safety Inspectorate with Drom.ru","AUTO_AND_VEHICLES","4.9","10479","33M","100,000+","Free","0","Everyone","Auto & Vehicles","18-Jul-18","1.7.1","4.0 and up"]
"CDL Practice Test 2018 Edition"	["CDL Practice Test 2018 Edition","AUTO_AND_VEHICLES","4.9","7774","17M","100,000+","Free","0","Everyone","Auto & Vehicles","3-Jul-18","1.7","4.2 a

**T** (20pts) Number of applications in every category according to version of Android(output must be sorted on Count)

> result:
`<category, {count, version} >`

In [5]:
%%writefile nikoo_big_data_hw1.py

from mrjob.job import MRJob,MRStep
#from mrjob.compat import get_jobconf_value
import pandas as pd
from operator import itemgetter

class NumOfApplication(MRJob):

    def steps(self):
        return [
            MRStep(mapper = self.mapper1,
                  reducer = self.reducer1),
            MRStep(reducer = self.reducer_sort)
        ]


    def mapper1(self,_, lines):
        apps = []        
        for line in lines.split('\n'):
            app = line.split('∑')
            yield (app[1], app[-1]), 1

            
    def reducer1(self, category_ver, count):        
        cat, ver = category_ver
        yield cat, (sum(count), ver)

   
    def reducer_sort(self, category, count_version):
        for count, ver in sorted(count_version, reverse=True):
            yield (category, (count, ver))
            
            

if __name__ == '__main__':
    NumOfApplication.run()  

Overwriting nikoo_big_data_hw1.py


In [6]:
!python nikoo_big_data_hw1.py googleplaystore.txt -q

"ART_AND_DESIGN"	[21,"4.1 and up"]
"ART_AND_DESIGN"	[16,"4.0.3 and up"]
"ART_AND_DESIGN"	[8,"2.3 and up"]
"ART_AND_DESIGN"	[7,"4.0 and up"]
"ART_AND_DESIGN"	[4,"4.2 and up"]
"ART_AND_DESIGN"	[3,"4.4 and up"]
"ART_AND_DESIGN"	[2,"Varies with device"]
"ART_AND_DESIGN"	[2,"3.0 and up"]
"ART_AND_DESIGN"	[1,"5.0 and up"]
"ART_AND_DESIGN"	[1,"2.3.3 and up"]
"AUTO_AND_VEHICLES"	[15,"4.1 and up"]
"AUTO_AND_VEHICLES"	[15,"4.0 and up"]
"AUTO_AND_VEHICLES"	[12,"4.0.3 and up"]
"AUTO_AND_VEHICLES"	[8,"4.4 and up"]
"AUTO_AND_VEHICLES"	[7,"Varies with device"]
"AUTO_AND_VEHICLES"	[7,"4.2 and up"]
"AUTO_AND_VEHICLES"	[6,"5.0 and up"]
"AUTO_AND_VEHICLES"	[4,"6.0 and up"]
"AUTO_AND_VEHICLES"	[3,"2.3.3 and up"]
"AUTO_AND_VEHICLES"	[3,"2.3 and up"]
"AUTO_AND_VEHICLES"	[2,"4.3 and up"]
"AUTO_AND_VEHICLES"	[1,"4.4W and up"]
"AUTO_AND_VEHICLES"	[1,"3.2 and up"]
"AUTO_AND_VEHICLES"	[1,"2.2 and up"]
"BEAUTY"	[18,"4.0.3 and up"]
"BEAUTY"	[11,"4.0 and up"]
"BEAUTY"	[9,"4.1 and up"]
"BEAUTY"	[4,"5.0 and up"]
"BEA

**T** (60pts) In the review dataset which words have more occurrence in every application(output must be sorted on Count)

> result: 
`<appname, {count, word1, word2} >`

`hint:` use secondary sort

In [94]:
%%writefile nikoo_big_data_hw1.py

from mrjob.job import MRJob,MRStep
#import pandas as pd
from operator import itemgetter
import re

class NumOfFrequentWords(MRJob):
    
    def steps(self):
        return [
            MRStep(
                mapper=self.mapper1,
                reducer=self.reducer1
            ),

            MRStep(
                reducer=self.reducer_sort
            )
        ]
    
    
    def mapper1(self,_, lines):
        allWords = []
        a = []
        for line in lines.split('\n'):
            app = line.split('∑')
            appName = app[0]
            a.append(appName)
            translated_review = app[1] = re.sub(r'[^\w]', ' ', app[1])
            for word in translated_review.split(' '):
                if word != "":
                    word = word.lower()
                    a.append(word)
            allWords.append(a)
        for i in range(0, len(allWords) ):
            for j in range(1, len(allWords[i]) ):
                yield (allWords[i][0], allWords[i][j]), 1
            
            
            
    def reducer1(self, appName_word, count):
        appName, word = appName_word
        yield appName, (word, sum(count))
        
     
           
    def reducer_sort(self, appName, word_count):
        word_count = sorted(word_count, key=itemgetter(1), reverse=True)  
        #yield appName, word_count
        
        for i in range(0, len(word_count) - 1):
            if word_count[i][1] == word_count[i+1][1]:
                yield appName, (word_count[i][1], word_count[i][0], word_count[i+1][0])
                break
                
         
 
          

if __name__ == '__main__':
    NumOfFrequentWords.run()  

Overwriting nikoo_big_data_hw1.py


In [95]:
!python nikoo_big_data_hw1.py googleplaystore_user_reviews.txt -q

"10 Best Foods for You"	[26,"best","great"]
"104 \u627e\u5de5\u4f5c - \u627e\u5de5\u4f5c \u627e\u6253\u5de5 \u627e\u517c\u8077 \u5c65\u6b77\u5065\u6aa2 \u5c65\u6b77\u8a3a\u7642\u5ba4"	[12,"easy","the"]
"11st"	[11,"and","is"]
"1800 Contacts - Lens Store"	[14,"it","love"]
"1LINE \u2013 One Line with One Touch"	[4,"ads","love"]
"2018Emoji Keyboard \ud83d\ude02 Emoticons Lite -sticker&gif"	[8,"i","nan"]
"21-Day Meditation Experience"	[21,"full","review"]
"2Date Dating App, Love and matching"	[11,"app","dating"]
"2GIS: directory & navigator"	[11,"but","in"]
"2RedBeans"	[4,"app","it"]
"2ndLine - Second Phone Number"	[9,"calls","occurred"]
"30 Day Fitness Challenge - Workout at Home"	[9,"great","nan"]
"365Scores - Live Scores"	[3,"highlights","please"]
"3D Live Neon Weed Launcher"	[1,"best","it"]
"4 in a Row"	[5,"i","it"]
"4K Wallpapers and Ultra HD Backgrounds"	[3,"look","re"]
"591\u623f\u5c4b\u4ea4\u6613-\u79df\u5c4b\u3001\u4e2d\u53e4\u5c4b\u3001\u65b0\u5efa\u6848\u3001\u5be6\u50f9\u767b\u9