# Analyzing Wikipedia Pages

Preparing for analysis.

In [26]:
import os

file_names = os.listdir('wiki')
print(len(file_names))

with open(os.path.join('wiki', file_names[0])) as f:
    lines_of_all_files = [line for line in f.readlines()]

for line in lines_of_all_files[:20]:
    print(line)

999
<!DOCTYPE html>

<html class="client-nojs" lang="en" dir="ltr">

<head>

<meta charset="UTF-8"/>

<title>Bay of Concepción - Wikipedia</title>

<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>

<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy",

# Adding MapReduce

Count total number of lines in all files with MapReduce.

In [2]:
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    chunks = make_chunks(data, num_processes)
    pool = Pool(num_processes)
    chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)


# Count the total number of lines in all files

folder_name = 'wiki'

def map_total_lines(data):
    total_lines_in_chunk = 0
    for file_name in data:
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line for line in f.readlines()]
        total_lines_in_chunk += len(lines)
    return total_lines_in_chunk
        
def reduce_total_lines(chunk1, chunk2):
    return chunk1 + chunk2

total_lines = map_reduce(file_names, 4, map_total_lines, reduce_total_lines)
print(total_lines)

499797


# Searching for the value with MapReduce

Creating a dictionary where the keys are the file names and the values are lists with all line indexes that contain the given string.

In [3]:
from functools import partial

def map_reduce(data, num_processes, mapper, reducer, value = None):
    chunks = make_chunks(data, num_processes)
    pool = Pool(num_processes)
    chunk_results = pool.map(partial(mapper, value=value), chunks)
    return functools.reduce(reducer, chunk_results)

In [4]:
import re

def map_grep(data, value):
    grep_dict = {}
    for file_name in data:
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line for line in f.readlines()]
#             print(lines)
        for line_index in range(len(lines)):
            if re.search(value, lines[line_index]):
                if file_name not in grep_dict:
                    grep_dict[file_name] = []
                else:
                    grep_dict[file_name].append(line_index)
    return grep_dict

def reduce_grep(dict1, dict2):
    for file_name in dict1:
        if file_name not in dict2:
            dict2[file_name] = dict1[file_name]
        else:
            dict2[file_name] = dict2[file_name] + dict1[file_name]
    return dict2
    
    
grep_dict1 = map_reduce(file_names, 4, map_grep, reduce_grep, 'data')
print(grep_dict1)

{'Saravan_Gilan.html': [58, 60, 70, 74, 127, 847, 850, 852, 901, 984, 1001], 'Agaritine_gammaglutamyltransferase.html': [350, 352, 356, 406, 489, 506], 'Baadj.html': [56, 58, 68, 72, 109, 116, 120, 128, 132, 139, 160, 188, 414, 416, 465, 548, 565], 'George_Ellicott.html': [96, 108, 110, 159, 242, 259], 'Cortile_del_Belvedere.html': [45, 71, 83, 90, 99, 127, 138, 158, 186, 420, 503, 528], 'Mick_Kelly_(Australian_footballer).html': [221, 238], 'Somarasampettai.html': [52, 88, 119, 126, 209, 226, 253], 'PlzeC588_Zoo.html': [44, 48, 95, 158, 160, 209, 292, 317], '2007E2809308_Huddersfield_Town_A.F.C._season.html': [101, 102, 104, 105, 106, 107, 108, 117, 118, 119, 121, 123, 125, 134, 135, 136, 138, 140, 142, 212, 218, 224, 230, 236, 242, 248, 254, 260, 266, 272, 278, 295, 301, 307, 313, 319, 325, 331, 337, 343, 349, 355, 361, 405, 411, 417, 423, 429, 435, 441, 447, 453, 459, 465, 471, 477, 483, 489, 506, 512, 518, 524, 530, 536, 542, 548, 554, 560, 566, 572, 578, 584, 590, 609, 615, 621, 6

Now improving our map function by making it case insensitive.

In [5]:
def map_grep_insensitive_case(data, value):
    grep_dict = {}
    for file_name in data:
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line for line in f.readlines()]
        for line_index in range(len(lines)):
# changing only next line
            if re.search(value.lower(), lines[line_index].lower()):
#                 print(lines[i].lower())
                if file_name not in grep_dict:
                    grep_dict[file_name] = []
                grep_dict[file_name].append(line_index)
    return grep_dict
    
grep_dict2 = map_reduce(file_names, 4, map_grep_insensitive_case, reduce_grep, 'data')
print(grep_dict2)

{'Saravan_Gilan.html': [6, 58, 60, 70, 74, 112, 127, 847, 850, 852, 901, 984, 1001], 'Agaritine_gammaglutamyltransferase.html': [59, 86, 87, 350, 352, 356, 406, 489, 506], 'Baadj.html': [6, 56, 58, 68, 72, 109, 116, 120, 128, 132, 139, 160, 188, 414, 416, 465, 548, 565], 'George_Ellicott.html': [94, 96, 108, 110, 159, 242, 259], 'Cortile_del_Belvedere.html': [6, 45, 71, 83, 90, 99, 127, 138, 158, 186, 420, 503, 528], 'Mick_Kelly_(Australian_footballer).html': [138, 221, 238], 'Somarasampettai.html': [6, 52, 88, 119, 126, 209, 226, 253], 'PlzeC588_Zoo.html': [6, 44, 48, 95, 158, 160, 209, 292, 317], '2007E2809308_Huddersfield_Town_A.F.C._season.html': [100, 101, 102, 104, 105, 106, 107, 108, 117, 118, 119, 121, 123, 125, 134, 135, 136, 138, 140, 142, 212, 218, 224, 230, 236, 242, 248, 254, 260, 266, 272, 278, 295, 301, 307, 313, 319, 325, 331, 337, 343, 349, 355, 361, 405, 411, 417, 423, 429, 435, 441, 447, 453, 459, 465, 471, 477, 483, 489, 506, 512, 518, 524, 530, 536, 542, 548, 554, 

Let's check quantity of values that we found by first and second search.

In [6]:
def map_count_values(grep_dict1, grep_dict2):
    for file_name in grep_dict2:
        if file_name not in grep_dict1:
            print(f'New matches: {len(grep_dict2[file_name])} in {file_name}')
        elif len(grep_dict2[file_name]) > len(grep_dict1[file_name]):
            print(f'New matches: {len(grep_dict2[file_name]) - len(grep_dict1[file_name])} in {file_name}')

map_count_values(grep_dict1, grep_dict2)

New matches: 2 in Saravan_Gilan.html
New matches: 3 in Agaritine_gammaglutamyltransferase.html
New matches: 1 in Baadj.html
New matches: 1 in George_Ellicott.html
New matches: 1 in Cortile_del_Belvedere.html
New matches: 1 in Mick_Kelly_(Australian_footballer).html
New matches: 1 in Somarasampettai.html
New matches: 1 in PlzeC588_Zoo.html
New matches: 1 in 2007E2809308_Huddersfield_Town_A.F.C._season.html
New matches: 1 in David_Jesson.html
New matches: 2 in Nuno_Leal_Maia.html
New matches: 1 in Isaac_GrC3BCnewald.html
New matches: 2 in Battle_of_Wattignies.html
New matches: 1 in SBOA_School_26_Junior_College.html
New matches: 2 in Colchester_Village_Historic_District.html
New matches: 1 in Mohamed_ElSayed.html
New matches: 2 in Hayateumi_Hidehito.html
New matches: 1 in Oliver_Twist_(1912_American_film).html
New matches: 1 in Omar_Onsi.html
New matches: 8 in List_of_people_from_Bangor_Maine.html
New matches: 1 in Bijou_California.html
New matches: 1 in Volume_One_(The_West_Coast_Pop_Ar

# Finding Match Positions on Lines

In each pair in dictionary, the first number should be the line index and the second number should be the index on the line (first letter index, where the match starts).

Two ways of implementation:
* `map_grep_inner_index1`
example of result:

{
**'Saravan_Gilan.html': {6: (416, 420), 58: (540, 544)}, 'Agaritine_gammaglutamyltransferase.html': {59: (65, 69), 86: (73, 77)}**
}

* `map_grep_inner_index2`
example of result:

{
**'Saravan_Gilan.html': \[(6, 416), (6, 420), (58, 540), (58, 544)], 'Agaritine_gammaglutamyltransferase.html': [(59, 65), (59, 69), (86, 73)]**
}

In [7]:
def map_grep_inner_index1(data, value):
    grep_dict = {}
    for file_name in data:
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line for line in f.readlines()]
        for line_index in range(len(lines)):
            if re.search(value.lower(), lines[line_index].lower()):
                match = re.search(value.lower(), lines[line_index].lower())
                if file_name not in grep_dict:
                    grep_dict[file_name] = {}
                grep_dict[file_name].update({line_index : match.span()})
    return grep_dict

def reduce_grep(dict1, dict2):
    for file_name in dict1:
        if file_name not in dict2:
            dict2[file_name] = dict1[file_name]
    return dict2
    
    
grep_dict3 = map_reduce(file_names, 4, map_grep_inner_index1, reduce_grep, 'data')
print(grep_dict3)

{'Saravan_Gilan.html': {6: (416, 420), 58: (540, 544), 60: (382, 386), 70: (371, 375), 74: (391, 395), 112: (448, 452), 127: (516, 520), 847: (490, 494), 850: (18, 22), 852: (376, 380), 901: (40, 44), 984: (1007, 1011), 1001: (125, 129)}, 'Agaritine_gammaglutamyltransferase.html': {59: (65, 69), 86: (73, 77), 87: (423, 427), 350: (24, 28), 352: (526, 530), 356: (18, 22), 406: (40, 44), 489: (1091, 1095), 506: (125, 129)}, 'Baadj.html': {6: (486, 490), 56: (563, 567), 58: (378, 382), 68: (353, 357), 72: (409, 413), 109: (398, 402), 116: (426, 430), 120: (338, 342), 128: (333, 337), 132: (404, 408), 139: (338, 342), 160: (491, 495), 188: (366, 370), 414: (18, 22), 416: (486, 490), 465: (40, 44), 548: (971, 975), 565: (125, 129)}, 'George_Ellicott.html': {94: (18, 22), 96: (425, 429), 108: (18, 22), 110: (682, 686), 159: (40, 44), 242: (1011, 1015), 259: (124, 128)}, 'Cortile_del_Belvedere.html': {6: (525, 529), 45: (362, 366), 71: (927, 931), 83: (771, 775), 90: (771, 775), 99: (501, 505

In [8]:
def map_grep_inner_index2(data, value):
    grep_dict = {}
    for file_name in data:
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line for line in f.readlines()]
        for line_index in range(len(lines)):
            if re.search(value.lower(), lines[line_index].lower()):
                if file_name not in grep_dict:
                    grep_dict[file_name] = []
                match = re.search(value.lower(), lines[line_index].lower())
                for m in match.span():
                    grep_dict[file_name].append((line_index, m))
    return grep_dict
    
    
grep_dict4 = map_reduce(file_names, 4, map_grep_inner_index2, reduce_grep, 'data')
print(grep_dict4)

{'Saravan_Gilan.html': [(6, 416), (6, 420), (58, 540), (58, 544), (60, 382), (60, 386), (70, 371), (70, 375), (74, 391), (74, 395), (112, 448), (112, 452), (127, 516), (127, 520), (847, 490), (847, 494), (850, 18), (850, 22), (852, 376), (852, 380), (901, 40), (901, 44), (984, 1007), (984, 1011), (1001, 125), (1001, 129)], 'Agaritine_gammaglutamyltransferase.html': [(59, 65), (59, 69), (86, 73), (86, 77), (87, 423), (87, 427), (350, 24), (350, 28), (352, 526), (352, 530), (356, 18), (356, 22), (406, 40), (406, 44), (489, 1091), (489, 1095), (506, 125), (506, 129)], 'Baadj.html': [(6, 486), (6, 490), (56, 563), (56, 567), (58, 378), (58, 382), (68, 353), (68, 357), (72, 409), (72, 413), (109, 398), (109, 402), (116, 426), (116, 430), (120, 338), (120, 342), (128, 333), (128, 337), (132, 404), (132, 408), (139, 338), (139, 342), (160, 491), (160, 495), (188, 366), (188, 370), (414, 18), (414, 22), (416, 486), (416, 490), (465, 40), (465, 44), (548, 971), (548, 975), (565, 125), (565, 129

# Displaying the Results

Let's write new map and reduce functions to put the results of grep into a CSV file.

In [31]:
import csv
import re
import pandas as pd

def map_grep_into_table(data, value):
    chunk_of_matches = []
    for file_name in data:
        #read input csv
        with open(os.path.join(folder_name, file_name)) as f:
            lines = [line.lower() for line in f.readlines()]
            # search for value in each line of each file
            for line_index in range(len(lines)):
                match = re.search(value.lower(), lines[line_index])
                if match:
                    new_line = []
                    new_line.append(file_name)
                    new_line.append(line_index)
                    new_line.append(match.start())

                    context_start = max(match.start() - 30, 0)
                    context_end = match.end() + 31
                    new_line.append(lines[line_index][context_start:context_end])
                    
                    chunk_of_matches.append(new_line)
    return chunk_of_matches                       

def reduce_grep_into_table(list_of_lists1, list_of_lists2):
    grep_results = list_of_lists1 + list_of_lists2
    return grep_results

#open new csv for writing grep results
with open('new_file.csv', 'w', newline='') as new_file:
    header = ['File', 'Line', 'Start index', 'Context']
    writer = csv.writer(new_file)
    writer.writerow(header)
        
    grep_list_of_lists = map_reduce(file_names, 4, map_grep_into_table, reduce_grep_into_table, value = 'science')  
        
    writer.writerows(grep_list_of_lists)

grep_results_df = pd.read_csv('new_file.csv')
print(grep_results_df.head())

                          File  Line  Start index  \
0          Valentin_Yanin.html     6          840   
1          Valentin_Yanin.html    66           90   
2          Valentin_Yanin.html   144         1440   
3  William_Harvey_Lillard.html    80          166   
4       Victor_S._Mamatey.html    48          682   

                                             Context  
0  embers of the ussr academy of sciences","full ...  
1  href="/wiki/soviet_academy_of_sciences" class=...  
2  embers_of_the_ussr_academy_of_sciences" title=...  
3   - palmer, d.d. (1910) <i>the science, art and...  
4  /franklin_college_of_arts_and_sciences" title=...  


# Conclusion

Now, using the last `map_reduce` function we can find a target word, its context (30 symbols before and after) and its "address", including the file name, the line index and then its starting index in the line.

In this guided project we've implemented a **MapReduce grep algorithm that locates all matches of a given string within all files in a given folder**.