# Notes, Links, Code Snippets During Common Crawl Data Processing

In [98]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


### Links

Example Repo: https://github.com/commoncrawl/cc-pyspark

Common Crawl Format Example: https://gist.github.com/Smerity/e750f0ef0ab9aa366558#file-bbc-warc

Implementing a Search Engine with Ranking in Python: http://aakashjapi.com/fuckin-search-engines-how-do-they-work/


### Scripts

In [None]:
# Run extract_keyword.py from the shell
$ cd data/ad-free-search-engine
$ python extract_keyword.py input/test_wat.txt output

# Activate the virtual environment (run `deactivate` to leave it again)
$ cd data/ad-free-search-engine/
$ . venv/bin/activate
$ deactivate 

# Point the environment variable SPARK_HOME to your Spark installation
$ export SPARK_HOME="/Users/lxu213/spark/"

# Submit the example job to Spark
# NOTE(review): the `\ ` sequences below look like multi-line continuations that were joined onto one line; confirm before copy-pasting
$ $SPARK_HOME/bin/spark-submit ./server_count.py \ --num_output_partitions 1 --log_level WARN \ ./input/test_warc.txt servernames

# readWARC: assuming the AWS command line tools are installed, list the contents of a crawl with:
$ aws s3 ls s3://commoncrawl/crawl-data/CC-MAIN-2014-10/ --recursive | head -6
    
# Copy one segment to the current local directory with:
$ aws s3 cp s3://commoncrawl/crawl-data/CC-MAIN-2014-10/segments/1394023864559/warc/CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz .

### Extract Keywords Python Function

The keyword extractor inherits from `CCSparkJob` and can run locally. Total number of records in the warc.gz: 138,865.

In [None]:
# Load the extracted-keyword parquet part file and convert it to a pandas DataFrame.
kw_path = '/Users/lxu213/data/ad-free-search-engine/spark-warehouse/output_features/part-00000-34ccb7a1-4cbe-413d-bb91-165ea931b1f8-c000.snappy.parquet'
kw_table = pq.read_table(kw_path, nthreads=4)
data = kw_table.to_pandas()
# Distinct values found in the 'section' column.
data['section'].unique()

# Boolean-mask filtering snippets.
# NOTE(review): `trips` and `query` are not defined in the visible part of this notebook.

# Every comparison combined with `&` needs its own parentheses: `&` binds tighter
# than `!=`, so without them the mask is evaluated as `(isin_mask & trips['PRCP']) != 0`.
trips_wkd_rain = trips.loc[trips['DoW'].isin([5, 6]) & (trips['PRCP'] != 0)]

# url/description of rows whose keyword is one of the space-separated query terms.
data[['url', 'description']].loc[data['keywords'].isin(query.split(' '))]
data[['url', 'description']].loc[data['keywords'].isin(['lily', 'cyrus'])]

# First ten rows for a single keyword, and just the urls for that keyword.
kw_data = data[['url', 'title', 'description']].loc[data['keywords'] == 'cities'][:10]
data['url'].loc[data['keywords'] == 'cities']

# Print the keyword of the first ten rows. print(...) with a single argument
# behaves the same on Python 2 and Python 3, unlike the bare print statement.
for keyword in data['keywords'].head(10):
    print(keyword)

# Collect the 'url' field of every entry in the 'val' column, in row order.
# NOTE(review): `warc` is only loaded by a later cell, so that cell must have run first.
val = warc['val']
url = [entry['url'] for entry in val]
    
# Summary statistics with and without pages that contain detected ad links.
# NOTE(review): `data_adfree` is not defined in the visible part of this notebook.
data['description'].unique()
# with adwords in links
data.describe()
# without adwords in links > removed about 25% of web pages
data_adfree.describe()

# Percent of crawled web pages that contain (detected) ad links:   # ~25-26%
# The float literal forces true division; with integer operands Python 2 would
# floor the ratio before it is subtracted from 100.
100 - (100.0 * data_adfree.count()['url'] / data.count()['url'])

### Rank by tf-idf

In [160]:
# Process WARC
PQPATH='/Users/lxu213/data/ad-free-search-engine/spark-warehouse/add_count/part-00000-6d70eeb9-5f8f-450b-ad70-f431c336e72d-c000.snappy.parquet'
warc = pq.read_table(PQPATH, nthreads=4).to_pandas()
# pq.ParquetFile(PQPATH).metadata

In [161]:
# Score every row of `warc` with tf-idf:
#   tf  = the occurrence count stored in that row's val['count']
#   idf = log(total documents / number of rows sharing the row's keyword)

# NOTE(review): dividing the row count by 10 presumably assumes ten keyword rows
# per document; confirm. Floor division keeps the count an integer on both
# Python 2 and Python 3.
tot_doc = len(warc) // 10       # total documents

# Count rows per keyword once and look the count up for each row, instead of
# re-filtering the whole frame for every row.
docs_w_term = warc['keywords'].map(warc['keywords'].value_counts())
term_count = pd.Series([entry['count'] for entry in warc['val']], index=warc.index)

# float() forces true division: under Python 2, int / int would floor the
# ratio before the logarithm is taken and distort the idf term.
warc['tf-idf'] = term_count * np.log(float(tot_doc) / docs_w_term)
    

In [188]:
# pyarrow and fastparquet cannot handle nested dict in list
from fastparquet import write

# Destination file; assigned before the writers below so the cell runs top to bottom.
output_path = '/Users/lxu213/data/ad-free-search-engine/spark-warehouse/add_count/tf_idf.parquet'

# NOTE(review): every column is renamed to its own name; presumably this normalises
# the column-name type (unicode -> str on Python 2) for fastparquet. Confirm.
warc = warc.rename(columns={'keywords': 'keywords', 'val': 'val', 'tf-idf': 'tf-idf'})
type(warc.columns[0])
warc.to_parquet(output_path, engine='fastparquet', compression='snappy')

# check if kw_data[50] is dict not string
PQPATH='/Users/lxu213/data/ad-free-search-engine/spark-warehouse/add_count/tf_idf.parquet'
new = pq.read_table(PQPATH, nthreads=4).to_pandas()
kw_data = new['val'].loc[new['keywords'].isin(['dog'])]
type(kw_data[50])

# Same table written through fastparquet directly. file_scheme is passed by keyword:
# the third positional parameter of write() is row_group_offsets, not file_scheme.
# NOTE(review): 'simple' (one file) is assumed because the path is read back as a single file.
write(output_path, warc, file_scheme='simple')

str

In [212]:
# json.loads parses each JSON-encoded string in kw_data into a dict
import json

kw_dict = [json.loads(raw) for raw in kw_data]

{u'count': 10,
 u'description': u'None',
 u'title': u'Video - Cincinnati News, FOX19-WXIX TV',
 u'url': u'http://www.fox19.com/category/240225/video-landing-page?clipId=8475230&autostart=true'}

In [148]:
# Rows for the keyword 'dog', highest tf-idf first.
(
    new[new['keywords'].eq('dog')]
    .sort_values(by='tf-idf', ascending=False)
)


Unnamed: 0,keywords,val,tf-idf
3862,dog,"{""url"": ""https://www.bestofdog.com/collections...",122.680971
2122,dog,"{""url"": ""http://www.fox19.com/category/240225/...",47.184989
50,dog,"{""url"": ""http://almosthomeohio.org/in-memory/l...",33.029492
2698,dog,"{""url"": ""http://www.miragepetproducts.com/Brig...",14.155497


In [219]:
# Reload the saved tf-idf table and list the 'val' entries for 'dog', best score first.
PQPATH = '/Users/lxu213/data/ad-free-search-engine/spark-warehouse/add_count/tf_idf.parquet'
data = pq.read_table(PQPATH, nthreads=4).to_pandas()
kw_data = data.loc[data['keywords'].isin(['dog']), ['val', 'tf-idf']]
kw_data.sort_values(by='tf-idf', ascending=False)['val']

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4459    False
4460    False
4461    False
4462    False
4463    False
4464    False
4465    False
4466    False
4467    False
4468    False
4469    False
4470    False
4471    False
4472    False
4473    False
4474    False
4475    False
4476    False
4477    False
4478    False
4479    False
4480    False
4481    False
4482    False
4483    False
4484    False
4485    False
4486    False
4487    False
4488    False
Name: keywords, Length: 4489, dtype: bool