In [9]:
import numpy as np
import pandas as pd 
import re
from collections import Counter 
import itertools
import matplotlib as plt
import csv
import string

# Task 2: Cleaning
## 2.1: Cleaning the data
A collection of functions used for cleaning the data according to the milestone description. 

In [2]:
#Reads data
def readData(path, size):
    """Read a CSV file in chunks, clean every chunk and return them as one DataFrame.

    Parameters
    ----------
    path : str
        Location of the comma-separated input file.
    size : int
        Number of rows per chunk handed to ``pd.read_csv(chunksize=...)``.

    Returns
    -------
    pandas.DataFrame
        The cleaned chunks concatenated in file order.

    Malformed rows are skipped (with a warning) instead of aborting the read.
    """
    readOptions = dict(sep=',', index_col=False, chunksize=size)
    try:
        # pandas >= 1.3 spells "warn and skip malformed rows" as on_bad_lines;
        # the old error_bad_lines flag no longer exists in pandas 2.x.
        reader = pd.read_csv(path, on_bad_lines='warn', **readOptions)
    except TypeError:
        # Older pandas rejects the unknown keyword; fall back to the legacy flag.
        reader = pd.read_csv(path, error_bad_lines=False, **readOptions)

    chunklist = []
    for i, chunk in enumerate(reader, start=1):
        # cleaner mutates the chunk in place and hands the same frame back.
        chunklist.append(cleaner(chunk))
        print("Chunk cleaned", i)
    return pd.concat(chunklist)


#Function to find and replace URLs with <URL>
urlPattern = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
def swapUrl(line):
    """Return ``line`` with every match of ``urlPattern`` replaced by ' <URL> '."""
    placeholder = ' <URL> '
    return re.sub(urlPattern, placeholder, line)

#Function to find and replace dates with <DATE>
# Lower-case month names and common abbreviations, bounded on both sides so
# that ordinary words ("dogs", "apples", "summary") are no longer mistaken for
# months. cleaner() lower-cases the text before calling swapDates.
_MONTH = (r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
          r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b')
# Optional ordinal suffix on the day number: 1st, 2nd, 3rd, 4th ...
_ORDINAL = r'(?:st|nd|rd|th)?'

# "5 march 2019", "1st mar. 19" -- day, month, optional year.
re1 = re.compile(r'\d{1,2}' + _ORDINAL + r' ' + _MONTH + r',?\.? (?:\d{2,4})?')
# "march 5, 2019", "mar. 5th 19" -- month, day, optional year.
re2 = re.compile(_MONTH + r',?\.? \d{1,2}' + _ORDINAL + r',? (?:\d{2,4})?')
# "march 5", "march 2019" -- month followed by a bare number.
re3 = re.compile(_MONTH + r' \d{1,2},?\.?(?:\d{2,4})?')
# Purely numeric dates with '-', '/', ' ' or '.' as the separator.
re4 = re.compile(r'\d{1,2}-\d{1,2}-\d{2,4}')
re5 = re.compile(r'\d{1,2}/\d{1,2}/\d{2,4}')
re6 = re.compile(r'\d{1,2} \d{1,2} \d{2,4}')
# The dots are escaped: an unescaped '.' matches any character, which made the
# previous version of this pattern fire on text such as "3ab 10 2020".
re7 = re.compile(r'\d{1,2}\.\d{1,2}\.\d{2,4}')
finReg = [re1, re2, re3, re4, re5, re6, re7]
def swapDates(line):
    """Return ``line`` with every recognised date replaced by ' <DATE> '.

    The patterns in ``finReg`` are applied one after another, so a later
    pattern sees the text produced by the earlier substitutions. Month names
    are matched in lower case only.
    """
    for reg in finReg:
        line = reg.sub(' <DATE> ', line)
    return line

#Function to find and replace numbers with <NUM>
pattern = r'[\d]+[,]?([\d]+)?'
def swapNumb(line):
    """Return ``line`` with every run of digits (optionally containing one comma) replaced by ' <NUM> '."""
    placeholder = ' <NUM> '
    return re.sub(pattern, placeholder, line)

#Helper: normalises the text of a single article
def _cleanContent(text):
    """Lower-case ``text``, collapse whitespace runs and swap URLs, dates and numbers for placeholders.

    Values that are not strings (for example NaN for a missing article body)
    are returned unchanged instead of raising.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'\s+', ' ', text.lower())
    # Same order as before: URLs first, then dates, then any remaining numbers.
    return swapNumb(swapDates(swapUrl(text)))

#Main function for cleaning the data
def cleaner(rawData):
    """Clean one chunk of the raw news data in place and return it.

    Steps:
      1. Drop every row whose ``id`` is not made up of digits only.
      2. Normalise ``content`` with ``_cleanContent``.
      3. Replace the empty keyword list ``"['']"`` in ``meta_keywords`` by NaN.

    Parameters
    ----------
    rawData : pandas.DataFrame
        Must provide the columns ``id``, ``content`` and ``meta_keywords``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.
    """
    #Removing rows without articleID
    hasNumericId = np.array([str(value).isdigit() for value in rawData['id']], dtype=bool)
    rawData.drop(index=rawData.index[~hasNumericId], inplace=True)

    #Tokenizes the content & change urls, date n numb.
    # The result is written back as a whole column: values assigned to the
    # rows yielded by iterrows() are copies and never reach the DataFrame.
    rawData['content'] = rawData['content'].map(_cleanContent)

    #Reformats the row where meta_keywords are empty
    isEmptyKeywordList = rawData['meta_keywords'] == "['']"
    rawData['meta_keywords'] = rawData['meta_keywords'].mask(isEmptyKeywordList, np.nan)
    return rawData


## 2.1: Structuring the data and creating .csv files.
Below is a block of code which loads the set of 1 million news articles, structures it according to our database schema and takes care of any type errors and the like. There is a lot of code, but most of it is pretty much the same, only with different values and pairing up different IDs. 

In [10]:
df = readData('news_sample.csv', 20000)

def _splitListCell(cell):
    """Lower-case the text form of ``cell``, strip the square brackets and split it on ', '."""
    return str(cell).lower().replace('[', '').replace(']', '').split(', ')

# Some further cleaning
# Every entry of every row, flattened into one list per column (duplicates kept).
allTags = [entry for cell in df['tags'] for entry in _splitListCell(cell)]
allMeta = [entry for cell in df['meta_keywords'] for entry in _splitListCell(cell)]
allAuthors = [entry for cell in df['authors'] for entry in _splitListCell(cell)]

# dict.fromkeys removes duplicates while keeping the first-seen order.
authorList = list(dict.fromkeys(allAuthors))
metaList = list(dict.fromkeys(allMeta))
tagList = list(dict.fromkeys(allTags))

# Making dictionaries for tags, authors and meta keywords.
# IDs count up from 1; the text 'nan' (a missing cell) is pinned to ID 0.
tagDict = {tag: number for number, tag in enumerate(tagList, start=1)}
tagDict['nan'] = 0
metaDict = {keyword: number for number, keyword in enumerate(metaList, start=1)}
metaDict['nan'] = 0
authorDict = {author: number for number, author in enumerate(authorList, start=1)}
authorDict['nan'] = 0

articleTagList = []
metaKeyList = []
authorIdList = []

def _linkRows(cell, lookup, idColumn, articleId):
    """Build the link rows pairing one article with the IDs of the entries in ``cell``.

    A float cell (NaN, i.e. nothing recorded) yields a single row with ID 0.
    Otherwise the cell text is lower-cased, split on ', ', stripped of square
    brackets and every piece is looked up in ``lookup``.
    """
    if isinstance(cell, float):
        return [{idColumn: 0, 'articleID': articleId}]
    return [
        {idColumn: lookup[piece.replace('[', '').replace(']', '')], 'articleID': articleId}
        for piece in cell.lower().split(', ')
    ]

# Making dictionaries by using IDs to pair too articleID.
# The dictionaries also gets written as csv files for the database.
for articleId, article_tags, meta_keys, author_s in zip(
        df['id'], df['tags'], df['meta_keywords'], df['authors']):
    # If there are no tags, tagID = 0
    articleTagList.extend(_linkRows(article_tags, tagDict, 'tagID', articleId))
    # If there are no meta keyword, meta_keyID = 0
    metaKeyList.extend(_linkRows(meta_keys, metaDict, 'meta_keyID', articleId))
    # If there are no authors, authorID = 0
    authorIdList.extend(_linkRows(author_s, authorDict, 'authorID', articleId))
# Loads the dictionaries to csv files.
# Each lookup is flipped from name -> ID to ID -> name before it is written.
authorDict = dict(zip(authorDict.values(), authorDict.keys()))
authorFrame = pd.DataFrame(list(authorDict.items()), columns=['authorID', 'name'])
authorFrame.to_csv('author_name.csv', index=False)

tagDict = dict(zip(tagDict.values(), tagDict.keys()))
tagFrame = pd.DataFrame(list(tagDict.items()), columns=['tagID', 'tag'])
tagFrame.to_csv('tag_tag.csv', index=False)

metaDict = dict(zip(metaDict.values(), metaDict.keys()))
keyFrame = pd.DataFrame(list(metaDict.items()), columns=['meta_keyID', 'meta_keyword'])
keyFrame.to_csv('key_id_word.csv', index=False)

# Load csv files.
# One row per (article, tag / keyword / author) pair collected earlier.
article_tag = pd.DataFrame(articleTagList)
article_metaKey = pd.DataFrame(metaKeyList)
authorIdFrame = pd.DataFrame(authorIdList)

article_tag.to_csv('article_tag.csv', index=False)
article_metaKey.to_csv('met_key_article.csv', index=False)
authorIdFrame.to_csv('authorID.csv', index=False)


# Create dictionaries for loading to csv.
df['type'] = df['type'].fillna('NULL')

# value -> index label of the first row in which that value occurs.
typeDict = {value: label for label, value in df['type'].drop_duplicates().to_dict().items()}
domainDict = {value: label for label, value in df['domain'].drop_duplicates().to_dict().items()}

scrapeDict = {value: label for label, value in df['scraped_at'].drop_duplicates().to_dict().items()}
insertDict = {value: label for label, value in df['inserted_at'].drop_duplicates().to_dict().items()}
updatedDict = {value: label for label, value in df['updated_at'].drop_duplicates().to_dict().items()}

# One shared lookup for all three timestamp columns; a timestamp that occurs
# in several of them keeps the position of its first appearance.
timeDict = {}
for stampDict in (scrapeDict, insertDict, updatedDict):
    timeDict.update(stampDict)

#Giving timestamps IDs
# IDs are 3, 6, 9, ... in insertion order.
for i, key in enumerate(timeDict, start=1):
    timeDict[key] = i * 3

# Creating csv files.
# Attach the ID columns: each one looks a source column up in its dictionary.
for idColumn, lookup, sourceColumn in (
        ('domainID', domainDict, 'domain'),
        ('typeID', typeDict, 'type'),
        ('scrapedID', timeDict, 'scraped_at'),
        ('insertedID', timeDict, 'inserted_at'),
        ('updatedID', timeDict, 'updated_at')):
    # Defaults bind the current loop values into the lambda.
    df[idColumn] = df.apply(
        lambda row, lookup=lookup, sourceColumn=sourceColumn: lookup[row[sourceColumn]],
        axis=1)

articleColumns = ['id', 'title', 'url', 'content', 'summary',
                  'scrapedID', 'insertedID', 'updatedID', 'meta_description']
Articles = df[articleColumns].rename(columns={"id": "articleID"})
Articles.to_csv('articles.csv', index=False)

# Flip timestamp -> ID into ID -> timestamp for the lookup table.
timeDict = dict(zip(timeDict.values(), timeDict.keys()))
TimeStamps = pd.DataFrame(list(timeDict.items()), columns=['timeID', 'timestamp'])
TimeStamps.to_csv('timestamps.csv', index=False)

# Flip type -> ID into ID -> type for the lookup table.
typeDict = dict(zip(typeDict.values(), typeDict.keys()))
types = pd.DataFrame(list(typeDict.items()), columns=['typeID', 'type'])
types.to_csv('Types.csv', index=False)

# One row per domain: the first article seen for a domain supplies its type.
DomainTypes = df[['domainID', 'domain', 'typeID']].drop_duplicates(subset='domainID')
DomainTypes.to_csv('domain_types.csv', index=False)

Domains = df[['id', 'domainID']].rename(columns={"id": "articleID"})
Domains.to_csv('domains.csv', index=False)

Chunk cleaned 1


# Task 5
## 5.1 Spider
In order to scrape Wikipedia for articles we have used the Scrapy framework. Below is the code for our scrapy.Spider, which scrapes the article and obtains its HTML code. Then 