# DS320 Final Project
## Data Cleaning and Sentiment/Tag Addition Notebook
### Noah B Johnson

## Import Modules

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib3
import random
import time
import json
from tinydb import TinyDB, Query
import unicodedata
from textblob import TextBlob
import indicoio
indicoio.config.api_key = ''

## Configure Environment

In [3]:
# Open the TinyDB database that is backed by the JSON file db-backup1.json
db = TinyDB('db-backup1.json')

## Load from db

In [4]:
# Load every document returned by db.all() into a single DataFrame
df = pd.DataFrame(db.all())

## Basic Cleaning

In [44]:
# Reduce every article body to plain ASCII on a single line:
#   1. NFKD-normalize, then drop whatever cannot be encoded as ASCII
#   2. turn newlines and tabs into spaces
#   3. strip leading/trailing whitespace
cleanBody = [
    unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode()
    .replace("\n", " ")
    .replace("\t", " ")
    .strip()
    for text in df['body']
]
df['body'] = cleanBody

In [45]:
# Strip the guest-contributor suffix from author names.
# The double-dash variant is removed first, then the single-dash one.
df['author'] = (
    df['author']
    .str.replace(" -- Guest Contributor", "")
    .str.replace(" - Guest Contributor", "")
)

## Data Type Assignment

In [46]:
# All of the columns load as the generic 'object' dtype (probably read as strings).
# They need proper dtypes before datetime/categorical functionality is available.
# DataFrame.get_dtype_counts() was removed in pandas 1.0; counting the entries
# of df.dtypes gives the same per-dtype tally on old and new pandas alike.
df.dtypes.value_counts()

object    6
dtype: int64

In [37]:
# Parse the time column into datetime values
df.time = pd.to_datetime(df.time)
# Per-dtype column tally (DataFrame.get_dtype_counts() was removed in pandas 1.0)
df.dtypes.value_counts()

datetime64[ns]     1
float64           25
int64              4
object            10
dtype: int64

In [38]:
# Store the news site as a categorical variable
df.site = df.site.astype('category')
# Per-dtype column tally (DataFrame.get_dtype_counts() was removed in pandas 1.0)
df.dtypes.value_counts()

datetime64[ns]     1
float64           25
int64              4
object             9
category           1
dtype: int64

In [39]:
# Store the author as a categorical variable
df.author = df.author.astype('category')
# Per-dtype column tally (DataFrame.get_dtype_counts() was removed in pandas 1.0)
df.dtypes.value_counts()

category           2
datetime64[ns]     1
float64           25
int64              4
object             8
dtype: int64

## Metadata Construction

In [40]:
# Character count of each title, stored as its own column
df['titleLength'] = df['title'].str.len()

In [41]:
# Character count of each body, stored as its own column
df['bodyLength'] = df['body'].str.len()

## TextBlob Basic Sentiment
> Not expected to be used much; included mainly as a baseline for comparison

In [52]:
# Run TextBlob once per body and once per title, then split each sentiment
# result into its polarity and subjectivity components.
body_sentiments = [TextBlob(text).sentiment for text in df.body.values]
title_sentiments = [TextBlob(text).sentiment for text in df.title.values]

tb_BodyPolarity = [result.polarity for result in body_sentiments]
tb_BodySubjectivity = [result.subjectivity for result in body_sentiments]
tb_TitlePolarity = [result.polarity for result in title_sentiments]
tb_TitleSubjectivity = [result.subjectivity for result in title_sentiments]

In [53]:
# Attach the TextBlob scores to the DataFrame, one column per score list
for column, scores in (
    ('tb_BodyPolarity', tb_BodyPolarity),
    ('tb_BodySubjectivity', tb_BodySubjectivity),
    ('tb_TitlePolarity', tb_TitlePolarity),
    ('tb_TitleSubjectivity', tb_TitleSubjectivity),
):
    df[column] = scores

## Indico.io Sentiment

In [87]:
# Get the sentiment of every title. Each score is written to indico.txt as soon
# as its batch returns, so completed work is not lost if a later request fails.

batch_size = 70  # titles sent to the API per request
titles = df.title.tolist()

# The context manager flushes and closes the file even if a request raises,
# so every score written so far is on disk before anything reads the file.
with open('indico.txt', 'w') as indico_titleSentiment:
    # Stepping by batch_size never yields an empty batch, so the API is not
    # called with an empty list when the row count is a multiple of batch_size
    # (or when the DataFrame is empty).
    for start in range(0, len(titles), batch_size):
        batch = titles[start:start + batch_size]
        for score in indicoio.sentiment(batch):
            indico_titleSentiment.write(str(score) + "\n")
        # progress: index of the last row scored so far
        print(start + len(batch) - 1)

In [92]:
# add to dataframe
# indico.txt holds one score per line; float() tolerates the trailing newline.
# The context manager closes the file handle once the scores are read.
with open('indico.txt', 'r') as f:
    df['indico_TitleSentiment'] = [float(score) for score in f]

In [113]:
# Get the sentiment of every body

batch_size = 30  # bodies sent to the API per request
bodies = df.body.tolist()
indico_bodySentiment = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_bodySentiment.extend(indicoio.sentiment(batch))
    # progress: index of the last row scored so far
    print(start + len(batch) - 1)

In [114]:
# Store the body sentiment scores as a float column
column = 'indico_BodySentiment'
df[column] = indico_bodySentiment
df[column] = df[column].astype(float)

In [123]:
# Checkpoint: write the DataFrame as it currently stands to indico_backup.csv
df.to_csv("indico_backup.csv")

## Indico Political Lean

In [142]:
# Get the lean of every title

batch_size = 60  # titles sent to the API per request
titles = df.title.tolist()
indico_titleLean = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(titles), batch_size):
    batch = titles[start:start + batch_size]
    indico_titleLean.extend(indicoio.political(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [134]:
# Split each title's political-lean result into one list per party.
# A result that is not a dict scores 0 for every party
# (presumably a failed or empty prediction - TODO confirm).
# isinstance() is used instead of an exact type comparison so dict subclasses
# are treated as valid results too.
conservativeLean = [r['Conservative'] if isinstance(r, dict) else 0 for r in indico_titleLean]
greenLean = [r['Green'] if isinstance(r, dict) else 0 for r in indico_titleLean]
liberalLean = [r['Liberal'] if isinstance(r, dict) else 0 for r in indico_titleLean]
libertarianLean = [r['Libertarian'] if isinstance(r, dict) else 0 for r in indico_titleLean]

df['indico_TitleConservativeLean'] = conservativeLean
df['indico_TitleGreenLean'] = greenLean
df['indico_TitleLiberalLean'] = liberalLean
df['indico_TitleLibertarianLean'] = libertarianLean

In [143]:
# Get the lean of every body

batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_bodyLean = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_bodyLean.extend(indicoio.political(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [136]:
# Split each body's political-lean result into one list per party.
# A result that is not a dict scores 0 for every party
# (presumably a failed or empty prediction - TODO confirm).
# isinstance() is used instead of an exact type comparison so dict subclasses
# are treated as valid results too.
conservativeLean = [r['Conservative'] if isinstance(r, dict) else 0 for r in indico_bodyLean]
greenLean = [r['Green'] if isinstance(r, dict) else 0 for r in indico_bodyLean]
liberalLean = [r['Liberal'] if isinstance(r, dict) else 0 for r in indico_bodyLean]
libertarianLean = [r['Libertarian'] if isinstance(r, dict) else 0 for r in indico_bodyLean]

df['indico_BodyConservativeLean'] = conservativeLean
df['indico_BodyGreenLean'] = greenLean
df['indico_BodyLiberalLean'] = liberalLean
df['indico_BodyLibertarianLean'] = libertarianLean

In [146]:
# Checkpoint: write the DataFrame as it currently stands to indico_backup2.csv
df.to_csv("indico_backup2.csv")

## Indico Emotion

In [259]:
# Get the emotion of every title

batch_size = 60  # titles sent to the API per request
titles = df.title.tolist()
indico_titleEmotion = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(titles), batch_size):
    batch = titles[start:start + batch_size]
    indico_titleEmotion.extend(indicoio.emotion(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [155]:
# Split each title's emotion result into one list per emotion.
# A result that is not a dict scores 0 for every emotion
# (presumably a failed or empty prediction - TODO confirm).
# isinstance() is used instead of an exact type comparison so dict subclasses
# are treated as valid results too.
anger = [r['anger'] if isinstance(r, dict) else 0 for r in indico_titleEmotion]
joy = [r['joy'] if isinstance(r, dict) else 0 for r in indico_titleEmotion]
fear = [r['fear'] if isinstance(r, dict) else 0 for r in indico_titleEmotion]
sadness = [r['sadness'] if isinstance(r, dict) else 0 for r in indico_titleEmotion]
surprise = [r['surprise'] if isinstance(r, dict) else 0 for r in indico_titleEmotion]

df["indico_TitleAnger"] = anger
df["indico_TitleJoy"] = joy
df["indico_TitleFear"] = fear
df["indico_TitleSadness"] = sadness
df["indico_TitleSurprise"] = surprise

In [202]:
# Get the emotion of every body

batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_bodyEmotion = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_bodyEmotion.extend(indicoio.emotion(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [158]:
# Split each body's emotion result into one list per emotion.
# A result that is not a dict scores 0 for every emotion
# (presumably a failed or empty prediction - TODO confirm).
# isinstance() is used instead of an exact type comparison so dict subclasses
# are treated as valid results too.
anger = [r['anger'] if isinstance(r, dict) else 0 for r in indico_bodyEmotion]
joy = [r['joy'] if isinstance(r, dict) else 0 for r in indico_bodyEmotion]
fear = [r['fear'] if isinstance(r, dict) else 0 for r in indico_bodyEmotion]
sadness = [r['sadness'] if isinstance(r, dict) else 0 for r in indico_bodyEmotion]
surprise = [r['surprise'] if isinstance(r, dict) else 0 for r in indico_bodyEmotion]

df["indico_BodyAnger"] = anger
df["indico_BodyJoy"] = joy
df["indico_BodyFear"] = fear
df["indico_BodySadness"] = sadness
df["indico_BodySurprise"] = surprise

In [160]:
# Checkpoint: write the DataFrame as it currently stands to indico_backup3.csv
df.to_csv("indico_backup3.csv")

## Indico People

In [1]:
def getPeople(inputList, min_confidence=.55):
    """Return the distinct entity names found in one entity-extraction result.

    Entries are kept only when their confidence is strictly greater than
    ``min_confidence``. Exact duplicates are collapsed, and any name that is
    a substring of a longer kept name (e.g. a surname alongside the full
    name) is dropped in favour of the longer one.

    Args:
        inputList: iterable of dicts, each with a ``'text'`` string and a
            numeric ``'confidence'``.
        min_confidence: exclusive lower bound on ``'confidence'``. Defaults
            to .55.

    Returns:
        List of the surviving names, longest first with ties in alphabetical
        order, so the output is reproducible from run to run.
    """
    candidates = {
        person['text']
        for person in inputList
        if person['confidence'] > min_confidence
    }
    kept = []
    # Longest first, so a full name is always kept before its fragments are
    # examined. Equal-length distinct strings can never contain one another,
    # so the alphabetical tie-break only affects ordering, not membership.
    for name in sorted(candidates, key=lambda text: (-len(text), text)):
        if not any(name in longer for longer in kept):
            kept.append(name)
    return kept

In [208]:
# Extract the people mentioned in every body

batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_People = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_People.extend(indicoio.people(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [205]:
# Filter and de-duplicate the people found in each body
indico_cleanPeople = [getPeople(found) for found in indico_People]

In [209]:
# Attach the cleaned people lists, then checkpoint the DataFrame to CSV
df['indico_People'] = indico_cleanPeople
df.to_csv("indico_backup4.csv")

## Indico Places

In [253]:
# Extract the places mentioned in every body

batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_Places = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_Places.extend(indicoio.places(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [250]:
# Filter and de-duplicate the places found in each body.
# getPeople only reads 'text' and 'confidence', so it is reused for places.
indico_cleanPlaces = [getPeople(found) for found in indico_Places]

In [252]:
# Attach the cleaned place lists, then checkpoint the DataFrame to CSV
df['indico_Places'] = indico_cleanPlaces
df.to_csv("indico_backup5.csv")

## Indico Organizations

In [255]:
# Extract the organizations mentioned in every body

batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_Organizations = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_Organizations.extend(indicoio.organizations(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [256]:
# Filter and de-duplicate the organizations found in each body.
# getPeople only reads 'text' and 'confidence', so it is reused for organizations.
indico_cleanOrganizations = [getPeople(found) for found in indico_Organizations]

In [257]:
# Attach the cleaned organization lists, then checkpoint the DataFrame to CSV
df['indico_Organizations'] = indico_cleanOrganizations
df.to_csv("indico_backup6.csv")

## Indico Tags/Keywords

In [43]:
# To resume from the previous checkpoint instead of re-running the cells above:
#df = pd.read_csv("indico_backup6.csv")

# Get the text tags of every body
batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_textTags = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_textTags.extend(indicoio.text_tags(batch))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [19]:
def getTags(inputList, min_score=.1):
    """Return the tags from one tag-to-score mapping that clear the threshold.

    Tags are kept only when their score is strictly greater than
    ``min_score``. Any tag that is a substring of a longer kept tag is
    dropped in favour of the longer one.

    Args:
        inputList: dict mapping each tag (string) to a numeric score.
        min_score: exclusive lower bound on the score. Defaults to .1.

    Returns:
        List of the surviving tags, longest first with ties in alphabetical
        order, so the output is reproducible from run to run.
    """
    candidates = {tag for tag, score in inputList.items() if score > min_score}
    kept = []
    # Longest first, so a longer tag is always kept before the shorter tags
    # it contains are examined. Equal-length distinct strings can never
    # contain one another, so the tie-break only affects ordering.
    for tag in sorted(candidates, key=lambda text: (-len(text), text)):
        if not any(tag in longer for longer in kept):
            kept.append(tag)
    return kept

# Reduce each body's tag scores to its list of significant tags
indico_cleanTextTags = [getTags(scores) for scores in indico_textTags]

In [27]:
# Attach the cleaned tag lists, then checkpoint the DataFrame to CSV
df['indico_textTags'] = indico_cleanTextTags
df.to_csv("indico_backup7.csv")

In [44]:
# To resume from the previous checkpoint instead of re-running the cells above:
# df = pd.read_csv("indico_backup7.csv")

# Get the keywords of every body
batch_size = 60  # bodies sent to the API per request
bodies = df.body.tolist()
indico_keywords = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(bodies), batch_size):
    batch = bodies[start:start + batch_size]
    indico_keywords.extend(indicoio.keywords(batch, version=4))
    # progress: index of the last row processed so far
    print(start + len(batch) - 1)

In [28]:
# Reduce each body's keyword scores to its list of significant keywords
indico_cleanKeywords = [getTags(scores) for scores in indico_keywords]

In [31]:
# Attach the cleaned keyword lists, then checkpoint the DataFrame to CSV
df['indico_keywords'] = indico_cleanKeywords
df.to_csv("indico_backup8.csv")

## Indico HQ Sentiment

In [45]:
# Get the high-quality sentiment of every article (title and body together)

batch_size = 100  # articles sent to the API per request
# Join title and body with a space: plain concatenation fuses the last word of
# the title with the first word of the body into a single token.
articles = [
    title + " " + body
    for title, body in zip(df.title.values, df.body.values)
]
indico_hqSentiment = []
# Stepping by batch_size never yields an empty batch, so the API is not called
# with an empty list when the row count is a multiple of batch_size.
for start in range(0, len(articles), batch_size):
    batch = articles[start:start + batch_size]
    indico_hqSentiment.extend(indicoio.sentiment_hq(batch))
    # progress: index of the last row scored so far
    print(start + len(batch) - 1)

In [35]:
# Store the HQ sentiment scores as a float column
column = 'indico_hqSentiment'
df[column] = indico_hqSentiment
df[column] = df[column].astype(float)

## Save DataFrame to File

In [42]:
# Persist the finished DataFrame as a pickle file
df.to_pickle("finalDF.pickle")