In [7]:
import pandas as pd
import nltk
from pathlib import Path
import re
import string
import requests
from nltk import sent_tokenize
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from afinn import Afinn
import numpy as np
import networkx as nx
from tabulate import tabulate

In [8]:
from re import search
import s1_analysis
import collections
s1 = s1_analysis.s1_analysis()
s1.train_load("s1_model_2.sav")
pubs = []
names_list = ["Biden", "Bernie", "Warren", "Kamala"]


# read in data and sentence tokenize it
df = pd.read_csv('vikas.csv')
df['tokenized_sents'] = df.apply(
    lambda row: nltk.sent_tokenize(row['full_art']), axis=1)


# split df into dictionary with keys representing publisher and values representing df for those publishers
sources = {}
for source, df_source in df.groupby('source'):
    sources[source] = df_source

# grabs all sentences which mention candidate


def get_candidate_mentions(candidate_name, publisher, tokenize_level):
    mentions = []
    substring = candidate_name
    for article in sources[publisher][tokenize_level]:
        for sentence in article:
            if search(substring, sentence):
                mentions.append(sentence)
            else:
                continue
    return(mentions)


def get_sentiment_scores(names_list, publisher, tokenize_level, no_duplicates=None):
   # will store cand name and sentiment scores
    cand_sent = {}

    # build cand mention lists
    joe_biden_mentions = get_candidate_mentions(
        "Biden|Joe Biden", publisher, tokenize_level)
    bernie_mentions = get_candidate_mentions(
        "Bernie|Sanders", publisher, tokenize_level)
    Warren_mentions = get_candidate_mentions(
        "Warren|Elizabeth Warren", publisher, tokenize_level)
    Kamala_mentions = get_candidate_mentions(
        "Kamala|Harris", publisher, tokenize_level)

    # build lists for sentiment scoring loop
    cand_list = [joe_biden_mentions, bernie_mentions,
                 Warren_mentions, Kamala_mentions]

    # If we want to eliminate sentences with multiple candididate mentions, we call fucntion
    # with no_duplicates=1 and this block executes

    if no_duplicates == 1:

        # shared_sent will grab any shared sentences
        shared_sent = []
        for x in joe_biden_mentions:
            if search("Bernie|Sanders|Warren|Elizabeth Warren|Kamala|Harris", x):
                shared_sent.append(x)
        for x in bernie_mentions:
            if search("Joe|Joe Biden|Warren|Elizabeth Warren|Kamala|Harris", x):
                shared_sent.append(x)
        for x in Warren_mentions:
            if search("Joe|Joe Biden|Bernie|Sanders|Kamala|Harris", x):
                shared_sent.append(x)
        for x in Kamala_mentions:
            if search("Joe|Joe Biden|Bernie|Sanders|Warren|Elizabeth Warren", x):
                shared_sent.append(x)

        # use shared sent list to remove shared sent from each candidates sentence list
        for cand in cand_list:
            for sent in cand:
                if sent in shared_sent:
                    cand.remove(sent)

    # run sentiment score on each candidate and get count/store in returned dictionary
    for i, candidate in enumerate(cand_list):
        features = []
        results = []
        features = [s1.find_features(x) for x in candidate]
        results = [s1.classifier.classify(x) for x in features]
        counter = collections.Counter(results)
        cand_sent[names_list[i]] = counter
    ##print(cand_sent)
    return(cand_sent)

In [9]:
# will return a dictionary that as a key gives candidate name and as a value
# gives another dictionary with candidates name as key and sentiment scores as values
all_publishers = {}
pubnames = [x for x in sources.keys()]
for i, publisher in enumerate(pubnames):
   # pubs.append(get_sentiment_scores(names_list,publisher))
    scores = get_sentiment_scores(names_list, publisher, 'tokenized_sents')
    print(scores)
    all_publishers[publisher] = scores

{'Biden': Counter({'pos': 6, 'neg': 6}), 'Bernie': Counter({'neg': 5}), 'Warren': Counter({'neg': 7, 'pos': 6}), 'Kamala': Counter({'neg': 2})}
{'Biden': Counter({'neg': 1525, 'pos': 1473}), 'Bernie': Counter({'neg': 538, 'pos': 484}), 'Warren': Counter({'neg': 602, 'pos': 519}), 'Kamala': Counter({'pos': 216, 'neg': 205})}
{'Biden': Counter({'neg': 962, 'pos': 883}), 'Bernie': Counter({'neg': 306, 'pos': 276}), 'Warren': Counter({'neg': 434, 'pos': 387}), 'Kamala': Counter({'neg': 121, 'pos': 100})}
{'Biden': Counter({'pos': 2126, 'neg': 2044}), 'Bernie': Counter({'neg': 574, 'pos': 482}), 'Warren': Counter({'neg': 726, 'pos': 624}), 'Kamala': Counter({'neg': 244, 'pos': 193})}
{'Biden': Counter({'neg': 24, 'pos': 13}), 'Bernie': Counter({'neg': 30, 'pos': 20}), 'Warren': Counter({'neg': 41, 'pos': 20}), 'Kamala': Counter({'pos': 5, 'neg': 4})}
{'Biden': Counter({'pos': 119, 'neg': 113}), 'Bernie': Counter({'neg': 38, 'pos': 32}), 'Warren': Counter({'neg': 40, 'pos': 32}), 'Kamala': C

In [10]:
all_publishers

{'Bloomberg': {'Biden': Counter({'pos': 6, 'neg': 6}),
  'Bernie': Counter({'neg': 5}),
  'Warren': Counter({'pos': 6, 'neg': 7}),
  'Kamala': Counter({'neg': 2})},
 'Breitbart News': {'Biden': Counter({'neg': 1525, 'pos': 1473}),
  'Bernie': Counter({'neg': 538, 'pos': 484}),
  'Warren': Counter({'neg': 602, 'pos': 519}),
  'Kamala': Counter({'pos': 216, 'neg': 205})},
 'CNN': {'Biden': Counter({'pos': 883, 'neg': 962}),
  'Bernie': Counter({'neg': 306, 'pos': 276}),
  'Warren': Counter({'pos': 387, 'neg': 434}),
  'Kamala': Counter({'neg': 121, 'pos': 100})},
 'Fox News': {'Biden': Counter({'neg': 2044, 'pos': 2126}),
  'Bernie': Counter({'pos': 482, 'neg': 574}),
  'Warren': Counter({'pos': 624, 'neg': 726}),
  'Kamala': Counter({'neg': 244, 'pos': 193})},
 'Google News': {'Biden': Counter({'pos': 13, 'neg': 24}),
  'Bernie': Counter({'neg': 30, 'pos': 20}),
  'Warren': Counter({'neg': 41, 'pos': 20}),
  'Kamala': Counter({'neg': 4, 'pos': 5})},
 'MSNBC': {'Biden': Counter({'pos': 1

In [11]:
## smae as above block but with no_duplicates=1
all_publishers2 = {}
pubnames = [x for x in sources.keys()]
for i, publisher in enumerate(pubnames):
   # pubs.append(get_sentiment_scores(names_list,publisher))
    scores = get_sentiment_scores(names_list, publisher, 'tokenized_sents', 1)
    print(scores)
    all_publishers2[publisher] = scores

{'Biden': Counter({'neg': 4, 'pos': 3}), 'Bernie': Counter({'neg': 2}), 'Warren': Counter({'neg': 4, 'pos': 4}), 'Kamala': Counter({'neg': 1})}
{'Biden': Counter({'neg': 1428, 'pos': 1359}), 'Bernie': Counter({'neg': 440, 'pos': 388}), 'Warren': Counter({'neg': 494, 'pos': 383}), 'Kamala': Counter({'pos': 179, 'neg': 170})}
{'Biden': Counter({'neg': 896, 'pos': 799}), 'Bernie': Counter({'neg': 243, 'pos': 207}), 'Warren': Counter({'neg': 344, 'pos': 284}), 'Kamala': Counter({'neg': 98, 'pos': 80})}
{'Biden': Counter({'neg': 1958, 'pos': 1943}), 'Bernie': Counter({'neg': 458, 'pos': 349}), 'Warren': Counter({'neg': 578, 'pos': 430}), 'Kamala': Counter({'neg': 219, 'pos': 159})}
{'Biden': Counter({'neg': 16, 'pos': 12}), 'Bernie': Counter({'neg': 20, 'pos': 15}), 'Warren': Counter({'neg': 27, 'pos': 15}), 'Kamala': Counter({'pos': 5, 'neg': 1})}
{'Biden': Counter({'pos': 106, 'neg': 101}), 'Bernie': Counter({'neg': 28, 'pos': 24}), 'Warren': Counter({'neg': 26, 'pos': 19}), 'Kamala': Cou

## Need to look into this tokenizer, it should tokenize paragraphs but i couldn't install it

In [None]:
# from estnltk import Tokenizer
# tokenizer = Tokenizer()
# document = tokenizer.tokenize(text)

## Extra code from previous project

In [None]:
`
sns.set_style("darkgrid")


def name_entity_recognition(sentence):
    '''
    A function to retrieve name entities in a sentence.
    :param sentence: the sentence to retrieve names from.
    :return: a name entity list of the sentence.
    '''

    doc = nlp(sentence)
    # retrieve person and organization's name from the sentence
    name_entity = [x for x in doc.ents if x.label_ in ['PERSON']]
    # convert all names to lowercase and remove 's and ’s in names
    name_entity = [str(x).lower().replace("'s", "") for x in name_entity]
    name_entity = [x.replace("’s", "") for x in name_entity]
    # remove name words that are less than 3 letters to raise recognition accuracy
    name_entity = [x for x in name_entity if len(x) >= 3]

    return name_entity


def flatten(l):
    """A function that flattens a complex list"""
    flat_list = []
    for i in l:
        for j in i:
            flat_list.append(j)
    return flat_list


def nlist(book):
    """Returns a unique list of names from a sentence tokenized book"""
    names = []
    for i in book:
        if name_entity_recognition(i) != []:
            names.append(name_entity_recognition(i))
    names = list(set(flatten(names)))
    return names


def top_names(name_list, novel, top_num=25):
    '''
    Returns name freq of a book for each name
    '''

    vect = CountVectorizer(vocabulary=name_list, stop_words='english')
    name_frequency = vect.fit_transform([novel.lower()])
    name_frequency = pd.DataFrame(
        name_frequency.toarray(), columns=vect.get_feature_names())
    name_frequency = name_frequency.T
    name_frequency = name_frequency.sort_values(by=0, ascending=False)
    name_frequency = name_frequency[0:top_num]
    names = list(name_frequency.index)
    name_frequency = list(name_frequency[0])

    return name_frequency, names


def name_freq_plot(df, title):
    """plot for name freq"""
    sns.barplot(data=df,
                y=df.names,
                x=df.freq,
                color='blue')
    plt.title(title)
    plt.show()

## Builds a dictionary which stores all the names used in all articles from CNN via Spacy-NER

In [None]:
# this will build a dictionary which stores all the names used in all articles from CNN
sources["CNN"]['tokenized_sents'] = sources["CNN"].apply(
    lambda row: nltk.sent_tokenize(row['full_art']), axis=1)
article_dict = {}
for i, x in enumerate(sources["CNN"]['tokenized_sents']):
    article_dict[i] = nlist(x)
    print(i)