# Set up, data, dataframe creation

In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

#Set working directory 
#os.chdir('/Users/Davide/Documents/University/RA/text_analysis')
os.getcwd()


'/Users/Davide/Documents/University/RA/text_analysis/sentiment_analysis-master'

In [2]:
#scraped data
headlines_df = pd.read_csv('express.csv', index_col = 'date')

daily_mail_df = pd.read_excel('daily_mail_after_filters.xlsx', index_col = 'date')
evening_standard_df = pd.read_excel('evening_standard_after_filters.xlsx', index_col = 'date')
express_df = pd.read_excel('express_after_filters.xlsx', index_col = 'date')
guardian_df = pd.read_excel('guardian_after_filters.xlsx', index_col = 'date')
independent_df = pd.read_excel('independent_after_filters.xlsx', index_col = 'date')
times_df = pd.read_excel('times_after_filters.xlsx', index_col = 'date')

#Concatenating all articles together
newspapers = [daily_mail_df, evening_standard_df, express_df, guardian_df, independent_df, times_df]
articles_df = pd.concat(newspapers)


#labeled data
bbc_df = pd.read_csv('News_dataset.csv', sep=';')
kaggle_df = pd.read_json (r'/Users/Davide/Documents/University/RA/text_analysis/sentiment_analysis-master/News_Category_Dataset_v2.json', lines = True)

In [3]:
articles_df

Unnamed: 0_level_0,headline,newspaper,author,page,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1-3-2015,Two-thirds of laws in Britain over the past tw...,Daily Mail,"Tamara Cohen, Political Correspondent for the ...",,Study found 64.7 per cent of the laws made bet...
1-3-2015,Now Europe wants to ban your halogen light bul...,Daily Mail,"Sean Poulter, Consumer Affairs Editor For The ...",,Follows ban of incandescent bulbs in bid to cu...
1-3-2015,Scott Walker admits flip-flop on illegal immig...,Daily Mail,Associated Press,,Wisconsin governor once favored plan whereby u...
1-3-2015,"Don't blame immigrants for ills of society, sa...",Daily Mail,"Matt Chorley, Political Editor for MailOnline",,"Letter to 500,000 parishes warns of blaming im..."
2-3-2015,Landmark EU ruling to cut plastic bag use by 8...,Daily Mail,Sean Poulter for the Daily Mail,,EU decision is a victory for Daily Mail reader...
...,...,...,...,...,...
1-3-2017,Warning that future depends on better infrastr...,The Times,Peter O’Dwyer,,Ireland’s leading construction lobby group h...
24-3-2017,Your five-minute digest,The Times,,,1 Gambling bosses are at war over fixed-odds...
27-3-2017,Your five-minute digest,The Times,,,TodayThe Bank of England’s financial policy ...
30-3-2017,Your five-minute digest,The Times,,,"1 Kim Mears, a managing director of BT Openr..."


In [4]:
headlines_df

Unnamed: 0_level_0,headline,newspaper
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1-3-2015,Prince Andrew 'frozen out by Charles over dama...,Express
1-3-2015,EXCLUSIVE: Migrants to put Britain's populatio...,Express
1-3-2015,EXCLUSIVE: Jihadi John exposed by web error: K...,Express
1-3-2015,Missing Becky Watts: Two arrested as family ad...,Express
1-3-2015,We can all benefit from a positive approach to...,Express
...,...,...
31-3-2017,Man United player: I was worried after manager...,Express
31-3-2017,Mark Lawrenson: My biggest worry about Liverpool,Express
31-3-2017,La Liga ace reveals messages received from Man...,Express
31-3-2017,Real Madrid News: James Rodriguez wants to joi...,Express


In [5]:
bbc_df

Unnamed: 0,File_Name,Content,Category,Complete_Filename
0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...,business,001.txt-business
1,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...,business,002.txt-business
2,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...,business,003.txt-business
3,004.txt,High fuel prices hit BA's profits\n\nBritish A...,business,004.txt-business
4,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...,business,005.txt-business
...,...,...,...,...
2220,397.txt,BT program to beat dialler scams\n\nBT is intr...,tech,397.txt-tech
2221,398.txt,Spam e-mails tempt net shoppers\n\nComputer us...,tech,398.txt-tech
2222,399.txt,Be careful how you code\n\nA new European dire...,tech,399.txt-tech
2223,400.txt,US cyber security chief resigns\n\nThe man mak...,tech,400.txt-tech


In [6]:
#Slice the df to have only Green and environment articles
green_df = kaggle_df[kaggle_df.category == 'GREEN']
env_df = kaggle_df[kaggle_df.category == 'ENVIRONMENT']


# Extracting Keywords

In [7]:
# Cleaning
# Downloading punkt and wordnet from NLTK
nltk.download('punkt')
nltk.download('wordnet')


def cleaning(df, column_str):
    '''function to clean a text within a column of a dataframe. The column name has to be a string'''
    #Special character
    df[column_str] = df[column_str].str.replace("\r", " ")
    df[column_str] = df[column_str].str.replace("\n", " ")
    df[column_str] = df[column_str].str.replace("    ", " ")
    df[column_str] = df[column_str].str.replace('"', '')

    #Lowercase
    df[column_str] = df[column_str].str.lower()

    #Punctuation
    punctuation_signs = list("?:!.,;-")

    for punct_sign in punctuation_signs:
        df[column_str] = df[column_str].str.replace(punct_sign, '')

    #Possessive pronouns
    df[column_str] = df[column_str].str.replace("'s", "")
    df[column_str] = df[column_str].str.replace("'", "")

    #Lemmatization
    # Saving the lemmatizer into an object
    wordnet_lemmatizer = WordNetLemmatizer()

    nrows = len(df)
    lemmatized_text_list = []

    for row in range(0, nrows):

        # Create an empty list containing lemmatized words
        lemmatized_list = []

        # Save the text and its words into an object
        text = df.iloc[row][column_str]
        text_words = text.split(" ")

        # Iterate through every word to lemmatize
        for word in text_words:
            lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        # Join the list
        lemmatized_text = " ".join(lemmatized_list)

        # Append to the list containing the texts
        lemmatized_text_list.append(lemmatized_text)

    df[column_str] = lemmatized_text_list

    #Stopwords
    # Downloading the stop words list
    nltk.download('stopwords')
    # Loading the stop words in english
    stop_words = list(stopwords.words('english'))

    df[column_str] = df[column_str]

    for stop_word in stop_words:

        regex_stopword = r"\b" + stop_word + r"\b"
        df[column_str] = df[column_str].str.replace(regex_stopword, '')

    return df

[nltk_data] Downloading package punkt to /Users/Davide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Davide/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
green_df = cleaning(green_df, 'headline')
bbc_df = cleaning(bbc_df, 'Content')
env_df = cleaning(env_df, 'headline')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .

In [9]:
biz_df = bbc_df[bbc_df.Category == 'business']
pol_df = bbc_df[bbc_df.Category == 'politics']

In [10]:
#Most commonly used words extractor
def keywords_extractor(df, headline_str, n):
    '''Return the most commonly used words from a dataframe with a column containing text.
    Arguments are dataframe, the name of the column in string format, and the number n of keywords needed'''
    df_words = Counter()
    df[headline_str].str.split().apply(df_words.update)
    df_words = df_words.most_common(20)
    df_words_lst = []
    for tup in df_words:
        df_words_lst.append(tup[0])
    return df_words_lst

#Keywords list creation
biz_keywords = keywords_extractor(biz_df, 'Content', n = 20)
pol_keywords = keywords_extractor(pol_df, 'Content', n = 20)
green_keywords = keywords_extractor(green_df, 'headline', n = 20)
env_keywords = keywords_extractor(env_df, 'headline', n = 20)

In [11]:
print(biz_keywords)

['say', 'us', 'year', 'company', 'mr', 'firm', 'market', 'would', 'bank', 'rise', 'also', 'new', 'price', 'share', 'growth', 'last', 'economy', 'make', 'government', 'sales']


In [12]:
print(pol_keywords)

['say', 'mr', 'would', 'labour', 'government', 'party', 'people', 'blair', 'election', 'minister', 'plan', 'make', 'also', 'new', 'tell', 'could', 'brown', 'go', 'tax', 'lord']


In [13]:
#Combining green and env keywords
climate_keywords = []
climate_keywords.extend(green_keywords)
climate_keywords.extend(env_keywords)

#Eliminating duplicates
climate_keywords = list(set(climate_keywords))

#Removing not useful keywords
climate_keywords.remove('(photos)')
climate_keywords.remove('(video)')

#Including some additional important words
climate_keywords.extend(["climate change", "paris agreement", "environment", "global warming", "unfccc"])

print(climate_keywords)

['climate', 'kill', 'california', 'day', 'dog', 'oil', 'week', 'find', 'extreme', 'energy', 'get', 'make', 'save', 'us', 'green', 'change', 'trump', 'animal', 'picture', 'world', 'water', 'could', 'take', 'say', 'photos', 'watch', 'weather', '2012', 'baby', 'hurricane', 'new', 'climate change', 'paris agreement', 'environment', 'global warming', 'unfccc']


In [14]:
british_keywords = ["brexit", "european union", " eu ", "british", "british identity", "british passport",
                    "british culture", "british heritage", "british goods", "british products","british manufacturing",
                    "made in britain"]
immigration_keywords = ['migrant', 'refugee', 'immigrant', 'asylum', 'Calais']


# Classification

In [15]:
#Defining the functions that need to be used 


def topic_classifier(string, keywords_tuple, name_class_tuple):
    '''Classifies a list of strings according to keywords lists presented inside a tuple.
    Also requires the names of the classes in string from in a second tuple'''
    string = string.lower()
    if len(keywords_tuple) == len(name_class_tuple) and len(keywords_tuple) > 0:
        no_of_classes = len(keywords_tuple)
        counts = []
        count = 0
        while no_of_classes > 0:
            no_of_classes -= 1
            for keyword in keywords_tuple[no_of_classes]:
                if keyword in string:
                    count += 1
            counts.insert(0, count)
            count = 0
    else:
        return "List of keywords do not match list of classes or the list of keywords is empty"
        
    if max(counts) > 0:
        max_list = max(counts)
        values = np.array(counts)
        ii = np.where(values==max_list)[0]
        results_temp = [name_class_tuple[index] for index in ii]
        sep = ","
        results = sep.join(results_temp)
        return results
    else:
        return 'Other'


def headline_classifier(string, keyword_lst_class_1, keyword_lst_class_2, name_class_1 = 'class_1', name_class_2 = 'class_2'):
    '''Classifies a list of strings according to keyword lists for class_1 and class_2'''
    string = string.lower()
    count_1 = 0
    count_2 = 0
    for keyword in keyword_lst_class_1:
        if keyword in string:
            count_1 =+ 1
    for keyword in keyword_lst_class_2:

        if keyword in string:
            count_2 =+ 1
    
    if count_1 > 0 and count_2 > 0:
        return 'both'
    elif count_1 > 0:
        return name_class_1
    elif count_2 > 0:
        return name_class_2
    else:
        return 'none'
    
def topic_classification(data, keywords_tuple, name_class_tuple):
    n_row, n_col = data.shape
    data_lst = list(data['headline'].values)

    #Headline classification
    classification_lst = []
    for i in range(n_row):
        data_i = str(data_lst[i]) #Include this becasue of potential nan values
        classification_lst.append(topic_classifier(data_i, keywords_tuple, name_class_tuple))
    return classification_lst

In [16]:
#Daily mail first classification
key_tup = (british_keywords, climate_keywords)
name_tup = ('Brexit', 'Climate')

daily_mail_df['classification'] = ''
classification_lst = []

daily_mail_classifications = topic_classification(daily_mail_df, key_tup, name_tup)
daily_mail_df = daily_mail_df.assign(classification = daily_mail_classifications)
daily_mail_df.classification.value_counts()

Climate           5601
Other             1776
Brexit            1242
Brexit,Climate    1152
Name: classification, dtype: int64

In [17]:
#Slice of only the Brexit terms
brexit_df = daily_mail_df[daily_mail_df.classification == 'Brexit']

#Further classification for Brexit articles
key_tup2 = (biz_keywords, pol_keywords, immigration_keywords)
name_tup2 = ('Business', 'Politics', 'Immigration')
classification_lst = []

daily_mail_brexit_class = topic_classification(brexit_df, key_tup2, name_tup2)
brexit_df = brexit_df.assign(classification = daily_mail_brexit_class)
brexit_df.classification.value_counts()

Other                            411
Politics                         374
Business                         230
Business,Politics                111
Immigration                       76
Politics,Immigration              24
Business,Politics,Immigration      8
Business,Immigration               8
Name: classification, dtype: int64

In [None]:
# n_row, n_col = headlines_df.shape
# headlines_lst = list(headlines_df['headline'].values)

# #Headline classification
# classification_lst = []
# for i in range(n_row):
#     headline_i = str(headlines_lst[i]) #Include this becasue of potential nan values
#     classification_lst.append(headline_classifier(headline_i, british_keywords, climate_keywords, name_class_1 = 'Brexit', name_class_2 = 'Climate'))

# headlines_df['classification'] = classification_lst
# headlines_df.head(50)
# headlines_df.classification.value_counts()

In [20]:
# #Testing
# #To do find more elegant way to deal with eu than " eu "
# test_lst = []

# test_headline = headlines_df.iloc[43]['headline']     
# test_lst.append(headline_classifier(test_headline, british_keywords, climate_keywords, name_class_1 = 'Brexit', name_class_2 = 'Climate'))              
# test_lst

# test_lst


["Two-thirds of laws in Britain over the past two decades 'have been inspired by Brussels'",
 'Now Europe wants to ban your halogen light bulbs: Sales could be stopped as early as next year as part of energy-saving drive',
 "Scott Walker admits flip-flop on illegal immigrants: GOP presidential hopeful ditches previous position for hard line against 'amnesty'",
 "Don't blame immigrants for ills of society, says Catholic church as Nichols voices 'dismay' that it is an issue for election",
 'Landmark EU ruling to cut plastic bag use by 80%: Countries set deadline of 2025 to meet target through charges or bans\xa0',
 "Tory rift over PM's pledge to slash migrant numbers: Cameron and May increasingly isolated over party's plans",
 "The wages of sin: Why do drugs and prostitution contribute so much more to Italy's GDP than any other European country?",
 "British-bound migrant wins £750 in damages for being 'wrongly evicted' from his Calais squat by French police",
 'Jobs surge means Britain n

# Sentiment Analysis

In [26]:
def sentiment_analysis (df, column_str):
    '''function needs a dataframe with a string column where to perfrom sentiment analysis
    provide the column name in string format'''
    n_row, n_col = df.shape
    headlines_lst = list(df[column_str].values)
    #Empty list to add the polarity score
    polarity_lst = []
    subjectivity_lst = []

    #Headline sentiment
    for i in range(n_row):
        headline_i = str(headlines_lst[i]) #Include this becasue of potential nan values
        blob_headline_i = TextBlob(headline_i) #transforming string into textblob
        polarity_lst.append(blob_headline_i.sentiment.polarity)
        subjectivity_lst.append(blob_headline_i.sentiment.subjectivity)

    #Adding polarity and subjectivity scores to the headlines dataframe
    df['polarity'] = polarity_lst
    df['subjectivity'] = subjectivity_lst
    return df

In [27]:
sentiment_analysis(daily_mail_df, 'headline')

Unnamed: 0_level_0,headline,newspaper,author,page,text,classification,polarity,subjectivity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1-3-2015,Two-thirds of laws in Britain over the past tw...,Daily Mail,"Tamara Cohen, Political Correspondent for the ...",,Study found 64.7 per cent of the laws made bet...,Climate,-0.250000,0.250000
1-3-2015,Now Europe wants to ban your halogen light bul...,Daily Mail,"Sean Poulter, Consumer Affairs Editor For The ...",,Follows ban of incandescent bulbs in bid to cu...,Climate,0.175000,0.275000
1-3-2015,Scott Walker admits flip-flop on illegal immig...,Daily Mail,Associated Press,,Wisconsin governor once favored plan whereby u...,Climate,-0.319444,0.402778
1-3-2015,"Don't blame immigrants for ills of society, sa...",Daily Mail,"Matt Chorley, Political Editor for MailOnline",,"Letter to 500,000 parishes warns of blaming im...",Climate,0.000000,0.100000
2-3-2015,Landmark EU ruling to cut plastic bag use by 8...,Daily Mail,Sean Poulter for the Daily Mail,,EU decision is a victory for Daily Mail reader...,Climate,0.000000,0.000000
...,...,...,...,...,...,...,...,...
31-3-2017,People smuggler who 'drove into the US with fo...,Daily Mail,Mail Online Reporter,,"Jose Emiliano Aguilar, 24, is accused of tryin...",Climate,0.000000,0.500000
31-3-2017,Bank of England keeps the vegans happy with pl...,Daily Mail,Giulia Crouch For Mailonline,,The plastic note was slammed by vegans when it...,Climate,0.485227,0.727273
31-3-2017,Women putting themselves at risk by being pros...,Daily Mail,Rebecca Taylor For Mailonline,,Victoria Bateman called the ban on prostitutio...,Climate,0.200000,0.600000
31-3-2017,EU Council President Tusk demands a Brexit dea...,Daily Mail,"Tim Sculthorpe, Deputy Political Editor For Ma...",,Theresa May in standoff with EU leaders after ...,Brexit,0.000000,0.000000


In [None]:
#Creating dataframe for each class
brexit_df = headlines_df[headlines_df.classification == 'Brexit']
climate_df = headlines_df[headlines_df.classification == 'Climate']
both_df = headlines_df[headlines_df.classification == 'both']

#
results_df = pd.DataFrame(index = ['polarity', 'subjectivity'], columns = ['Brexit', 'Climate', 'Both'])
results_df['Brexit'] = brexit_df.mean(axis = 0, numeric_only = True)
results_df['Climate'] = climate_df.mean(axis = 0, numeric_only = True)
results_df['Both'] = both_df.mean(axis = 0, numeric_only = True)

results_df