# Set up, data, dataframe creation

In [24]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

#Set working directory 
#os.chdir('/Users/Davide/Documents/University/RA/text_analysis')
os.getcwd()


'/Users/Davide/Documents/University/RA/text_analysis'

In [28]:
headlines_df = pd.read_csv('express.csv', index_col = 'date')
bbc_df = pd.read_csv('News_dataset.csv', sep=';')

In [29]:
headlines_df

Unnamed: 0_level_0,headline,newspaper
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1-3-2015,Prince Andrew 'frozen out by Charles over dama...,Express
1-3-2015,EXCLUSIVE: Migrants to put Britain's populatio...,Express
1-3-2015,EXCLUSIVE: Jihadi John exposed by web error: K...,Express
1-3-2015,Missing Becky Watts: Two arrested as family ad...,Express
1-3-2015,We can all benefit from a positive approach to...,Express
...,...,...
31-3-2017,Man United player: I was worried after manager...,Express
31-3-2017,Mark Lawrenson: My biggest worry about Liverpool,Express
31-3-2017,La Liga ace reveals messages received from Man...,Express
31-3-2017,Real Madrid News: James Rodriguez wants to joi...,Express


In [30]:
bbc_df

Unnamed: 0,File_Name,Content,Category,Complete_Filename
0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...,business,001.txt-business
1,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...,business,002.txt-business
2,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...,business,003.txt-business
3,004.txt,High fuel prices hit BA's profits\n\nBritish A...,business,004.txt-business
4,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...,business,005.txt-business
...,...,...,...,...
2220,397.txt,BT program to beat dialler scams\n\nBT is intr...,tech,397.txt-tech
2221,398.txt,Spam e-mails tempt net shoppers\n\nComputer us...,tech,398.txt-tech
2222,399.txt,Be careful how you code\n\nA new European dire...,tech,399.txt-tech
2223,400.txt,US cyber security chief resigns\n\nThe man mak...,tech,400.txt-tech


# Extracting Keywords

In [31]:
# Cleaning

#Special character
bbc_df['Content'] = bbc_df['Content'].str.replace("\r", " ")
bbc_df['Content'] = bbc_df['Content'].str.replace("\n", " ")
bbc_df['Content'] = bbc_df['Content'].str.replace("    ", " ")
bbc_df['Content'] = bbc_df['Content'].str.replace('"', '')

#Lowercase
bbc_df['Content'] = bbc_df['Content'].str.lower()

#Punctuation
punctuation_signs = list("?:!.,;")
bbc_df['Content'] = bbc_df['Content']

for punct_sign in punctuation_signs:
    bbc_df['Content'] = bbc_df['Content'].str.replace(punct_sign, '')

#Possessive pronouns
bbc_df['Content'] = bbc_df['Content'].str.replace("'s", "")

#Lemmatization
# Downloading punkt and wordnet from NLTK
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

nrows = len(bbc_df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = bbc_df.loc[row]['Content']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

bbc_df['Content'] = lemmatized_text_list

#Stopwords
# Downloading the stop words list
nltk.download('stopwords')
# Loading the stop words in english
stop_words = list(stopwords.words('english'))

bbc_df['Content'] = bbc_df['Content']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    bbc_df['Content'] = bbc_df['Content'].str.replace(regex_stopword, '')


[nltk_data] Downloading package punkt to /Users/Davide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Davide/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


------------------------------------------------------------


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Davide/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
bbc_df

Unnamed: 0,File_Name,Content,Category,Complete_Filename
0,001.txt,ad sales boost time warner profit quarterly p...,business,001.txt-business
1,002.txt,dollar gain greenspan speech dollar hit h...,business,002.txt-business
2,003.txt,yukos unit buyer face loan claim owners emb...,business,003.txt-business
3,004.txt,high fuel price hit ba profit british airways...,business,004.txt-business
4,005.txt,pernod takeover talk lift domecq share uk dr...,business,005.txt-business
...,...,...,...,...
2220,397.txt,bt program beat dialler scam bt introduce t...,tech,397.txt-tech
2221,398.txt,spam e-mail tempt net shoppers computer users...,tech,398.txt-tech
2222,399.txt,careful code new european directive could...,tech,399.txt-tech
2223,400.txt,us cyber security chief resign man make sure...,tech,400.txt-tech


In [33]:
biz_df = bbc_df[bbc_df.Category == 'business']
pol_df = bbc_df[bbc_df.Category == 'politics']

In [37]:
#Frequency count of individual words for business articles
biz_words = Counter()
biz_df['Content'].str.split().apply(biz_words.update)
print(biz_words)

Counter({'say': 1964, 'us': 808, '-': 788, 'year': 634, 'company': 625, 'mr': 600, 'firm': 556, 'market': 539, 'would': 463, 'bank': 456, 'rise': 452, 'also': 439, 'new': 415, 'price': 392, 'share': 391, 'growth': 383, 'last': 368, 'economy': 357, 'make': 350, 'government': 340, 'sales': 316, 'could': 311, '2004': 310, 'economic': 310, 'oil': 293, 'report': 292, 'expect': 272, 'take': 266, 'however': 256, 'trade': 255, 'world': 251, 'may': 248, 'years': 245, 'one': 243, 'profit': 238, 'chief': 236, 'deal': 235, 'two': 231, 'increase': 229, 'plan': 229, 'time': 226, 'group': 223, 'china': 222, 'business': 217, 'cost': 215, 'come': 213, 'fall': 208, 'uk': 207, 'state': 205, 'analysts': 204, 'cut': 203, 'financial': 197, 'job': 196, 'spend': 193, 'figure': 193, '2005': 191, 'stock': 190, 'continue': 187, 'see': 187, 'interest': 186, 'buy': 186, 'add': 185, 'offer': 184, 'since': 182, 'dollar': 178, 'go': 177, 'yukos': 174, 'tax': 174, 'december': 173, 'country': 172, 'months': 170, 'peopl

In [38]:
#Frequency count of individual words for business articles
pol_words = Counter()
pol_df['Content'].str.split().apply(pol_words.update)
print(pol_words)

Counter({'say': 2892, 'mr': 1684, 'would': 1049, '-': 766, 'labour': 759, 'government': 728, 'party': 696, 'people': 621, 'blair': 571, 'election': 565, 'minister': 563, 'plan': 485, "'": 483, 'make': 476, 'also': 452, 'new': 429, 'tell': 418, 'brown': 384, 'could': 381, 'go': 370, 'tax': 360, 'lord': 345, 'uk': 337, 'take': 336, 'public': 322, 'get': 321, 'howard': 319, 'one': 303, 'time': 291, 'issue': 290, 'work': 289, 'prime': 283, 'want': 276, 'tory': 267, 'secretary': 266, 'come': 266, 'claim': 266, 'right': 262, 'britain': 262, 'home': 260, 'campaign': 257, 'vote': 254, 'chancellor': 249, 'leader': 248, 'bbc': 247, 'give': 245, 'need': 240, 'think': 239, 'use': 230, 'general': 229, 'tories': 226, 'year': 222, 'add': 222, 'police': 216, 'next': 215, 'change': 213, 'report': 212, 'service': 211, 'call': 209, 'see': 207, 'last': 207, 'back': 206, 'believe': 198, 'tony': 195, 'two': 192, 'spokesman': 190, 'rule': 188, 'years': 186, 'mps': 186, 'spend': 185, 'law': 184, 'british': 18

# Classification

In [4]:
british_keywords = ["brexit", "european union", " eu ", "british", "british identity", "british passport", "british culture", "british heritage", "british goods", "british products","british manufacturing", "made in britain"]
climate_keywords = ["climate change", "paris agreement", "environment", "global warming", "unfccc"]
immigration_keywords = [""]

headlines_df['classification'] = ''
classification_lst = []


def headline_classifier(string, keyword_lst_class_1, keyword_lst_class_2, name_class_1 = 'class_1', name_class_2 = 'class_2'):
    '''Classifies a list of strings according to keyword lists for class_1 and class_2'''
    string = string.lower()
    count_1 = 0
    count_2 = 0
    for keyword in keyword_lst_class_1:
        if keyword in string:
            count_1 =+ 1
    for keyword in keyword_lst_class_2:

        if keyword in string:
            count_2 =+ 1
    
    if count_1 > 0 and count_2 > 0:
        return 'both'
    elif count_1 > 0:
        return name_class_1
    elif count_2 > 0:
        return name_class_2
    else:
        return 'none'

In [6]:
n_row, n_col = headlines_df.shape
headlines_lst = list(headlines_df['headline'].values)

#Headline classification
classification_lst = []
for i in range(n_row):
    headline_i = str(headlines_lst[i]) #Include this becasue of potential nan values
    classification_lst.append(headline_classifier(headline_i, british_keywords, climate_keywords, name_class_1 = 'Brexit', name_class_2 = 'Climate'))

headlines_df['classification'] = classification_lst
headlines_df.head(50)
headlines_df.classification.value_counts()

none       207415
Brexit      11789
Climate       207
both           11
Name: classification, dtype: int64

In [232]:
#Testing
#To do find more elegant way to deal with eu than " eu "
test_lst = []

test_headline = headlines_df.iloc[43]['headline']     
test_lst.append(headline_classifier(test_headline, british_keywords, climate_keywords, name_class_1 = 'Brexit', name_class_2 = 'Climate'))              
test_lst

test_lst

['none']

# Sentiment Analysis

In [12]:
#Empty list to add the polarity score
polarity_lst = []
subjectivity_lst = []

#Headline sentiment
for i in range(n_row):
    headline_i = str(headlines_lst[i]) #Include this becasue of potential nan values
    blob_headline_i = TextBlob(headline_i) #transforming string into textblob
    polarity_lst.append(blob_headline_i.sentiment.polarity)
    subjectivity_lst.append(blob_headline_i.sentiment.subjectivity)

#Adding polarity and subjectivity scores to the headlines dataframe
headlines_df['polarity'] = polarity_lst
headlines_df['subjectivity'] = subjectivity_lst
headlines_df

Unnamed: 0_level_0,headline,newspaper,classification,polarity,subjectivity
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1-3-2015,'We must monitor Islamic student societies wit...,Express,Brexit,0.000000,0.000000
2-3-2015,Now EU targets halogen bulbs: Brussels could B...,Express,Brexit,0.200000,0.350000
2-3-2015,In bed with Brook: British model Kelly shows o...,Express,Brexit,0.000000,0.000000
2-3-2015,Tory MP says Britain would be better off witho...,Express,Brexit,0.500000,0.500000
2-3-2015,Two out of three British laws were made in Bru...,Express,Brexit,0.000000,0.000000
...,...,...,...,...,...
31-3-2017,Paul Nuttall hails Britain’s ‘great future’ & ...,Express,Brexit,0.400000,0.437500
31-3-2017,Hitler 'stopped advancing on British troops at...,Express,Brexit,0.000000,0.000000
31-3-2017,Chris Ashton opens up about possibility of get...,Express,Brexit,0.000000,0.000000
31-3-2017,British endurance cyclist dies after being str...,Express,Brexit,0.000000,0.000000


In [25]:
#Creating dataframe for each class
brexit_df = headlines_df[headlines_df.classification == 'Brexit']
climate_df = headlines_df[headlines_df.classification == 'Climate']
both_df = headlines_df[headlines_df.classification == 'both']

#
results_df = pd.DataFrame(index = ['polarity', 'subjectivity'], columns = ['Brexit', 'Climate', 'Both'])
results_df['Brexit'] = brexit_df.mean(axis = 0, numeric_only = True)
results_df['Climate'] = climate_df.mean(axis = 0, numeric_only = True)
results_df['Both'] = both_df.mean(axis = 0, numeric_only = True)

results_df

Unnamed: 0,Brexit,Climate,Both
polarity,0.0263,0.012187,-0.139876
subjectivity,0.246679,0.28284,0.33905
