# Group assignment: Using unstructured data for sentiment analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pickle
from collections import Counter
import psutil

In [2]:
nlp = spacy.load('en_core_web_sm')
group_10_tickers = ['ABBV', 'ADBE', 'AMGN', 'ASML', 'AZN', 'BKNG', 'BLK', 'CHTR', 'CMCSA', 'COST', 'CRM', 'DUK', 'EL', 'GM', 'GS', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'LMT', 'LOW', 'MA', 'MCD', 'MDT', 'MET', 'MRK', 'NKE', 'NVO', 'NVS', 'ORCL', 'PEP', 'TMO', 'TMUS', 'TSLA', 'UNH', 'UNP', 'UPS', 'WFC', 'XOM']

# Process text to lemmatize, remove stop words and punctuation
def process_text(text):
    words = [word.lemma_.lower() for word in nlp(text) if not (word.is_space or word.is_stop or word.is_punct)]
    return words

# Function to load and process word lists
def load_and_process_LM(sheet_name):
    try:
        word_list = pd.read_excel('LoughranMcDonald_SentimentWordLists_2018.xlsx', sheet_name=sheet_name, header=None).squeeze().tolist()
        processed_set = set(process_text(' '.join(word_list)))
        return processed_set
    except Exception as e:
        print(f"Error loading {sheet_name} sheet: {e}")
        return set()

# Load and process Loughran & McDonald Word Lists
LM_negative_set = load_and_process_LM('Negative')
LM_positive_set = load_and_process_LM('Positive')
LM_uncertainty_set = load_and_process_LM('Uncertainty')

In [3]:
def score_transcript(text):
    max_words = 20_000
    if len(text) > max_words:
        text = text[:max_words]
    
    words = process_text(text)  # Process the text
    counts = Counter(words)     # Count the words, returns a dictionary with the word counts
    keys = set(counts.keys())   # Get the unique words in the transcript

    # Function to calculate the sentiment score
    # The score is the sum of the counts of the words in both the word set and the transcript divided by the total number of words in the transcript
    def calculate_score(keys, counts, word_set, total_words):
        return round((sum([counts[k] for k in (keys & word_set)]) / total_words), 4)
    
    total_words = len(words)
    pos = calculate_score(keys, counts, LM_positive_set, total_words)
    neg = calculate_score(keys, counts, LM_negative_set, total_words)
    unc = calculate_score(keys, counts, LM_uncertainty_set, total_words)

    return (pos, neg, unc) # Return a tuple with the positive, negative and uncertainty scores

def generate_sentiment_scores(data):
    transcripts_list = []

    for t in data:
        try:
            # Combine all speech segments into a single text string
            text = ' '.join(s['speech'][0] for s in t['transcript'])
            # Score the transcript
            scores = score_transcript(text)
            print(f'scores: {scores} Memory Usage: {psutil.virtual_memory().percent}% and CPU Usage: {psutil.cpu_percent(interval=1)}%')
            
            # Create a dictionary entry for the transcript with sentiment scores and metadata
            transcript_entry = {
                'date': pd.to_datetime(t['time']),
                'ticker': t['symbol'],
                'pos': scores[0],
                'neg': scores[1],
                'unc': scores[2]
            }
            transcripts_list.append(transcript_entry)
        except Exception as e:
            print(f'Error processing transcript: {e}')
            continue
    
    # Convert the list of transcript entries into a pandas DataFrame
    return pd.DataFrame(transcripts_list)

In [4]:
sentiment_scores = pd.DataFrame(columns=['date', 'ticker', 'pos', 'neg', 'unc'])

for ticker in group_10_tickers:
    filename = f'{ticker}.p'
    print(f'Processing {filename}')
    data = pickle.load(open(f'transcripts/{filename}', 'rb'))
    result = generate_sentiment_scores(data)
    sentiment_scores = pd.concat([sentiment_scores, result], ignore_index=True)


Processing ABBV.p
scores: (0.0409, 0.0104, 0.0169) Memory Usage: 77.8% and CPU Usage: 22.0%
scores: (0.0537, 0.0069, 0.0092) Memory Usage: 77.7% and CPU Usage: 18.1%
scores: (0.0546, 0.0128, 0.0145) Memory Usage: 77.7% and CPU Usage: 19.4%
scores: (0.03, 0.0483, 0.0249) Memory Usage: 77.5% and CPU Usage: 22.0%
scores: (0.0315, 0.0448, 0.0124) Memory Usage: 77.6% and CPU Usage: 43.8%
scores: (0.0487, 0.0085, 0.0256) Memory Usage: 78.3% and CPU Usage: 18.3%
scores: (0.0341, 0.053, 0.0088) Memory Usage: 77.8% and CPU Usage: 16.3%
scores: (0.0412, 0.0226, 0.0146) Memory Usage: 77.9% and CPU Usage: 17.8%
scores: (0.0302, 0.0488, 0.0098) Memory Usage: 78.1% and CPU Usage: 23.4%
scores: (0.0331, 0.0515, 0.0177) Memory Usage: 77.0% and CPU Usage: 13.4%
scores: (0.0363, 0.0162, 0.0209) Memory Usage: 76.7% and CPU Usage: 16.9%
scores: (0.047, 0.0136, 0.0159) Memory Usage: 77.0% and CPU Usage: 16.6%
scores: (0.0426, 0.01, 0.0133) Memory Usage: 77.0% and CPU Usage: 18.7%
scores: (0.0476, 0.0126, 0

  sentiment_scores = pd.concat([sentiment_scores, result], ignore_index=True)


scores: (0.0766, 0.0105, 0.0093) Memory Usage: 77.9% and CPU Usage: 25.7%
scores: (0.0522, 0.0072, 0.0119) Memory Usage: 77.5% and CPU Usage: 18.0%
scores: (0.0407, 0.0103, 0.0034) Memory Usage: 77.4% and CPU Usage: 22.4%
scores: (0.0506, 0.0165, 0.0075) Memory Usage: 77.7% and CPU Usage: 25.2%
scores: (0.0373, 0.0302, 0.0186) Memory Usage: 77.6% and CPU Usage: 18.7%
scores: (0.0419, 0.0191, 0.0111) Memory Usage: 77.2% and CPU Usage: 20.2%
scores: (0.0363, 0.0204, 0.0121) Memory Usage: 77.5% and CPU Usage: 22.7%
scores: (0.0547, 0.0283, 0.0094) Memory Usage: 77.9% and CPU Usage: 20.7%
scores: (0.0512, 0.0045, 0.0155) Memory Usage: 77.2% and CPU Usage: 20.9%
scores: (0.0473, 0.0039, 0.0123) Memory Usage: 77.1% and CPU Usage: 22.0%
scores: (0.0405, 0.006, 0.0166) Memory Usage: 77.1% and CPU Usage: 21.5%
scores: (0.0527, 0.006, 0.012) Memory Usage: 77.1% and CPU Usage: 22.1%
scores: (0.049, 0.0087, 0.0101) Memory Usage: 76.9% and CPU Usage: 22.5%
scores: (0.0544, 0.007, 0.016) Memory Usag

In [7]:
# Set date as index
sentiment_scores.set_index('date', inplace=True)

# Export to CSV
sentiment_scores.to_csv('sentiment_scores_group10.csv')

sentiment_scores

Unnamed: 0_level_0,ticker,pos,neg,unc
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-04 00:59:09,ABBV,0.0409,0.0104,0.0169
2020-10-30 20:18:17,ABBV,0.0537,0.0069,0.0092
2020-07-31 19:59:11,ABBV,0.0546,0.0128,0.0145
2020-05-01 19:27:53,ABBV,0.0300,0.0483,0.0249
2020-02-07 18:36:46,ABBV,0.0315,0.0448,0.0124
...,...,...,...,...
2006-10-26 19:17:46,XOM,0.0287,0.0161,0.0075
2006-07-28 00:01:19,XOM,0.0222,0.0251,0.0131
2006-04-27 18:32:03,XOM,0.0376,0.0140,0.0096
2006-01-31 14:15:56,XOM,0.0282,0.0197,0.0080
