# Word2Vec

In [28]:
import re

import gensim
import nltk
import pandas as pd
from gensim.models import Word2Vec
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords

In [29]:
# Relative path of the headline CSV that later cells load with pd.read_csv.
source_file = 'data/news_firmnames.csv'

In [79]:
def preprocess(df):
    """Build the tokenised corpus from the ``title`` column of ``df``.

    The titles are handed to ``join_into_string`` to be merged into a single
    text, which ``tokenize`` then splits into per-sentence token lists.

    Args:
        df: DataFrame that has a ``title`` column.

    Returns:
        Whatever ``tokenize`` returns for the merged titles (a list of
        token lists, one per sentence).
    """
    titles = df['title'].to_list()
    merged_text = join_into_string(titles)
    return tokenize(merged_text)

def join_into_string(corpus, lang='en', detector=None):
    """Keep the titles detected as ``lang`` and join them with ``". "``.

    Entries that cannot be language-detected are skipped instead of aborting
    the whole run: non-string values (e.g. NaN for a missing title), blank
    strings, and strings for which the detector raises
    ``LangDetectException``.

    Args:
        corpus: Iterable of titles.
        lang: Language code a title must be detected as to be kept.
            Defaults to ``'en'``, the previously hard-coded value.
        detector: Callable mapping a string to a language code. Defaults to
            ``langdetect.detect``; pass a stub to avoid running real
            detection.

    Returns:
        The kept titles, in their original order, joined by ``". "``;
        an empty string if nothing was kept.
    """
    if detector is None:
        detector = detect

    kept = []
    for title in corpus:
        # Only real, non-blank text can be handed to the detector.
        if not isinstance(title, str) or not title.strip():
            continue
        try:
            if detector(title) == lang:
                kept.append(title)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. only digits
            # or punctuation); treat it as "not the requested language".
            continue
    return ". ".join(kept)

def clean_text(corpus):
    """Normalise raw text to lower-case ASCII letters separated by single spaces.

    The text is lower-cased, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and each run of whitespace is squeezed to one space.
    Leading or trailing whitespace is squeezed but not stripped.

    Args:
        corpus: The text to normalise.

    Returns:
        The normalised string.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', corpus.lower())
    return re.sub(r'\s+', ' ', letters_only)

def tokenize(processed_text):
    """Split text into sentences, then into word tokens without stop words.

    Args:
        processed_text: The text to tokenise.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the list of ``nltk.word_tokenize`` tokens of that
        sentence that are not in NLTK's English stop-word list.
    """
    # Load the stop words once and use a set: the original evaluated
    # stopwords.words('english') again for every single token and searched
    # the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    # NOTE(review): the membership test is case-sensitive, as before, so
    # capitalised stop words such as 'That' or 'Have' are kept.
    return [
        [token for token in nltk.word_tokenize(sentence) if token not in stop_words]
        for sentence in nltk.sent_tokenize(processed_text)
    ]

In [87]:
# Load the headlines and turn them into the tokenised training corpus.
df = pd.read_csv(source_file)
all_words = preprocess(df)

# Show a sample instead of dumping the whole corpus into the notebook.
# (The former `"".join(word)` loop discarded its result and was removed.)
all_words[:10]

[['US',
  'swoop',
  'sees',
  '3M',
  'masks',
  'allegedly',
  'diverted',
  'Berlin',
  '.'],
 ['3M',
  'Says',
  'No',
  'Evidence',
  'That',
  'Products',
  'Have',
  'Been',
  'Seized',
  '.'],
 ['3M',
  'Showcase',
  'Industry-First',
  '3D',
  'Printed',
  'PTFE',
  'Formnext',
  '.'],
 ['It',
  "'s",
  "'absurd",
  "'",
  '–',
  '3M',
  'CEO',
  'defends',
  'coronavirus',
  'response',
  'Trump',
  'invokes',
  'DPA',
  '.'],
 ['3M',
  ':',
  'Management',
  "'s",
  'Discussion',
  'Analysis',
  'Financial',
  'Condition',
  'Results',
  'Operations',
  '.'],
 ['(', 'form', '10-Q', ')', '.'],
 ['Trump',
  'Seeks',
  'Block',
  '3M',
  'Mask',
  'Exports',
  'Grab',
  'Masks',
  'From',
  'Its',
  'Overseas',
  'Customers',
  '.'],
 ['Is', '3M', 'Oversold', 'At', '$', '155', '?', '.'],
 ['Digital', 'pilots', 'move', 'production', '3M', '.'],
 ['COVID',
  '19',
  'Roundup',
  ':',
  '3M',
  ',',
  'DSM',
  'contribute',
  '....',
  'Continental',
  '3M',
  'develop',
  'new',


In [88]:
# Fit a Word2Vec model on the tokenised sentences with the library defaults,
# then display the learned vector for the token 'Amazon'.
word2vec = Word2Vec(sentences=all_words)
keyed_vectors = word2vec.wv
keyed_vectors['Amazon']

array([ 0.08304121,  0.19763143, -0.00599724, -0.05828396, -0.14206687,
       -0.14795819,  0.04528178,  0.12785545,  0.06138549,  0.06736533,
        0.27948093, -0.12975012, -0.21234435,  0.07645258,  0.07878023,
       -0.0220421 , -0.15914115, -0.05280257,  0.03616823,  0.08441284,
        0.03608496, -0.19201785,  0.03438567,  0.11135137, -0.0672859 ,
       -0.1094491 , -0.13030739,  0.0494657 ,  0.11911286,  0.01310428,
       -0.00501043, -0.09667324,  0.01165086, -0.00705838,  0.07181918,
        0.00608134, -0.04990504, -0.1302433 ,  0.12483603,  0.08733457,
       -0.10613253,  0.03896108,  0.08578913, -0.10453094,  0.09530341,
        0.10496309, -0.21840815, -0.00414203,  0.02403379, -0.04714741,
        0.2039163 , -0.038161  , -0.09622689,  0.00613147, -0.04751429,
        0.04652321,  0.01149416,  0.15077797, -0.15235762,  0.00092528,
       -0.00257382,  0.07883206,  0.02214593, -0.04623231, -0.0571675 ,
       -0.10038663, -0.0065578 , -0.10748696, -0.04741748,  0.09

In [99]:
topics = ['scandal', 'greenwashing', 'corruption', 'fraud', 'bribe', 'tax', 'forced', 'harassment', 'violation', 
          'human rights', 'conflict', 'weapons', 'arms trade', 'pollution', 'CO2', 'emission', 'fossil fuel',
          'gender inequality', 'discrimination', 'sexism', 'racist', 'intransparent', 'data privacy', 'lawsuit', 
          'unfair', 'bad', 'problem', 'hate', 'issues', 'controversial']

# NOTE(review): multi-word entries such as 'human rights' are looked up as a
# single key; the corpus is tokenised word by word, so they presumably can
# never be found - confirm and split or drop them.
topic_similarity = {}
for topic in topics:
    try:
        # Keep the score; the original computed it and threw it away.
        topic_similarity[topic] = word2vec.wv.similarity('amazon', topic)
    except KeyError:
        # similarity() raises KeyError when a word is not in the vocabulary.
        # Catching only that keeps unrelated errors visible.
        print(topic + ": nothing found")

# Similarities that could be computed, most similar first.
pd.Series(topic_similarity, dtype=float).sort_values(ascending=False)

scandal: nothing found
greenwashing: nothing found
corruption: nothing found
bribe: nothing found
forced: nothing found
harassment: nothing found
violation: nothing found
human rights: nothing found
conflict: nothing found
weapons: nothing found
arms trade: nothing found
pollution: nothing found
CO2: nothing found
emission: nothing found
fossil fuel: nothing found
gender inequality: nothing found
discrimination: nothing found
sexism: nothing found
racist: nothing found
intransparent: nothing found
data privacy: nothing found
unfair: nothing found
problem: nothing found
hate: nothing found
controversial: nothing found


# Sentiment Analysis

In [66]:
from pprint import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [49]:
# Read the headline CSV again so this section works on the raw table.
df = pd.read_csv(source_file)
df

Unnamed: 0,title,date,keyword
0,Lieferung für Berliner Polizei wurde in Thaila...,04.04.2020,3M
1,Innensenator Geisel: 200.000 Schutzmasken für ...,04.04.2020,3M
2,Beim US-Konzern 3M: Zoll kontrollierte Schutzm...,22.03.2020,3M
3,US swoop sees 3M masks allegedly diverted from...,03.04.2020,3M
4,3M Says No Evidence That Products Have Been Se...,05.04.2020,3M
...,...,...,...
1995,"Archer-Daniels-Midland: A 10-Year, Full-Cycle ...",19.12.2019,Archer-Daniels-Midland
1996,A Closer Look At Archer-Daniels-Midland Compan...,31.01.2020,Archer-Daniels-Midland
1997,Global Egg Replacement Ingredients Market 2019...,22.11.2019,Archer-Daniels-Midland
1998,Archer-Daniels-Midland Q2 2020 Earnings Preview,vor 3 Wochen,Archer-Daniels-Midland


In [65]:
# Plain Python list of all headline strings. The original first bound
# `headlines` to a two-column DataFrame whose 'keyword' column was never
# used, then rebound it to this list via an identity comprehension.
headlines = df['title'].tolist()
headlines

['Lieferung für Berliner Polizei wurde in Thailand zu besserem ...',
 'Innensenator Geisel: 200.000 Schutzmasken für Berlin in die USA "umgeleitet"',
 'Beim US-Konzern 3M: Zoll kontrollierte Schutzmasken und ...',
 'US swoop sees 3M masks allegedly diverted from Berlin',
 '3M Says No Evidence That Products Have Been Seized',
 '3M to Showcase Industry-First 3D Printed PTFE at Formnext',
 "It's 'absurd' – 3M CEO defends coronavirus response after Trump invokes DPA",
 "3M : Management's Discussion and Analysis of Financial Condition and Results of Operations. (form 10-Q)",
 'Trump Seeks to Block 3M Mask Exports and Grab Masks From Its Overseas Customers',
 'Is 3M Oversold At $155?',
 'Digital pilots move to production at 3M',
 'COVID 19 Roundup: 3M, DSM and more contribute to ...',
 'Continental and 3M to develop new V2I interfaces',
 '3M Response to Defense Production Act Order',
 '3M Reports Fourth-Quarter, Full-Year 2019 Results',
 '3M promises monthly updates in lieu of earnings forec

In [67]:
# Score every English headline with VADER and collect one record per headline
# (the polarity scores plus the headline text itself).
sia = SIA()
results = []

for line in headlines:
    # Non-string entries (e.g. NaN for a missing title) cannot be
    # language-detected or scored.
    if not isinstance(line, str):
        continue
    try:
        if detect(line) != 'en':
            continue
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); skip it rather than abort the whole loop.
        continue
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

pprint(results[:50], width=100)

[{'compound': 0.0,
  'headline': 'US swoop sees 3M masks allegedly diverted from Berlin',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': -0.296,
  'headline': '3M Says No Evidence That Products Have Been Seized',
  'neg': 0.216,
  'neu': 0.784,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': '3M to Showcase Industry-First 3D Printed PTFE at Formnext',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': "It's 'absurd' – 3M CEO defends coronavirus response after Trump invokes DPA",
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': "3M : Management's Discussion and Analysis of Financial Condition and Results of "
              'Operations. (form 10-Q)',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': -0.4404,
  'headline': 'Trump Seeks to Block 3M Mask Exports and Grab Masks From Its Overseas Customers',
  'neg': 0.182,
  'neu': 0.818,
  'pos': 0.0},
 {'compound': 0.0, 'headline': 'Is 3M Oversold At $155?', 'neg': 0.0, '

In [68]:
# Tabulate the VADER records and bucket the compound score into a label:
# 1 when compound > 0.2, -1 when compound < -0.2, otherwise 0.
df = pd.DataFrame.from_records(results)

compound = df['compound']
df['label'] = (compound > 0.2).astype('int64') - (compound < -0.2).astype('int64')
df

Unnamed: 0,neg,neu,pos,compound,headline,label
0,0.000,1.000,0.000,0.0000,US swoop sees 3M masks allegedly diverted from...,0
1,0.216,0.784,0.000,-0.2960,3M Says No Evidence That Products Have Been Se...,-1
2,0.000,1.000,0.000,0.0000,3M to Showcase Industry-First 3D Printed PTFE ...,0
3,0.000,1.000,0.000,0.0000,It's 'absurd' – 3M CEO defends coronavirus res...,0
4,0.000,1.000,0.000,0.0000,3M : Management's Discussion and Analysis of F...,0
...,...,...,...,...,...,...
1897,0.133,0.867,0.000,-0.3182,If You Had Bought Archer-Daniels-Midland (NYSE...,-1
1898,0.000,0.741,0.259,0.4215,Is There Now An Opportunity In Archer-Daniels-...,1
1899,0.000,1.000,0.000,0.0000,"Archer-Daniels-Midland: A 10-Year, Full-Cycle ...",0
1900,0.000,1.000,0.000,0.0000,A Closer Look At Archer-Daniels-Midland Compan...,0


In [69]:
# Keep only the headline text and its sentiment label for the listings below.
sentiments = df[['headline', 'label']]
sentiments

Unnamed: 0,headline,label
0,US swoop sees 3M masks allegedly diverted from...,0
1,3M Says No Evidence That Products Have Been Se...,-1
2,3M to Showcase Industry-First 3D Printed PTFE ...,0
3,It's 'absurd' – 3M CEO defends coronavirus res...,0
4,3M : Management's Discussion and Analysis of F...,0
...,...,...
1897,If You Had Bought Archer-Daniels-Midland (NYSE...,-1
1898,Is There Now An Opportunity In Archer-Daniels-...,1
1899,"Archer-Daniels-Midland: A 10-Year, Full-Cycle ...",0
1900,A Closer Look At Archer-Daniels-Midland Compan...,0


In [70]:
# List every headline labelled 1 (positive).
print("Positive Headlines:")
positive_headlines = sentiments.loc[sentiments['label'] == 1, 'headline'].tolist()
pprint(positive_headlines, width=200)

Positive Headlines:
['3M promises monthly updates in lieu of earnings forecasts',
 'Ford and 3M now shipping powered air-purifying respirators to health care workers; New Jersey orders 500,000 gowns',
 'Continental and 3M Partner to Develop Intelligent ...',
 'Penny Wise to lead 3M Canada as President Français',
 'Trump and 3M reach deal to allow N95 face masks to be exported to Canada',
 'U.S. Approves Abbott Labs Five-Minute ‘Rapid’ Coronavirus Test',
 'Free NHS antibody tests on way after Roche and Abbott Labs deals',
 'U.S. Approves Abbott Labs Coronavirus Test For Hospital Use',
 'With new FDA approval, Abbott sets goal of shipping 1 million ...',
 'U.S. approves Abbott coronavirus test; company set to ship 150,000',
 'Demand Surges For Abbott Labs Rapid Coronavirus Test As New Data Support Accuracy',
 'Abbott Tacks On Massive Coronavirus Test Growth — But Is ...',
 'Walgreens to expand COVID-19 test using Abbott test kits',
 'If You Had Bought Abbott Laboratories (NYSE:ABT) Stock

In [71]:
# List every headline labelled -1 (negative).
print("Negative Headlines:")
negative_headlines = sentiments.loc[sentiments['label'] == -1, 'headline'].tolist()
pprint(negative_headlines, width=200)

Negative Headlines:
['3M Says No Evidence That Products Have Been Seized',
 'Trump Seeks to Block 3M Mask Exports and Grab Masks From Its Overseas Customers',
 '3M to Cut Costs as Business Cools Beyond Masks',
 "'Dark Waters' movie poses risk for 3M, analyst says",
 '3M Tumbles Most Since April After Revealing Subpoena, Job Cuts',
 '3M Tumbles as Industrial Weakness Forces Another Forecast Cut',
 'Office and stationery sales stall at 3M | OPI',
 '30% Downside For 3M?',
 'Abbott Laboratories (NYSE:ABT) Seems To Use Debt Rather Sparingly',
 'Abbott Laboratories Earnings Missed Analyst Estimates: Here’s What Analysts Are Forecasting Now',
 'Drugmaker AbbVie will pay $24 million to California, whistleblower to settle fraud lawsuit',
 'Drugmaker AbbVie to pay $24 million to settle California insurance fraud allegations',
 "AbbVie's Humira, Roche's Rituxan top ICER's list of worst ...",
 "Will Allergan's specialty drug woes drag down post-merger ...",
 'AbbVie signs potential $2.4bn cancer i