# Word2Vec

In [23]:
import pandas as pd
import nltk
import re

from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [71]:
source_file = 'test_data.pkl'

In [31]:
def preprocess(df):
    corpus = df['title']
    corpus = join_into_string(corpus)
    
    processed_text = clean_text(corpus)
    
    all_words = tokenize(processed_text)
    return all_words

def join_into_string(corpus):
    corpus = [i for i in corpus]
    corpus = " ".join(corpus)
    return corpus

def clean_text(corpus):
    processed_text = corpus.lower()
    processed_text = re.sub('[^a-zA-Z]', ' ', processed_text)
    processed_text = re.sub(r'\s+', ' ', processed_text)
    return processed_text

def tokenize(processed_text):
    all_sentences = nltk.sent_tokenize(processed_text)
    all_words = [nltk.word_tokenize(sent) for sent in all_sentences]
    for i in range(len(all_words)):
        all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]
    return all_words

In [34]:
df = pd.read_pickle("test_data.pkl")
all_words = preprocess(df)

In [36]:
word2vec = Word2Vec(all_words, min_count=2)
sim_words = word2vec.wv.most_similar('microsoft')
sim_words

[('modern', 0.41677993535995483),
 ('cloud', 0.3742678761482239),
 ('available', 0.37268587946891785),
 ('software', 0.37263423204421997),
 ('acquisition', 0.3397260308265686),
 ('probleme', 0.32023799419403076),
 ('next', 0.3076329827308655),
 ('information', 0.30046969652175903),
 ('hit', 0.2910577654838562),
 ('ios', 0.28891003131866455)]

# Sentiment Analysis

In [44]:
from pprint import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [45]:
df = pd.read_pickle("test_data.pkl")
headlines = df["title"]
headlines = [headline for headline in headlines]
headlines

['Microsoft gewinnt Pentagon-Milliardenauftrag',
 'Mixer: 100 Millionen US-Dollar? Wie viel Microsoft der Exklusivvertrag mit Ninja gekostet haben könnte',
 'Login-Probleme bei Microsoft Teams: Softwareriese vergisst, Zertifikat zu erneuern',
 'Microsoft stellt Wunderlist ab - und launcht eigenes To-Do-Tool',
 'Bedienkonzept: Microsoft beerdigt Sets für Windows 10',
 'Multi-Cloud-Management: Mit Azure Arc drängt Microsoft ins ...',
 'Microsoft Flight Simulator 2020 fotorealistisch? Nvidia landet Twitter-Hit',
 'Microsoft Surface Pro X',
 'Xbox Live: Microsoft erlaubt mehr Freiheiten beim Gamertag - Golem.de',
 'Februar-Patchday: Microsoft stopft 99 Löcher',
 'Microsoft gewinnt Pentagon-Milliardenauftrag',
 'Mixer: 100 Millionen US-Dollar? Wie viel Microsoft der Exklusivvertrag mit Ninja gekostet haben könnte',
 'Login-Probleme bei Microsoft Teams: Softwareriese vergisst, Zertifikat zu erneuern',
 'Microsoft stellt Wunderlist ab - und launcht eigenes To-Do-Tool',
 'Bedienkonzept: Micros

In [47]:
sia = SIA()
results = []

for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

pprint(results[:50], width=100)

[{'compound': 0.0,
  'headline': 'Microsoft gewinnt Pentagon-Milliardenauftrag',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Mixer: 100 Millionen US-Dollar? Wie viel Microsoft der Exklusivvertrag mit Ninja '
              'gekostet haben könnte',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Login-Probleme bei Microsoft Teams: Softwareriese vergisst, Zertifikat zu erneuern',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Microsoft stellt Wunderlist ab - und launcht eigenes To-Do-Tool',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Bedienkonzept: Microsoft beerdigt Sets für Windows 10',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Multi-Cloud-Management: Mit Azure Arc drängt Microsoft ins ...',
  'neg': 0.0,
  'neu': 1.0,
  'pos': 0.0},
 {'compound': 0.0,
  'headline': 'Microsoft Flight Simulator 2020 fotorealistisch? Nvidia landet 

In [67]:
df = pd.DataFrame.from_records(results)

df['label'] = 0
df.loc[df['compound'] > 0.2, 'label'] = 1
df.loc[df['compound'] < -0.2, 'label'] = -1
df

Unnamed: 0,neg,neu,pos,compound,headline,label
0,0.000,1.000,0.000,0.0000,Microsoft gewinnt Pentagon-Milliardenauftrag,0
1,0.000,1.000,0.000,0.0000,Mixer: 100 Millionen US-Dollar? Wie viel Micro...,0
2,0.000,1.000,0.000,0.0000,Login-Probleme bei Microsoft Teams: Softwareri...,0
3,0.000,1.000,0.000,0.0000,Microsoft stellt Wunderlist ab - und launcht e...,0
4,0.000,1.000,0.000,0.0000,Bedienkonzept: Microsoft beerdigt Sets für Win...,0
...,...,...,...,...,...,...
205,0.000,1.000,0.000,0.0000,[New Pictures] Inside the 126 Metre Lürssen Su...,0
206,0.000,0.815,0.185,0.3612,BetterCloud is like Group Policy for your SaaS...,1
207,0.369,0.631,0.000,-0.6249,Is this the worst time ever to invest?,-1
208,0.000,1.000,0.000,0.0000,Online Learning: 10 Essential Computer Science...,0


In [68]:
sentiments = df[['headline', 'label']]
sentiments

Unnamed: 0,headline,label
0,Microsoft gewinnt Pentagon-Milliardenauftrag,0
1,Mixer: 100 Millionen US-Dollar? Wie viel Micro...,0
2,Login-Probleme bei Microsoft Teams: Softwareri...,0
3,Microsoft stellt Wunderlist ab - und launcht e...,0
4,Bedienkonzept: Microsoft beerdigt Sets für Win...,0
...,...,...
205,[New Pictures] Inside the 126 Metre Lürssen Su...,0
206,BetterCloud is like Group Policy for your SaaS...,1
207,Is this the worst time ever to invest?,-1
208,Online Learning: 10 Essential Computer Science...,0


In [69]:
print("Positive Headlines:")
pprint(list(sentiments[sentiments['label'] == 1].headline), width=200)

Positive Headlines:
['The new Surface Earbuds are Microsoft’s first truly wireless earbuds',
 'Microsoft shares rise after it beats revenue, profit expectations, Azure posts 62% growth',
 'Microsoft beats Amazon to win the Pentagon’s $10 billion JEDI cloud contract',
 'Protecting democratic elections through secure, verifiable ...',
 'Extending free Windows 7 security updates to voting systems ...',
 'AWS confirms reports it will challenge JEDI contract award to Microsoft',
 'New cyberthreats require new ways to protect democracy ...',
 'Microsoft Rides Cloud to a 21% Increase in Profits',
 'Microsoft reports a strong fiscal first quarter, but Azure’s growth rate continues to decline',
 'Microsoft invests $1 billion in artificial intelligence project co-founded by Elon Musk',
 'Microsoft is investing $1 billion in OpenAI to create brain-like machines',
 'India’s Reliance Jio inks deal with Microsoft to expand Office 365, Azure to more businesses; unveils broadband, blockchain and IoT p

In [70]:
print("Negative Headlines:")
pprint(list(sentiments[sentiments['label'] == -1].headline), width=200)

Negative Headlines:
['Microsoft snags hotly contested $10 billion defense contract, beating out Amazon',
 'Microsoft warns it will miss guidance for segment that includes Windows because of coronavirus',
 'Bill Gates says his ‘greatest mistake ever’ was Microsoft losing to Android',
 'Microsoft bans Slack and discourages AWS and Google Docs use internally',
 'Autostart, Uninstall & Co.: Wie Microsoft die PWA-Integration ...',
 'Microsoft Warns New Vulnerabilities Impact Every Version Of Windows 10',
 'Microsoft sneaks working Surface Duo demo into failed event recording',
 'Windows Hack Attackers Confirmed As Microsoft Responds With Powerful Counterpunch',
 'Microsoft’s Universal Windows Platform app dream is dead and buried',
 'Microsoft teases its secret dual-screen Surface device',
 'Microsoft stays silent on potential Huawei Windows ban',
 'How Top-Valued Microsoft Has Avoided the Big Tech Backlash',
 'Microsoft Surface Pro X review: ARM processor hurts app compatibility',
 'Micros