# Sports Sentiment Analysis Final Project

by Mikey McGrath

## 1. Library Imports

In [37]:
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

In [38]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data Processing

This data is only from September of the previous season. I can access the other months of other seasons as well. 

In [39]:
# Importing the html file and seeing what it looks like
f = open("data.html", "rb")
doc = BeautifulSoup(f, "html.parser")
doc.prettify()


'<!DOCTYPE html>\n<html xmlns:fb="https://www.facebook.com/2008/fbml">\n <head>\n  <script>\n   (function redirectToHttpIfHttps() {\n   var win      = typeof window !== \'undefined\' && window,\n       location = win && win.location,\n       protocol = location && location.protocol;\n\n   if (protocol === \'https:\' && !true) {\n        location.href = location.href.replace(\'https://\', \'http://\');\n   }\n})();\n  </script>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>\n  <link href="https://a.espncdn.com/favicon.ico" mask="" rel="icon" sizes="any"/>\n  <meta content="#CC0000" name="theme-color"/>\n  <script type="text/javascript">\n   if(true && navigator && navigator.userAgent.toLowerCase().indexOf("teamstream") >= 0) {\n        window.location = \'http://m.espn.com/mobilecache/general/apps/sc\';\n    }\n  </script>\n  <script src="https://dcf.espn.com/TWDC-DTCI/prod/Bootstrap.js" type="text/javascript">\n  </script>\n  <title>\n   N

In [40]:
# List of NFL teams
teams = {"Bills" : [],"Dolphins": [],"Patriots": [],"Jets": [],
			"Ravens": [],"Bengals": [],"Browns": [],"Steelers": [],
			"Texans": [],"Colts": [],"Jaguars": [],"Titans": [],
			"Broncos": [],"Chiefs": [],"Raiders": [],"Chargers": [],
			"Cowboys": [],"Giants": [],"Eagles": [],"Commanders": [],
			"Bears": [],"Lions": [],"Packers": [],"Vikings": [],
			"Falcons": [],"Panthers": [],"Saints": [],"Buccaneers": [],
			"Cardinals": [],"Rams": [],"49ers": [],"Seahawks": []}
team_scores = {"Bills" : [],"Dolphins": [],"Patriots": [],"Jets": [],
			"Ravens": [],"Bengals": [],"Browns": [],"Steelers": [],
			"Texans": [],"Colts": [],"Jaguars": [],"Titans": [],
			"Broncos": [],"Chiefs": [],"Raiders": [],"Chargers": [],
			"Cowboys": [],"Giants": [],"Eagles": [],"Commanders": [],
			"Bears": [],"Lions": [],"Packers": [],"Vikings": [],
			"Falcons": [],"Panthers": [],"Saints": [],"Buccaneers": [],
			"Cardinals": [],"Rams": [],"49ers": [],"Seahawks": []}

In [41]:
headlines = []
for link in doc.find_all("a"):
    text = link.get("title")
    if text is not None:
        # Getting rid of headlines that don't mention a team
        for team in teams:
            if team in text and text not in headlines:
                headlines.append(text)


maybe find a way to include the cities in case the mascot is not a part of the headline

In [42]:
# Assign each team their headlines
for team in teams:
    for headline in headlines:
        if team in headline:
            teams[team].append(headline)


## 3. Sentiment Analysis

In [43]:
sia = SentimentIntensityAnalyzer()

In [44]:
# Return the sentiment score of each headline
def get_polarity_scores(headline):
    scores = sia.polarity_scores(headline)
    return scores['neg'], scores['pos'], scores['neu'], scores['compound']

In [45]:
# WORK ON FLUENCY OF THIS CHUNK
def get_avg_polarity_scores(team):
    neg_sum = 0
    pos_sum = 0
    neu_sum = 0
    compound_sum = 0
    for headline in teams[team]:
        scores = get_polarity_scores(headline)
        neg_sum += scores[0]
        pos_sum += scores[1]
        neu_sum += scores[2]
        compound_sum += scores[3]
    num_headlines = len(teams[team])
    neg = neg_sum / num_headlines
    pos = pos_sum / num_headlines
    neu = neu_sum / num_headlines
    compound = compound_sum / num_headlines
    return (neg, pos, neu, compound)

In [46]:
for team in teams:
    team_scores[team].append(get_avg_polarity_scores(team))
    team_scores[team] = team_scores[team][0]
team_scores

{'Bills': (0.10342307692307692,
  0.05188461538461538,
  0.8447307692307692,
  -0.08034615384615386),
 'Dolphins': (0.057571428571428565,
  0.03471428571428571,
  0.9077142857142856,
  -0.06531428571428573),
 'Patriots': (0.03564285714285714,
  0.07671428571428572,
  0.8877857142857143,
  0.08116428571428572),
 'Jets': (0.10664444444444443,
  0.06888888888888887,
  0.8244888888888889,
  -0.057717777777777785),
 'Ravens': (0.08407692307692308,
  0.060884615384615384,
  0.8550384615384614,
  -0.0261076923076923),
 'Bengals': (0.0926896551724138,
  0.10144827586206899,
  0.8058275862068964,
  0.03714137931034483),
 'Browns': (0.12048648648648647,
  0.10783783783783786,
  0.7716756756756756,
  0.010224324324324312),
 'Steelers': (0.1507333333333333,
  0.09706666666666666,
  0.7522333333333333,
  -0.05492333333333334),
 'Texans': (0.07666666666666666,
  0.12147619047619049,
  0.8018095238095236,
  0.13968571428571433),
 'Colts': (0.04270833333333333,
  0.05883333333333333,
  0.8984583333333

In [47]:
# NFL power rankings at the time
actual = ["49ers", "Chiefs", "Bills", "Eagles", "Cowboys", "Dolphins",
            "Lions", "Ravens", "Seahawks", "Buccaneers", "Browns",
            "Jaguars", "Chargers", "Packers", "Texans", "Titans",
            "Rams", "Saints", "Jets", "Bengals", "Falcons", "Commanders",
            "Steelers", "Patriots", "Vikings", "Colts", "Cardinals",
            "Broncos", "Raiders", "Giants", "Panthers", "Bears"]

In [48]:
dictionary = {"Team": [], "Negative Score": [], "Positive Score": [], "Neutral Score": [], "Compound Score": [], "Actual Ranking": []}
for team, value in team_scores.items():
    dictionary["Team"].append(team)
    dictionary["Negative Score"].append(value[0])
    dictionary["Positive Score"].append(value[1])
    dictionary["Neutral Score"].append(value[2])
    dictionary["Compound Score"].append(value[3])
    dictionary["Actual Ranking"].append(actual.index(team) + 1)

In [49]:
df = pd.DataFrame.from_dict(dictionary)
# FIGURE OUT HOW TO GET RID OF THE LEFT COLUMN
df

Unnamed: 0,Team,Negative Score,Positive Score,Neutral Score,Compound Score,Actual Ranking
0,Bills,0.103423,0.051885,0.844731,-0.080346,3
1,Dolphins,0.057571,0.034714,0.907714,-0.065314,6
2,Patriots,0.035643,0.076714,0.887786,0.081164,24
3,Jets,0.106644,0.068889,0.824489,-0.057718,19
4,Ravens,0.084077,0.060885,0.855038,-0.026108,8
5,Bengals,0.09269,0.101448,0.805828,0.037141,20
6,Browns,0.120486,0.107838,0.771676,0.010224,11
7,Steelers,0.150733,0.097067,0.752233,-0.054923,23
8,Texans,0.076667,0.121476,0.80181,0.139686,15
9,Colts,0.042708,0.058833,0.898458,0.064546,26


## 4. Ranking Teams

Uncomment one of the below lines to rank the teams according to one of the columns. Neutral does not reveal much (if anything, the middle of the pack teams should be at the top)

In [50]:
# df = df.sort_values("Negative Score")
# df = df.sort_values("Positive Score", ascending=False)
# df = df.sort_values("Neutral Score", ascending=False)
# df = df.sort_values("Compound Score", ascending=False)
df

Unnamed: 0,Team,Negative Score,Positive Score,Neutral Score,Compound Score,Actual Ranking
0,Bills,0.103423,0.051885,0.844731,-0.080346,3
1,Dolphins,0.057571,0.034714,0.907714,-0.065314,6
2,Patriots,0.035643,0.076714,0.887786,0.081164,24
3,Jets,0.106644,0.068889,0.824489,-0.057718,19
4,Ravens,0.084077,0.060885,0.855038,-0.026108,8
5,Bengals,0.09269,0.101448,0.805828,0.037141,20
6,Browns,0.120486,0.107838,0.771676,0.010224,11
7,Steelers,0.150733,0.097067,0.752233,-0.054923,23
8,Texans,0.076667,0.121476,0.80181,0.139686,15
9,Colts,0.042708,0.058833,0.898458,0.064546,26


In [51]:
# Assess the model based on how close the simulated rankings are to the actual rankings

# This doesn't work yet because I cannot figure out how to access the row of something in a dataframe given the team

# error_sum = 0
# for team in teams:
#     error_sum += df.

TypeError: 'method' object is not subscriptable