# Sports Sentiment Analysis Final Project

by Mikey McGrath

## 1. Library Imports

In [139]:
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

In [140]:
import ssl

# Workaround for SSL certificate-verification failures during nltk.download():
# swap the default HTTPS context factory for the unverified one when it exists.
# NOTE(review): this appears to disable certificate verification for every
# HTTPS request that uses the default context for the rest of the kernel
# session, not only for the NLTK downloads -- confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The private helper is missing on this Python build; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK resources. 'vader_lexicon' is presumably what backs the
# SentimentIntensityAnalyzer created later; 'punkt' and
# 'averaged_perceptron_tagger' are not referenced by the cells visible here.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data Processing

This data comes from five saved monthly headline pages from the previous season (September, October, November, December, and February). Other months and other seasons can be added in the same way.

In [141]:
# Parse each saved monthly headline page into a BeautifulSoup document.
# BeautifulSoup reads the whole file inside its constructor, so each handle
# can be closed as soon as parsing returns; `with` guarantees that happens
# even if parsing raises (the original left all five files open).
with open("decdata.html", "rb") as f1:
    doc1 = BeautifulSoup(f1, "html.parser")
with open("sepdata.html", "rb") as f2:
    doc2 = BeautifulSoup(f2, "html.parser")
with open("octdata.html", "rb") as f3:
    doc3 = BeautifulSoup(f3, "html.parser")
with open("novdata.html", "rb") as f4:
    doc4 = BeautifulSoup(f4, "html.parser")
with open("febdata.html", "rb") as f5:
    doc5 = BeautifulSoup(f5, "html.parser")



In [142]:
# NFL team nicknames. Each nickname is a dict key and also the substring
# searched for in headlines. `teams` maps nickname -> list of headlines.
teams = {
    name: []
    for name in (
        "Bills", "Dolphins", "Patriots", "Jets",
        "Ravens", "Bengals", "Browns", "Steelers",
        "Texans", "Colts", "Jaguars", "Titans",
        "Broncos", "Chiefs", "Raiders", "Chargers",
        "Cowboys", "Giants", "Eagles", "Commanders",
        "Bears", "Lions", "Packers", "Vikings",
        "Falcons", "Panthers", "Saints", "Buccaneers",
        "Cardinals", "Rams", "49ers", "Seahawks",
    )
}
# Same keys in the same order, each with its own fresh empty list;
# filled later with every team's sentiment scores.
team_scores = {name: [] for name in teams}

In [143]:
# Collect every unique link title that mentions at least one team nickname,
# preserving first-seen order across the documents.
headlines = []
seen_titles = set()  # O(1) duplicate check instead of rescanning `headlines`
docs = [doc1, doc2, doc3, doc4, doc5]
for doc in docs:
    for link in doc.find_all("a"):
        text = link.get("title")
        # Skip links with no title and titles that were already collected
        if text is None or text in seen_titles:
            continue
        # Getting rid of headlines that don't mention a team
        # (plain substring match; stops at the first matching nickname)
        if any(team in text for team in teams):
            seen_titles.add(text)
            headlines.append(text)

In [144]:
# Assign each team their headlines (substring match on the nickname).
# Each list is rebuilt from scratch rather than appended to, so re-running
# this cell does not duplicate headlines and skew the averages.
for team in teams:
    teams[team] = [headline for headline in headlines if team in headline]

## 3. Sentiment Analysis

In [145]:
# Shared analyzer instance; get_polarity_scores() reads it as a module-level global.
sia = SentimentIntensityAnalyzer()

In [146]:
def get_polarity_scores(headline):
    """Score a single headline with the module-level analyzer `sia`.

    Returns:
        A 4-tuple in the fixed order (neg, pos, neu, compound), taken from
        the dict returned by `sia.polarity_scores(headline)`.
    """
    result = sia.polarity_scores(headline)
    return tuple(result[key] for key in ("neg", "pos", "neu", "compound"))

In [147]:
def get_avg_polarity_scores(team):
    """Average the sentiment scores over all headlines assigned to a team.

    Args:
        team: Key into the module-level `teams` dict.

    Returns:
        A 4-tuple (neg, pos, neu, compound) holding the mean of each score
        across the team's headlines. If the team has no headlines, all four
        values are NaN rather than raising ZeroDivisionError, so a team with
        no coverage is visible as missing data instead of crashing the run.

    Raises:
        KeyError: If `team` is not a key of `teams`.
    """
    team_headlines = teams[team]
    if not team_headlines:
        nan = float("nan")
        return (nan, nan, nan, nan)

    per_headline = [get_polarity_scores(headline) for headline in team_headlines]
    num_headlines = len(per_headline)
    # zip(*...) transposes the per-headline tuples into one column per score
    # type, so each column can be summed and averaged in a single pass.
    neg, pos, neu, compound = (
        sum(column) / num_headlines for column in zip(*per_headline)
    )
    return (neg, pos, neu, compound)

In [148]:
# Store each team's averaged (neg, pos, neu, compound) tuple.
# Direct assignment keeps the cell re-runnable: the previous
# append-then-take-[0] pattern raised AttributeError on a second run,
# because the stored value was already a tuple with no .append().
for team in teams:
    team_scores[team] = get_avg_polarity_scores(team)

In [149]:
# NFL power rankings at the time, best team first.
# A team's rank is its position in this list plus one.
actual = [
    "Chiefs", "49ers", "Ravens", "Lions",             # ranks 1-4
    "Bills", "Cowboys", "Texans", "Browns",           # ranks 5-8
    "Packers", "Eagles", "Dolphins", "Rams",          # ranks 9-12
    "Buccaneers", "Steelers", "Saints", "Jaguars",    # ranks 13-16
    "Colts", "Bengals", "Seahawks", "Raiders",        # ranks 17-20
    "Bears", "Vikings", "Broncos", "Jets",            # ranks 21-24
    "Chargers", "Titans", "Falcons", "Giants",        # ranks 25-28
    "Cardinals", "Patriots", "Commanders", "Panthers" # ranks 29-32
]

In [150]:
# Column-oriented table: one list per column, rows in `team_scores` order.
# Each score tuple is laid out as (neg, pos, neu, compound); the actual
# ranking is the team's 1-based position in `actual`.
dictionary = {
    "Team": list(team_scores),
    "Negative Score": [scores[0] for scores in team_scores.values()],
    "Positive Score": [scores[1] for scores in team_scores.values()],
    "Neutral Score": [scores[2] for scores in team_scores.values()],
    "Compound Score": [scores[3] for scores in team_scores.values()],
    "Actual Ranking": [actual.index(name) + 1 for name in team_scores],
}

In [151]:
# One row per team; the dict keys become the column names.
df = pd.DataFrame(dictionary)

## 4. Ranking Teams

Uncomment exactly one of the lines below (and comment out the others) to rank the teams by that column. The neutral score does not reveal much; if anything, it should put the middle-of-the-pack teams at the top.

In [152]:
# Alternative rankings (enable exactly one sort):
# df = df.sort_values("Negative Score").reset_index(drop=True)
# df = df.sort_values("Positive Score", ascending=False).reset_index(drop=True)
# df = df.sort_values("Neutral Score", ascending=False).reset_index(drop=True)
# Active ranking: highest compound score first. The index is then renumbered
# from 0 so that a row's index is its predicted rank minus one.
df = (
    df.sort_values(by="Compound Score", ascending=False)
      .reset_index(drop=True)
)

def get_diff(index, rank):
    """Return the absolute gap between an actual rank and a predicted rank.

    Args:
        index: 0-based position, so the predicted rank is `index + 1`.
        rank: 1-based actual ranking.
    """
    predicted_rank = index + 1
    return abs(rank - predicted_rank)

# Pair each row's index (its sorted position) with its actual ranking and
# record how far apart the predicted and actual ranks are.
df["Difference"] = [
    get_diff(position, actual_rank)
    for position, actual_rank in zip(df.index, df["Actual Ranking"])
]
df

Unnamed: 0,Team,Negative Score,Positive Score,Neutral Score,Compound Score,Actual Ranking,Difference
0,Chiefs,0.077592,0.139109,0.783323,0.13534,2,1
1,Packers,0.076651,0.135679,0.787679,0.120584,14,12
2,Texans,0.064879,0.103545,0.831576,0.109205,15,12
3,49ers,0.073844,0.122894,0.803272,0.094461,1,3
4,Ravens,0.054655,0.102648,0.84269,0.092429,8,3
5,Colts,0.066328,0.112434,0.821238,0.090482,26,20
6,Broncos,0.06137,0.097378,0.841252,0.077689,28,21
7,Titans,0.060009,0.092477,0.847495,0.071846,16,8
8,Seahawks,0.054387,0.083521,0.862092,0.065563,9,0
9,Patriots,0.069168,0.098842,0.83204,0.060568,24,14


In [153]:
# Mean absolute rank error over every team in the table.
# .mean() divides by the actual row count instead of a hard-coded 32, so the
# metric stays correct if teams are added, dropped, or filtered.
avg_error = df["Difference"].mean()
avg_error

8.5