# Sports Sentiment Analysis Final Project

by Mikey McGrath

## 1. Library Imports

In [184]:
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.sentiment import SentimentIntensityAnalyzer

In [185]:
import ssl

# Swap ssl's default HTTPS context factory for the unverified one when this
# Python build exposes it. NOTE(review): this turns off certificate checking
# for HTTPS requests that rely on that default -- presumably a workaround for
# certificate errors in nltk.download; confirm it is still needed.
unverified_context_factory = getattr(ssl, "_create_unverified_context", None)
if unverified_context_factory is not None:
    ssl._create_default_https_context = unverified_context_factory

# Fetch the NLTK resources. The last call stays a bare expression so the cell
# still displays its return value.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mikeymcgrath/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## 2. Data Processing

This data covers every month of the 2023 football season except January, because I could not access that month's webpage.

In [186]:
# Importing the html files
def _parse_html_file(path):
    """Parse the HTML file at ``path`` into a BeautifulSoup document.

    The file is opened in binary mode and closed as soon as parsing is done,
    so no file handles are left open for the lifetime of the kernel.
    """
    with open(path, "rb") as html_file:
        return BeautifulSoup(html_file, "html.parser")


doc1 = _parse_html_file("decdata.html")
doc2 = _parse_html_file("sepdata.html")
doc3 = _parse_html_file("octdata.html")
doc4 = _parse_html_file("novdata.html")
doc5 = _parse_html_file("febdata.html")


In [187]:
# List of NFL teams
team_names = (
    "Bills", "Dolphins", "Patriots", "Jets",
    "Ravens", "Bengals", "Browns", "Steelers",
    "Texans", "Colts", "Jaguars", "Titans",
    "Broncos", "Chiefs", "Raiders", "Chargers",
    "Cowboys", "Giants", "Eagles", "Commanders",
    "Bears", "Lions", "Packers", "Vikings",
    "Falcons", "Panthers", "Saints", "Buccaneers",
    "Cardinals", "Rams", "49ers", "Seahawks",
)

# Two independent mappings keyed by team name, each starting with its own
# empty list per team.
teams = {name: [] for name in team_names}
team_scores = {name: [] for name in team_names}

In [188]:
# Scraping all of the headlines from all of the HTML files
headlines = []
# Set of headlines already kept: gives a constant-time duplicate check while
# `headlines` preserves first-seen order.
seen_headlines = set()
docs = [doc1, doc2, doc3, doc4, doc5]
for doc in docs:
    for link in doc.find_all("a"):
        text = link.get("title")
        # Skip links without a title and headlines that were already collected
        if text is None or text in seen_headlines:
            continue
        # Getting rid of headlines that don't mention a team
        # (any() stops at the first team found instead of testing all 32)
        if any(team in text for team in teams):
            seen_headlines.add(text)
            headlines.append(text)

In [189]:
# Assign each team their headlines
for team in teams:
    # Match the team name as a whole word: a plain substring test would also
    # credit e.g. a headline about "Ramsey" to the "Rams".
    team_pattern = re.compile(r"\b" + re.escape(team) + r"\b")
    # Rebuild the list rather than appending to it, so re-running this cell
    # does not duplicate every headline.
    teams[team] = [headline for headline in headlines if team_pattern.search(headline)]

## 3. Sentiment Analysis

In [190]:
# One shared analyzer instance; get_polarity_scores() calls it for every headline.
sia = SentimentIntensityAnalyzer()

In [191]:
# Return the sentiment score of each headline
def get_polarity_scores(headline):
    """Score a single headline with the shared ``sia`` analyzer.

    Returns the ``'neg'``, ``'pos'``, ``'neu'`` and ``'compound'`` entries of
    ``sia.polarity_scores(headline)`` as a 4-tuple, in that order.
    """
    polarity = sia.polarity_scores(headline)
    return tuple(polarity[key] for key in ('neg', 'pos', 'neu', 'compound'))

In [192]:
# Function to return the average polarity scores of the headlines for each team
def get_avg_polarity_scores(team):
    """Average the polarity scores over every headline assigned to ``team``.

    Parameters
    ----------
    team : str
        Key into the module-level ``teams`` dictionary.

    Returns
    -------
    tuple of float
        ``(neg, pos, neu, compound)`` averages, in the same order that
        ``get_polarity_scores`` returns them. All four values are NaN when
        the team has no headlines, so the missing data stays visible
        downstream instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If ``team`` is not a key of ``teams``.
    """
    team_headlines = teams[team]
    if not team_headlines:
        # Nothing to average: report "no data" rather than dividing by zero.
        missing = float("nan")
        return (missing, missing, missing, missing)

    per_headline = [get_polarity_scores(headline) for headline in team_headlines]
    num_headlines = len(per_headline)
    # zip(*...) regroups the per-headline tuples into one column per score type.
    neg, pos, neu, compound = (
        sum(column) / num_headlines for column in zip(*per_headline)
    )
    return (neg, pos, neu, compound)

In [193]:
# Assigns each team their scores in the form of a tuple
for team in teams:
    # Assign the tuple directly (rather than appending it to the placeholder
    # list and unwrapping it again) so this cell can be re-run safely.
    team_scores[team] = get_avg_polarity_scores(team)

In [194]:
# NFL power rankings at end of the 2023 Season
# (list position encodes the rank: the first entry is rank 1)
actual = [
    "Chiefs", "49ers", "Ravens", "Lions",            # 1-4
    "Bills", "Cowboys", "Texans", "Browns",          # 5-8
    "Packers", "Eagles", "Dolphins", "Rams",         # 9-12
    "Buccaneers", "Steelers", "Saints", "Jaguars",   # 13-16
    "Colts", "Bengals", "Seahawks", "Raiders",       # 17-20
    "Bears", "Vikings", "Broncos", "Jets",           # 21-24
    "Chargers", "Titans", "Falcons", "Giants",       # 25-28
    "Cardinals", "Patriots", "Commanders", "Panthers",  # 29-32
]

In [195]:
# Create dictionary with all of the teams and their corresponding scores and rankings
# Each column is built in team_scores' iteration order, so row i of every
# column refers to the same team.
dictionary = {
    "Team": list(team_scores),
    "Negative Score": [scores[0] for scores in team_scores.values()],
    "Positive Score": [scores[1] for scores in team_scores.values()],
    "Neutral Score": [scores[2] for scores in team_scores.values()],
    "Compound Score": [scores[3] for scores in team_scores.values()],
    # Ranks are 1-based, hence the +1 on the list position
    "Actual Ranking": [actual.index(name) + 1 for name in team_scores],
}

In [196]:
# One row per team; the dictionary keys become the column names
df = pd.DataFrame(dictionary)

## 4. Ranking Teams

Uncomment one of the lines below to rank the teams by that column. The neutral score does not reveal much (if anything, sorting by it should put the middle-of-the-pack teams at the top).

In [197]:
# Rank the teams by exactly one score column: leave a single line active.
# df = df.sort_values(by="Negative Score", ignore_index=True)
# df = df.sort_values(by="Positive Score", ascending=False, ignore_index=True)
# df = df.sort_values(by="Neutral Score", ascending=False, ignore_index=True)
df = df.sort_values(by="Compound Score", ascending=False, ignore_index=True)

def get_diff(index, rank):
    """Return how many places a row position is from a 1-based ranking.

    ``index`` is treated as a 0-based position, so it is shifted by one
    before being compared with ``rank``.
    """
    predicted_rank = index + 1
    return abs(rank - predicted_rank)

# Distance between each row's position in the sorted table and its actual ranking
df["Difference"] = [
    get_diff(position, actual_rank)
    for position, actual_rank in zip(df.index, df["Actual Ranking"])
]
df

Unnamed: 0,Team,Negative Score,Positive Score,Neutral Score,Compound Score,Actual Ranking,Difference
0,Seahawks,0.054387,0.083521,0.862092,0.065563,19,18
1,Ravens,0.054655,0.102648,0.84269,0.092429,3,1
2,Chargers,0.056897,0.086916,0.856206,0.048381,25,22
3,Titans,0.060009,0.092477,0.847495,0.071846,26,22
4,Broncos,0.06137,0.097378,0.841252,0.077689,23,18
5,Falcons,0.064155,0.089364,0.846491,0.03546,27,21
6,Texans,0.064879,0.103545,0.831576,0.109205,7,0
7,Rams,0.064992,0.088055,0.846953,0.038339,12,4
8,Colts,0.066328,0.112434,0.821238,0.090482,17,8
9,Patriots,0.069168,0.098842,0.83204,0.060568,30,20


In [198]:
# Score the model based on how far off the actual ranking was from the estimated ranking
# Mean absolute rank error: .mean() divides by the number of rows actually in
# the table instead of a hard-coded 32.
avg_error = df["Difference"].mean()
avg_error

12.1875

Overall, this model turned out to be quite ineffective. On average, a team's predicted position was around 9 places away from its actual ranking (out of 32). For comparison, a completely random ordering of 32 teams would be off by about 10.7 places on average, so the model is only slightly better than chance and still not trustworthy.