In [None]:
# Script to generate a sports sentiment database (1,000+ words and phrases, capped at 10,000 entries)

import random

import nltk
import pandas as pd
import requests
from afinn import Afinn
from bs4 import BeautifulSoup
from nltk.corpus import wordnet as wn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download the WordNet corpus that the `wn` reader needs for synset lookups
nltk.download('wordnet')

# Initialize the lexicon-based sentiment analyzers used to score each word
vader = SentimentIntensityAnalyzer()
afinn = Afinn()

# Base exciting sports keywords, grouped by theme for readability.
# The groups are merged into a single set below; the grouping itself carries
# no meaning for the rest of the notebook.

# Outcomes, titles and superlatives
_RESULT_TERMS = (
    "win", "victory", "champion", "record-breaking", "comeback", "triumph",
    "historic", "golden", "trophy", "medal", "legendary", "unbeatable",
    "domination", "clutch", "buzzer-beater", "overtime", "upset", "shock",
    "underdog", "grand slam", "stunning", "miracle", "epic", "thriller",
    "dynasty", "streak", "MVP", "all-star",
)

# Named plays and in-game situations
_PLAY_TERMS = (
    "perfect game", "no-hitter",
    "hat-trick", "slam dunk", "three-pointer", "fast break", "breakaway",
    "goal", "penalty", "overtime winner", "shootout", "free kick",
    "corner kick", "bicycle kick", "walk-off", "home run", "strikeout",
    "touchdown", "interception", "pick-six", "Hail Mary", "field goal",
    "red zone", "overtime thriller", "power play", "match-winner",
    "serve ace", "match point", "hole-in-one", "photo finish",
)

# Storylines and narrative hooks
_STORYLINE_TERMS = (
    "drama",
    "intensity", "highlight reel", "record-setting", "career-defining",
    "legacy", "unstoppable", "grit", "perseverance", "controversy",
    "scandal", "injury", "comeback story", "retirement", "farewell",
    "dream season", "superstar", "rivalry", "showdown", "title defense",
    "last-second heroics", "GOAT", "dominating performance", "instant classic",
)

exciting_keywords = {*_RESULT_TERMS, *_PLAY_TERMS, *_STORYLINE_TERMS}

# Function to get sports-related words from WordNet
def get_related_words(base_words, max_words=500):
    """Expand seed words with the lemma names of their WordNet synsets.

    Args:
        base_words: Iterable of seed words/phrases. All seeds are kept in
            the result, even if there are already more than ``max_words``.
        max_words: Stop expanding once the result holds at least this many
            entries.

    Returns:
        A sorted list containing the seeds plus the collected lemma names
        (underscores in lemma names are converted to spaces).
    """
    related_words = set(base_words)

    # Iterate the seeds in sorted order: plain set iteration order for strings
    # differs between interpreter runs, which would change which lemmas make
    # it in before the cap is reached.
    for word in sorted(related_words):
        if len(related_words) >= max_words:
            break
        # WordNet stores multi-word lemmas with underscores ("home_run"), so
        # phrases must be converted before lookup or they never match.
        for syn in wn.synsets(word.replace(' ', '_')):
            for lemma in syn.lemmas():
                related_words.add(lemma.name().replace('_', ' '))
                if len(related_words) >= max_words:
                    return sorted(related_words)

    return sorted(related_words)

# Expand the seed keywords with WordNet lemmas (cap raised from the default 500 to 5000)
expanded_keywords = get_related_words(exciting_keywords, max_words=5000)

# Function to scrape sports-related words from sports news websites
def scrape_sports_words(url, timeout=10):
    """Fetch a page and return the set of lowercase alphabetic words on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A set of lowercased tokens from the page text that are longer than
        three characters and purely alphabetic. An empty set is returned
        (and a message printed) when the page cannot be fetched or the
        server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx answers so the text of an error page is not
        # harvested as if it were sports vocabulary.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scraping: one unreachable source should not abort the run.
        print(f"Skipping {url}: {exc}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')

    return {
        token.lower()
        for token in soup.get_text().split()
        if len(token) > 3 and token.isalpha()
    }

# Sports news sources
sports_urls = [
    "https://www.espn.com",
    "https://www.bbc.com/sport",
    "https://www.fifa.com",
    "https://www.nba.com",
    "https://www.mlssoccer.com"
]

# Scrape words from multiple sources, accumulating them into one set
scraped_words = set()
for url in sports_urls:
    try:
        scraped_words.update(scrape_sports_words(url))
    except requests.RequestException as exc:
        # Keep the words gathered so far and move on to the next source
        # instead of aborting the whole cell on a single network failure.
        print(f"Could not scrape {url}: {exc}")

# Generate sports-related phrases by pairing each adjective with each noun
sports_nouns = ["goal", "match", "tournament", "champion", "victory", "team"]
sports_adjectives = ["exciting", "record-breaking", "unbeatable", "legendary"]
sports_verbs = ["win", "score", "compete", "dominate", "race", "strike"]  # not used by the phrase generation below

# There are only len(sports_adjectives) * len(sports_nouns) distinct phrases,
# so enumerate them directly rather than sampling at random and relying on
# the set to de-duplicate; the result is complete and identical on every run.
generated_phrases = {
    f"{adjective} {noun}"
    for adjective in sports_adjectives
    for noun in sports_nouns
}

# Merge all sources of sports words.
# Sort before truncating: slicing an unordered set would keep an arbitrary
# subset that changes from run to run whenever the cap is hit.
all_sports_words = sorted(set(expanded_keywords) | scraped_words | generated_phrases)
all_sports_words = all_sports_words[:10000]  # Limit to 10,000 words

print(f"Total Sports Words Collected: {len(all_sports_words)}")

# Function to get SentiWordNet sentiment score
def get_sentiwordnet_score(word):
    """Return the SentiWordNet polarity of the first sense of ``word``.

    The score is the positive score minus the negative score of the first
    SentiWordNet entry found for the word. The previous implementation
    returned the Wu-Palmer similarity between the word's first synset and
    the adjective "good", which measures taxonomy distance, not sentiment.

    Args:
        word: Word or phrase to score. Spaces are converted to underscores
            because WordNet stores multi-word lemmas that way.

    Returns:
        ``pos_score() - neg_score()`` of the first sense, or 0 when the word
        has no SentiWordNet entry.
    """
    # Local import keeps this function self-contained; NLTK corpus readers are
    # lazy, so nothing is read from disk until the first lookup.
    from nltk.corpus import sentiwordnet as swn

    lookup = word.replace(' ', '_')
    try:
        senti_synsets = list(swn.senti_synsets(lookup))
    except LookupError:
        # The SentiWordNet corpus is not installed yet: fetch it and retry once.
        nltk.download('sentiwordnet', quiet=True)
        senti_synsets = list(swn.senti_synsets(lookup))

    if not senti_synsets:
        return 0

    first_sense = senti_synsets[0]
    return first_sense.pos_score() - first_sense.neg_score()

# Generate sentiment scores: one row per word with each scorer's value
sports_sentiment_data = []
for word in all_sports_words:
    vader_score = vader.polarity_scores(word)["compound"]
    afinn_score = afinn.score(word)
    sentiwordnet_score = get_sentiwordnet_score(word)

    # The WordNet-based score is multiplied by 10 before averaging.
    # NOTE(review): the three scorers are not on a common scale, so
    # AverageScore is a rough composite rather than a calibrated value.
    avg_score = (vader_score + afinn_score + (sentiwordnet_score * 10)) / 3

    sports_sentiment_data.append([word, vader_score, afinn_score, sentiwordnet_score, avg_score])

# Convert to DataFrame
df = pd.DataFrame(sports_sentiment_data, columns=["Word", "VADER", "AFINN", "SentiWordNet", "AverageScore"])

# Save to CSV
df.to_csv("sports_sentiment_expanded.csv", index=False)

# Report the real row count: the word list is capped at 10,000 and may be
# smaller, so a hard-coded "10,000+" would be misleading.
print(f"✅ Sports Sentiment Database with {len(df):,} words saved to 'sports_sentiment_expanded.csv' successfully!")


In [3]:
import pandas as pd

# Load the CSV file produced by the generation cell.
# NOTE(review): this cell previously read "sports_sentiment.csv", which no
# cell in this notebook writes; the generation cell saves
# "sports_sentiment_expanded.csv". Point back at the old name only if that
# file is maintained separately.
file_path = "sports_sentiment_expanded.csv"
df = pd.read_csv(file_path)

# Ensure all sentiment scores are positive (keeps magnitude, drops polarity)
score_columns = ["VADER", "AFINN", "SentiWordNet", "AverageScore"]
df[score_columns] = df[score_columns].abs()

# Normalize AFINN, SentiWordNet, and AverageScore by dividing by the max value in each column.
# VADER is left as-is.
for col in ["AFINN", "SentiWordNet", "AverageScore"]:
    max_value = df[col].max()
    # Skip all-zero columns to avoid dividing by zero
    if max_value > 0:
        df[col] = df[col] / max_value

# Remove rows where AverageScore is 0
df = df[df["AverageScore"] > 0]

# Sort by AverageScore in descending order
df = df.sort_values(by="AverageScore", ascending=False)

# Save the cleaned and sorted data to a new CSV file
output_file = "sports_keywords.csv"
df.to_csv(output_file, index=False)

print(f"Processed data saved to {output_file}")
print("Data cleaning and normalization complete!")

Processed data saved to sports_keywords.csv
Data cleaning and normalization complete!
