In [1]:
import pandas as pd
import praw
import os
from dotenv import load_dotenv
import boto3
import concurrent.futures
import time

In [18]:
load_dotenv()

client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_SECRET')
user_agent = os.getenv('REDDIT_USER_AGENT')

In [None]:
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name=os.getenv('AWS_REGION')  
)

In [21]:
reddit = praw.Reddit(
    client_id= client_id,
    client_secret= client_secret,
    user_agent= user_agent
)

In [None]:
import time
import pandas as pd
import concurrent.futures
import boto3
import praw  

s3 = boto3.client('s3')

def fetch_comments(submission):
    submission.comments.replace_more(limit=0)
    return [
        {
            'comment': comment.body,
            'score': comment.score,
            'submission_title': submission.title,
        }
        for comment in submission.comments.list()
    ]

def collect_and_upload_comments(subreddit_name, search_query, csv_file_path, s3_bucket_name, s3_file_name, max_workers=10, limit=50, max_comments=100):
    start_time = time.time()

    comments_data = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_comments, submission)
            for submission in reddit.subreddit(subreddit_name).search(search_query, sort='new', limit=limit)
        ]
        
        for future in concurrent.futures.as_completed(futures):
            comments_data.extend(future.result())

    comments_data = comments_data[:max_comments]

    comments_df = pd.DataFrame(comments_data)

    comments_df.to_csv(csv_file_path, index=False)

    s3.upload_file(csv_file_path, s3_bucket_name, s3_file_name)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Data uploaded to S3 successfully. Time taken: {elapsed_time:.2f} seconds.")

collect_and_upload_comments(
    subreddit_name="soccer",
    search_query="FC Barcelona",
    csv_file_path="barcelona_comments.csv",
    s3_bucket_name="reddit-football-text",
    s3_file_name="barca.csv"
)


Data uploaded to S3 successfully. Time taken: 8.63 seconds.


In [None]:
from transformers import pipeline

s3_bucket_name = 'reddit-football-text'
s3_file_name = 'man_utd.csv'

obj = s3.get_object(Bucket=s3_bucket_name, Key=s3_file_name)
data = pd.read_csv(obj['Body'])

sentiment_analysis = pipeline("sentiment-analysis", truncation=True, max_length=512)

def encode_sentiment(sentiment):
    return 1 if sentiment == "POSITIVE" else 0

batch_size = 10
encoded_sentiments = []

for i in range(0, len(data), batch_size):
    batch = data["comment"][i:i + batch_size].tolist() 
    batch_results = sentiment_analysis(batch) 

    encoded_sentiments.extend([encode_sentiment(result['label']) for result in batch_results])

average_sentiment = sum(encoded_sentiments) / len(encoded_sentiments)
print(f"Average Sentiment Score: {average_sentiment:.2f}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Average Sentiment Score: 0.34


In [None]:
import ScraperFC as sfc
import pandas as pd

ss = sfc.Sofascore()

league_data = ss.get_match_dicts("23/24", "EPL")

processed_matches = []
for match in league_data:
    match_data = {
        "tournament_name": match['tournament']['name'],
        "season_name": match['season']['name'],
        "round": match['roundInfo'].get('round', None),
        "status": match['status']['description'],
        "winner_code": match['winnerCode'],
        "home_team": match['homeTeam']['name'],
        "away_team": match['awayTeam']['name'],
        "home_score": match['homeScore']['display'],
        "away_score": match['awayScore']['display'],
        "injury_time_1": match['time'].get('injuryTime1', 0),
        "injury_time_2": match['time'].get('injuryTime2', 0),
        "start_timestamp": match['startTimestamp'],
        "match_slug": match['slug']
    }
    processed_matches.append(match_data)

# Convert to DataFrame
match_df = pd.DataFrame(processed_matches)
match_df.head()


In [None]:
ss = sfc.FBref()
season = ss.scrape_matches("2024-2025", "EPL")
season

In [37]:
season.head(1)["Home Player Stats"]

0    Summary              Unnamed: 0_level_0 Unname...
Name: Home Player Stats, dtype: object

In [27]:
fb = sfc.FBref()
match = fb.scrape_match('https://fbref.com/en/matches/67ed3ba2/Brentford-Tottenham-Hotspur-August-13-2023-Premier-League')