# Teams Over Time

This notebook explains the 'horizontal' slice of how sentiment analysis depicts a particular team's gamethreads over multiple weeks.

## Imports

In [1]:
# Path hack
import os
# change directory from the current Analysis folder to the top level folder for easier navigation
# NOTE: '../' is resolved against the current working directory, so running this
# cell a second time in the same kernel moves up one more level.
os.chdir('../')
# confirm we're at /RedditTextAnalysis
print(os.getcwd())

# note that the %load_ext autoreload line only needs to be run once
%load_ext autoreload

/Users/prcork/MiscDataProjects/collaboration/RedditTextAnalysis


In [2]:
# by including this autoreload command, we only need to re-import if we make/save changes to the original py file
# NOTE(review): bare `%autoreload` (no mode argument) looks like a one-off reload performed
# when this cell runs, so the cell must be re-run after editing the module — confirm
# against the IPython autoreload docs.
%autoreload
from Functions.nfl_gamethreads import nfl_gamethreads

In [3]:
import pandas as pd
import sqlite3
import altair as alt

## Query Comments For Fan-Level Analysis

In [4]:
# Load every comment joined to its gamethread's week and season.
# sqlite3's connection context manager only commits/rolls back the transaction;
# it does not close the connection, so the connection is closed explicitly here.
conn = sqlite3.connect("nfl_gamethreads.db")
try:
    # pull in all comments from gamethreads in week 9 or earlier
    # (the query sets no lower bound on week and applies no season filter)
    comments_df = pd.read_sql_query('''SELECT comments.comment_id, comments.author, comments.body, comments.author_flair,
                                              comments.upvotes, comments.utc_time, comments.polarity, comments.subjectivity,
                                              gamethreads.week, gamethreads.season
                                    FROM comments
                                    INNER JOIN gamethreads ON comments.submission_id = gamethreads.submission_id
                                    WHERE gamethreads.week <= 9''', conn)
finally:
    conn.close()

In [5]:
# Preview the loaded comments (the notebook renders a truncated head/tail view)
comments_df

Unnamed: 0,comment_id,author,body,author_flair,upvotes,utc_time,polarity,subjectivity,week,season
0,hej5m66,Piano_Fingerbanger,Boo both teams! \n\nLet's get a tie!,buccaneers,2,1.632786e+09,0.000000,0.000000,3,2021
1,hej5n9d,,[deleted],none,5,1.632786e+09,0.000000,0.000000,3,2021
2,hej5n9g,TheBigSm0ke,Good measuring stick game for Hurts. Let’s see...,none,1,1.632786e+09,0.150000,0.500000,3,2021
3,hej5ner,harshalax31,Hopefully the streak of amazing prime time gam...,49ers,2,1.632786e+09,0.600000,0.900000,3,2021
4,hej5nmk,Saitsu,"Oh I'm ready, BRING ON THE TOXICITY!\n\nAs for...",none,3,1.632786e+09,-0.183333,0.366667,3,2021
...,...,...,...,...,...,...,...,...,...,...
636998,hjgrea1,Lieutenant_Meeper,Goddamn the most recent Super Bowl win really ...,broncos,1,1.636146e+09,0.438889,0.452778,9,2021
636999,hjgwm8t,andrewfnluck,It’s all in fun. I would prefer a couple of co...,colts,1,1.636148e+09,0.400000,0.433333,9,2021
637000,hjh8nqv,,[deleted],none,1,1.636152e+09,0.000000,0.000000,9,2021
637001,hjh9rbz,Zivmovic,I forgot the scientists that have made vaccine...,giants,1,1.636153e+09,0.338542,1.000000,9,2021


### Aggregate by Flair Across All Games

This represents the entire fandom across all gamethreads on a weekly basis.

In [6]:
# Mean polarity and subjectivity for each (week, flair) pair, returned as a flat frame
week_sum = (
    comments_df
    .groupby(['week', 'author_flair'])[['polarity', 'subjectivity']]
    .mean()
    .reset_index()
)

In [51]:
# Weekly mean polarity, one line per flair; week is treated as a nominal axis
alt.Chart(data=week_sum).mark_line().encode(
    x=alt.X("week:N"),
    y="polarity",
    color=alt.Color("author_flair"),
).properties(width=800, height=800)

### Aggregate by a team's gamethreads over time

In [6]:
# Reload the comments, this time also carrying each gamethread's home and away team.
# This rebinds `comments_df`; the new frame has the same columns as before plus
# `home_team` and `away_team`.
# sqlite3's connection context manager only commits/rolls back the transaction;
# it does not close the connection, so the connection is closed explicitly here.
conn = sqlite3.connect("nfl_gamethreads.db")
try:
    # pull in all comments from gamethreads in week 9 or earlier
    # (the query sets no lower bound on week and applies no season filter)
    comments_df = pd.read_sql_query('''SELECT comments.comment_id, comments.author, comments.body, comments.author_flair,
                                              comments.upvotes, comments.utc_time, comments.polarity, comments.subjectivity,
                                              gamethreads.week, gamethreads.season, gamethreads.home_team, gamethreads.away_team
                                    FROM comments
                                    INNER JOIN gamethreads ON comments.submission_id = gamethreads.submission_id
                                    WHERE gamethreads.week <= 9''', conn)
finally:
    conn.close()

In [72]:
# Every comment from a gamethread in which the Titans were the home or the away team
titans = comments_df[(comments_df['home_team'] == 'Tennessee Titans') | (comments_df['away_team'] == "Tennessee Titans")].copy()

# Weekly mean polarity/subjectivity across those gamethreads.
# Only the numeric sentiment columns are aggregated: `author_flair` is text, and
# pandas >= 2.0 raises TypeError when a non-numeric column reaches .mean()
# (older versions silently dropped it).
titans_by_week = titans.groupby('week')[['polarity', 'subjectivity']].mean().reset_index()
titans_by_week['team'] = 'Titans'

In [73]:
# Every comment from a gamethread in which the Saints were the home or the away team
saints = comments_df[(comments_df['home_team'] == 'New Orleans Saints') | (comments_df['away_team'] == "New Orleans Saints")].copy()

# Weekly mean polarity/subjectivity across those gamethreads.
# Only the numeric sentiment columns are aggregated: `author_flair` is text, and
# pandas >= 2.0 raises TypeError when a non-numeric column reaches .mean()
# (older versions silently dropped it).
saints_by_week = saints.groupby('week')[['polarity', 'subjectivity']].mean().reset_index()
saints_by_week['team'] = 'Saints'

In [78]:
# League-wide baseline: weekly mean polarity/subjectivity over every loaded comment.
# Only the numeric sentiment columns are aggregated: `author_flair` is text, and
# pandas >= 2.0 raises TypeError when a non-numeric column reaches .mean()
# (older versions silently dropped it).
all_by_week = comments_df.groupby('week')[['polarity', 'subjectivity']].mean().reset_index()
all_by_week['team'] = 'All'

In [79]:
# Stack the per-team weekly summaries and the league-wide baseline into one long frame
ten_no = pd.concat(
    [
        titans_by_week,
        saints_by_week,
        all_by_week,
    ]
)

In [81]:
# Weekly mean polarity, one line per `team` label; week is treated as a nominal axis
alt.Chart(data=ten_no).mark_line().encode(
    x=alt.X("week:N"),
    y="polarity",
    color=alt.Color("team:N"),
).properties(width=800, height=800)

Should we drop the 0/0s?
Can we find a better TA tool?
Normalize the polarity and subjectivity.
Grab the extremes as polarity.

Fan Profiles
    Evolution of network analysis
    Drill down to fans in their teams gamethreads

raw count of positive/negative

Year long week-to-week overtime distribution plot
ML pipeline using last year's data to predict which games will be more volatile than others

In [43]:
# Drop deleted comments, then keep only comments with strictly positive or strictly
# negative polarity (positive rows first, negative rows after)
comments_df_sub = comments_df.loc[comments_df['body'].ne('[deleted]')]
comments_df_pos = comments_df_sub.loc[comments_df_sub['polarity'].gt(0.0)]
comments_df_neg = comments_df_sub.loc[comments_df_sub['polarity'].lt(-0.0)]
comments_df_sub = pd.concat([comments_df_pos, comments_df_neg])
comments_df_sub.shape

(377932, 12)

In [44]:
# Same (week, flair) averages as before, computed on the filtered comments only
week_sum_sub = (
    comments_df_sub
    .groupby(['week', 'author_flair'])[['polarity', 'subjectivity']]
    .mean()
    .reset_index()
)

# Weekly mean polarity, one line per flair
alt.Chart(data=week_sum_sub).mark_line().encode(
    x=alt.X("week:N"),
    y="polarity",
    color=alt.Color("author_flair"),
).properties(width=800, height=800)

In [46]:
# Per (week, flair): count of comments whose polarity is below -0.75, largest counts first.
# .count() tallies non-null values in every remaining column; `polarity` is the sort key.
week_negative_sum = (
    comments_df_sub.loc[comments_df_sub['polarity'].lt(-.75)]
    .groupby(['week', 'author_flair'])
    .count()
    .reset_index()
    .sort_values('polarity', ascending=False)
)

In [65]:
# Per flair: mean of the weekly strongly-negative comment counts, highest first.
# Only weeks present in `week_negative_sum` for a flair contribute to its mean.
avg_neg = (
    week_negative_sum
    .groupby('author_flair')['polarity']
    .mean()
    .reset_index()
    .sort_values('polarity', ascending=False)
    .rename(columns={'polarity': 'avg_polar_comments'})
)

In [66]:
# Show the per-flair average weekly count of comments with polarity below -0.75
avg_neg

Unnamed: 0,author_flair,avg_polar_comments
21,none,183.857143
22,packers,87.857143
12,cowboys,83.571429
24,patriots,82.285714
33,vikings,67.571429
14,eagles,55.142857
3,bills,53.142857
27,ravens,51.714286
1,bears,51.0
29,seahawks,48.857143


In [67]:
# Per flair: count comments (non-null polarity) for each week, then average
# those weekly counts across weeks
total_comments = (
    comments_df
    .groupby(['author_flair', 'week'])['polarity']
    .count()
    .reset_index()
    .groupby('author_flair')['polarity']
    .mean()
    .reset_index()
    .rename(columns={'polarity': 'avg_comments'})
)

In [68]:
# Attach each flair's average weekly comment volume to its strongly-negative average;
# the left join keeps every row (and the row order) of `avg_neg`
com_df = pd.merge(avg_neg, total_comments, how='left', on='author_flair')

In [72]:
# Share of a flair's average weekly comments that are strongly negative, as a percentage.
# Both averages are rounded to whole comments before the ratio is taken.
com_df['percent_negative'] = (
    com_df['avg_polar_comments'].round()
    .div(com_df['avg_comments'].round())
    .mul(100)
)

In [73]:
# Show the per-flair averages alongside the derived percent_negative column
com_df

Unnamed: 0,author_flair,avg_polar_comments,avg_comments,percent_negative
0,none,183.857143,13035.714286,1.411476
1,packers,87.857143,6348.857143,1.386045
2,cowboys,83.571429,5533.428571,1.518164
3,patriots,82.285714,5545.142857,1.47881
4,vikings,67.571429,3559.571429,1.910112
5,eagles,55.142857,3452.285714,1.593279
6,bills,53.142857,3716.0,1.426265
7,ravens,51.714286,3216.857143,1.616413
8,bears,51.0,2790.285714,1.827957
9,seahawks,48.857143,3375.285714,1.451852
