### Imports

In [1]:
# Usual imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Python's OS Package
import os

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# nltk imports
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

### Reading in CSVs

In [2]:
df = pd.read_csv('../data/model_data.csv')

In [3]:
df['body'].fillna("", inplace=True)

In [4]:
df.shape

(10819, 14)

In [5]:
df.head()

Unnamed: 0,title,body,num_comments,upvote_ratio,url,elapsed_time,subreddit,url_website,all_text,text_in_body,word_count,char_count,pred_target,sentiment
0,"Week 6 Overperformers: Robby Anderson, Hunter ...",,36,0.78,https://www.thefantasyfootballadvice.com/artic...,113.369934,r/fantasyfootball,Other,"Week 6 Overperformers: Robby Anderson, Hunter ...",0,9,61,1,0.0
1,Fantasy Football Week 7: Starts & Sits,,8,0.88,https://www.youtube.com/watch?v=bBNYQpk4ziQ,129.5366,r/fantasyfootball,YouTube,Fantasy Football Week 7: Starts & Sits,0,7,38,1,0.0
2,2019 Accuracy Challenge Week 7,#####Accuracy Challenge Week 7\n\n\n######How ...,0,0.67,https://www.reddit.com/r/fantasyfootball/comme...,137.1366,r/fantasyfootball,Reddit,2019 Accuracy Challenge Week 7#####Accuracy Ch...,1,735,4736,1,0.9986
3,"Based on the first 6 weeks, what does the firs...",Says me:\n\n1. CMC\n2. Dalvin\n3. Saquon\n4. C...,112,0.52,https://www.reddit.com/r/fantasyfootball/comme...,235.953267,r/fantasyfootball,Reddit,"Based on the first 6 weeks, what does the firs...",1,39,183,1,0.0
4,Joe Mixon has run 20 pass routes over the last...,,52,0.88,https://twitter.com/GrahamBarfield/status/1184...,251.553267,r/fantasyfootball,Twitter,Joe Mixon has run 20 pass routes over the last...,0,31,167,1,0.1372


* When reading in the csv, Pandas filled all of the empty body cells with nulls so I have replaced those with empty strings, similar to the previous notebook.

### Predictor/Target

In [6]:
X = df['all_text']
y = df['pred_target']

### Train/Test/Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 7)

### Stop Words

In [8]:
custom_stop = list(ENGLISH_STOP_WORDS)
reddit_omit = ['comments','comment','https','fantasyfootball','help','thread','questions','league',
               'com','helped','following','users','threads','user','table','wdis','week','people','www','reddit','fantasy']
custom_stop.extend(reddit_omit)

### Pipeline & Gridsearch: Count Vectorizer

In [12]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lgr', LogisticRegression())
])

In [13]:
pipe_params = {
    'cvec__stop_words': [ENGLISH_STOP_WORDS, custom_stop],
    'cvec__max_features': [None, 80_000, 90_000, 100_000],
    'cvec__max_df': [0.2, 0.3, 0.4, 0.5],
    'cvec__min_df': [1, 2],
    'cvec__ngram_range': [(1,3), (2,2), (2,3)],
    'lgr__penalty': ['l1', 'l2']
    
}

In [14]:
gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_





























































0.9478678826719251


{'cvec__max_df': 0.2,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 3),
 'cvec__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
            'becoming',
     

In [15]:
gs.score(X_train, y_train)

0.9933448360857776

In [16]:
gs.score(X_test, y_test)

0.966728280961183

### Pipeline & Gridsearch: TFIDF Vectorizer

In [17]:
tpipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lgr', LogisticRegression(max_iter=1000, solver='liblinear'))
])

In [18]:
pipe_params = {
    'tfidf__stop_words': [ENGLISH_STOP_WORDS, custom_stop],
    'tfidf__max_features': [None, 80_000, 90_000, 100_000],
    'tfidf__max_df': [0.2, 0.3, 0.4, 0.5],
    'tfidf__min_df': [1, 2],
    'tfidf__ngram_range': [(1,3), (2,2), (2,3)],
    'lgr__penalty': ['l1', 'l2']
    
}

In [19]:
tgs = GridSearchCV(tpipe,
                  pipe_params,
                  cv=3)
tgs.fit(X_train, y_train)
print(tgs.best_score_)
tgs.best_params_



0.9227261523293073


{'lgr__penalty': 'l2',
 'tfidf__max_df': 0.3,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 3),
 'tfidf__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
            'because',
            'become',
            'becomes',
 

In [20]:
tgs.score(X_train, y_train)

0.9746117821050037

In [21]:
tgs.score(X_test, y_test)

0.9456561922365989

## Number & Text Pipe/Feature Union

### Predictor/Target

In [22]:
X = df[['all_text','num_comments', 'upvote_ratio','text_in_body','word_count','char_count','sentiment']]
y = df['pred_target']

### Train/Test/Split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 7)

In [24]:
# Creating Functions to grab interesting text columns & also interesting numeric columns to feed the pipeline.
def get_text(dataframe):
    return dataframe['all_text']

def get_numeric(dataframe):
    return dataframe[['num_comments', 'upvote_ratio','text_in_body','word_count','char_count','sentiment']]

In [25]:
# Transforming functions for pipeline.
get_text_tf = FunctionTransformer(get_text, validate=False)
get_numeric_tf = FunctionTransformer(get_numeric, validate=False)

In [26]:
pipe2 = Pipeline([
    ('union', FeatureUnion([
        ('numeric', Pipeline([
            ('selector', get_numeric_tf),
            ('ss', StandardScaler())
        ])),
        ('text', Pipeline([
            ('selector', get_text_tf),
            ('tfidf', TfidfVectorizer())
        ]))
    ])),
    ('lgr', LogisticRegression(max_iter=1000, solver='liblinear'))
])

In [27]:
nt_params = {
    'union__text__tfidf__stop_words': [ENGLISH_STOP_WORDS, custom_stop],
    'union__text__tfidf__max_features': [None, 100_000],
    'union__text__tfidf__max_df': [0.2, 0.3],
    'union__text__tfidf__min_df': [1],
    'union__text__tfidf__ngram_range': [(1,3), (2,3)],
    'lgr__penalty': ['l1', 'l2']
    
}

In [28]:
gs2 = GridSearchCV(pipe2, nt_params, cv=5)

In [29]:
gs2.fit(X_train, y_train);

In [30]:
print(gs2.best_score_)

0.9009120039438009


In [31]:
print(gs2.score(X_train, y_train))
print(gs2.score(X_test, y_test))

0.9526743899433079
0.9168207024029574


In [32]:
gs2.best_params_

{'lgr__penalty': 'l2',
 'union__text__tfidf__max_df': 0.3,
 'union__text__tfidf__max_features': 100000,
 'union__text__tfidf__min_df': 1,
 'union__text__tfidf__ngram_range': (1, 3),
 'union__text__tfidf__stop_words': frozenset({'a',
            'about',
            'above',
            'across',
            'after',
            'afterwards',
            'again',
            'against',
            'all',
            'almost',
            'alone',
            'along',
            'already',
            'also',
            'although',
            'always',
            'am',
            'among',
            'amongst',
            'amoungst',
            'amount',
            'an',
            'and',
            'another',
            'any',
            'anyhow',
            'anyone',
            'anything',
            'anyway',
            'anywhere',
            'are',
            'around',
            'as',
            'at',
            'back',
            'be',
            'became',
  

In [33]:
coefs = gs2.best_estimator_.named_steps['lgr'].coef_[0]
features = gs2.best_estimator_.named_steps['union'].transformer_list[1][1].named_steps['tfidf'].get_feature_names()

In [34]:
coef_dict = dict(zip(features, coefs))
coef_df = pd.DataFrame(coef_dict, index=['coefs'])

In [35]:
feat_coef = coef_df.T

In [36]:
feat_coef.reset_index(level=0, inplace=True)

In [37]:
feat_coef.columns = ['feature','coefs']

In [38]:
feat_coef['e_coefs'] = np.exp(feat_coef['coefs'])

In [39]:
feat_coef.sort_values(by='e_coefs', ascending=False).head(20)

Unnamed: 0,feature,coefs,e_coefs
28480,fantasy app extra,6.610348,742.741242
93392,week 10 helps,4.146242,63.19609
97171,wr 51 123,2.713507,15.082074
28529,fantasy football benefit,2.683825,14.640983
39753,injury aftermath,2.675967,14.526386
63893,practice day doing,2.658398,14.273408
67460,rb 31 guys,2.523844,12.47646
83209,targets 11 routes,2.4172,11.214412
92291,waiver chance,2.215545,9.166406
46771,limited dontrell,2.149256,8.578474


In [40]:
feat_coef.sort_values(by='e_coefs', ascending=False).tail(20)

Unnamed: 0,feature,coefs,e_coefs
19376,combined 48,-1.56612,0.208854
74286,schefter facts sunday,-1.619708,0.197956
65438,qb aaron rodgers,-1.644793,0.193053
68747,record 13,-1.647531,0.192525
11353,afc new england,-1.655098,0.191073
28419,fans brees cam,-1.662438,0.189676
84909,teams 20 people,-1.683181,0.185782
99439,years 2018 chicago,-1.691016,0.184332
98226,yards 02 cin,-1.761455,0.171795
43620,kick field,-1.826495,0.160977


### Model Evaluation

In [41]:
preds = gs2.best_estimator_.predict(X)

In [42]:
matrix = confusion_matrix(y, preds)

In [43]:
cm_df = pd.DataFrame(matrix, columns=['Pred NFL', 'Pred FFball'],
            index = ['Actual NFL', 'Actual FFball'])
cm_df

Unnamed: 0,Pred NFL,Pred FFball
Actual NFL,5624,243
Actual FFball,366,4586


In [44]:
accuracy_score(y, preds)

0.9437101395692763

### Conclusion

The Vectorizer that I chose to perform my analysis on along with my model was the TFIDF. This is in part because, due to the high activity on both subreddits, there are several posts that mentions the same word several times in the same post. Though my model suffers some with the TDIDF, it helps in minimizing this issue for analysis purposes. 
The top findings were the sentiment of posts in the r/fantasy football being slightly more positive than the r/NFL posts. Player's names show up a lot more frequently in the posts, meaning that a lot of the focus are on the players and not so much the game.

The NFL should look into integrating this fact into how the game is played and Marketed.