In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load the final scraped-comments CSV into the working dataframe
comments_csv = '../CommentFiles/reddit_comments_final.csv'
df = pd.read_csv(comments_csv)

In [3]:
# examine dataframe: preview the first rows of the freshly loaded comments
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,subreddit,body,comment_type,created_utc,author,submission_title
0,0,0,sportsbook,"fuck it, o64.5 1H",,1616963381,dummyacct321,ncaabb_daily_discussion_32821_sunday
1,1,1,sportsbook,Look at this,,1616963365,jtcarl,2021_mlb_betting_primer
2,2,2,sportsbook,"Toronto been baiting everyone all week, I’m st...",,1616963335,hoooesay,nba_daily_discussion_32821_sunday
3,3,3,sportsbook,Fuck Michigan but today they make me money so ...,,1616963310,Dont_Carry_it_All,ncaabb_daily_discussion_32821_sunday
4,4,4,sportsbook,"""follow if you feel like it""",,1616963299,Feardamoo,which_is_the_worst_trend_on_sportsbook


In [4]:
# column overview: non-null counts and dtypes (used to spot empty / leftover columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        6400 non-null   int64  
 1   Unnamed: 0.1      6400 non-null   int64  
 2   subreddit         6400 non-null   object 
 3   body              6400 non-null   object 
 4   comment_type      0 non-null      float64
 5   created_utc       6400 non-null   int64  
 6   author            6400 non-null   object 
 7   submission_title  6400 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 400.1+ KB


In [5]:
# drop the 'Unnamed' columns and the comment_type column:
# - the 'Unnamed' columns should not have been in the file
#   (presumably row indexes written out by earlier to_csv calls -- TODO confirm)
# - comment_type has 0 non-null values, so it carries no information
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError for already-dropped columns.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'comment_type'], errors='ignore')

In [6]:
# check dataframe: the unnamed and comment_type columns should no longer appear
df.head()

Unnamed: 0,subreddit,body,created_utc,author,submission_title
0,sportsbook,"fuck it, o64.5 1H",1616963381,dummyacct321,ncaabb_daily_discussion_32821_sunday
1,sportsbook,Look at this,1616963365,jtcarl,2021_mlb_betting_primer
2,sportsbook,"Toronto been baiting everyone all week, I’m st...",1616963335,hoooesay,nba_daily_discussion_32821_sunday
3,sportsbook,Fuck Michigan but today they make me money so ...,1616963310,Dont_Carry_it_All,ncaabb_daily_discussion_32821_sunday
4,sportsbook,"""follow if you feel like it""",1616963299,Feardamoo,which_is_the_worst_trend_on_sportsbook


In [7]:
# (rows, columns) after dropping the unused columns
df.shape

(6400, 5)

In [8]:
# re-check non-null counts and dtypes for the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400 entries, 0 to 6399
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   subreddit         6400 non-null   object
 1   body              6400 non-null   object
 2   created_utc       6400 non-null   int64 
 3   author            6400 non-null   object
 4   submission_title  6400 non-null   object
dtypes: int64(1), object(4)
memory usage: 250.1+ KB


In [9]:
# check classes (comment count per subreddit) to see if the target is balanced
df['subreddit'].value_counts()

dfsports      3200
sportsbook    3200
Name: subreddit, dtype: int64

In [10]:
# closer look at the titles: number of distinct submission titles
# (dropna=False so a missing title would be counted as a value, as .unique() does)
df['submission_title'].nunique(dropna=False)

525

In [11]:
# only 525 unique submission titles across the 6400 comments; list them
df['submission_title'].unique()

array(['ncaabb_daily_discussion_32821_sunday', '2021_mlb_betting_primer',
       'nba_daily_discussion_32821_sunday',
       'which_is_the_worst_trend_on_sportsbook', '3rd_pick_nfl_draft',
       'ufc_260_miocic_vs_ngannou_2_march_27th_2021',
       'sportsbookpromosbonuses_daily_questions_32821',
       'nba_props_discussion_32821_sunday',
       'bahrain_grand_prix_discussion', 'pick_of_the_day_32821_sunday',
       'boseman_chadwick_best_actor_award_lock',
       'brag_and_bitch_32821_sunday',
       '93rd_annual_academy_awards_discussion',
       'i_like_bovadas_850_prop_on_oregonusc_going_into',
       'esports_daily_discussion_32821_sunday',
       'houston_4500_future_hedging_advice',
       'survivor_pool_sweet_16_discussion_thread_dk',
       'draftkings_is_showing_8_seconds_left_in_the_game',
       'nascar_food_city_dirt_race',
       'fanduel_not_letting_me_deposit_after_weekly_limit',
       'nhl_daily_discussion_32821_sunday',
       'tennis_daily_discussion_32821_sunday'

In [12]:
# how many distinct submissions do the sportsbook comments come from
sub_filter = df['subreddit'].eq('sportsbook')
df.loc[sub_filter, 'submission_title'].nunique(dropna=False)

66

In [13]:
# same count for the dfsports comments
sub_filter = df['subreddit'].eq('dfsports')
df.loc[sub_filter, 'submission_title'].nunique(dropna=False)

459

## Annotation:
There are far more distinct submissions for dfsports (459) than for sportsbook (66), so the submission title may not be a good indicator of the subreddit.  The comment body will be used as the text feature instead, since the comments themselves are split evenly between the two subreddits.

In [14]:
# body text: display the comment column (pandas shows a truncated preview)
df['body']

0                                       fuck it, o64.5 1H
1                                            Look at this
2       Toronto been baiting everyone all week, I’m st...
3       Fuck Michigan but today they make me money so ...
4                            "follow if you feel like it"
                              ...                        
6395    I would be careful about not paying up for it,...
6396                  Maybe u/gokobe123 can help you out?
6397    Check with a tax preparer. In MA (where I'm fr...
6398          No sir. Just exploring the hypotheticals :)
6399    The entry fee was $4 and I started with an ini...
Name: body, Length: 6400, dtype: object

In [15]:
# quick glance at all text: write every comment body to the cell output, one per print call
for comment in df['body'].tolist():
    print(comment)

fuck it, o64.5 1H
Look at this
Toronto been baiting everyone all week, I’m staying away
Fuck Michigan but today they make me money so I’m a fan
"follow if you feel like it"
As a niners fan my hedge for them drafting Mac Jones is jumping off the golden gate bridge
I lost like 1800$ ouch
If you think all of this public information isn’t already factored into the betting line set by Vegas you should stop betting
You're right. I got +191. Good call
It’s a site called Alamo123, but you can’t get in and see anything unless you get assigned a username and password from a bookie. Basically you get credited a certain amount every Monday and that’s your limit until the following Monday, unless you crap out and ask the bookie to increase your limit. The following week I meet up with my bookie and either pay him cash or get handed cash.
They were +150 last night. Futures are only available when there are no games being played (so if they come back tonight it will be after the Oregon USC game, so i

In [16]:
# list of things to focus on during cleaning:
# - emojis
# - links
# - short comments
# - '[removed]' comments (look into how many there are)

In [17]:
# count comments whose entire body is the '[removed]' placeholder
# (element-wise comparison summed as booleans; int() keeps a plain Python integer)
counter = int(df['body'].eq('[removed]').sum())
counter

127

In [18]:
# prepare to drop all '[removed]' comments:
# boolean mask of the affected rows, then the index labels where the mask is True
# (note: despite its name, removed_series is a pandas Index of row labels)
removed = df['body'].eq('[removed]')
removed_series = df.index[removed]
removed_series

Int64Index([ 190,  414,  416,  432,  437,  488,  545,  549,  564,  587,
            ...
            5916, 5995, 6017, 6065, 6082, 6119, 6163, 6165, 6177, 6243],
           dtype='int64', length=127)

In [19]:
# drop all '[removed]' rows using the index labels stored in removed_series
# Rebinding df (instead of inplace=True) with errors='ignore' makes the cell safe to
# re-run: a second in-place drop would raise KeyError because the labels are already gone.
# The index is intentionally not reset, so surviving rows keep their original labels.
df = df.drop(index=removed_series, errors='ignore')

In [20]:
# check df: the entry count should have fallen by the 127 '[removed]' rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6273 entries, 0 to 6399
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   subreddit         6273 non-null   object
 1   body              6273 non-null   object
 2   created_utc       6273 non-null   int64 
 3   author            6273 non-null   object
 4   submission_title  6273 non-null   object
dtypes: int64(1), object(4)
memory usage: 294.0+ KB


In [21]:
# 127 rows dropped, double checking classes (as proportions rather than raw counts)
df['subreddit'].value_counts(normalize=True)

dfsports      0.501993
sportsbook    0.498007
Name: subreddit, dtype: float64

After the rows are dropped, the classes are still very close to a 50/50 split.

## Note:
The following EDA took place after the first modeling attempt.

In [27]:
# find every comment that contains a link, print those comments, then report how many there are
link_str = 'https'
linked_comments = [body for body in df['body'] if link_str in body]
for text in linked_comments:
    print(text)
counter = len(linked_comments)
counter

**NBA Tips Tonight! (So far 85-41-0 here!) Lets go player Props! Took little pause, now back!**

# C. Capela 13,5+ Points

All props are taken from [Sportsbet.io](https://sportsbet.io/)(If you support my work, use my aff link from spreadsheet)

More infos at my server.

[**NBA ONLY STAT**](https://docs.google.com/spreadsheets/d/1AwwIZ4ff_Eg0bldHoDHJcA8ih9kmxdt0SFI7X4TXVBA/edit#gid=1148162658)

[**My stats**](https://docs.google.com/spreadsheets/d/1AwwIZ4ff_Eg0bldHoDHJcA8ih9kmxdt0SFI7X4TXVBA/edit#gid=1148162658) **since 15/12/2020.**

If unit not stated, then it is 1U

My other pick will be posted at *Anything goes daily* thread and at my discord.(Link is in my google spreadsheet \*My stats)\*.

Will post more picks!
**NBA Tips Tonight! (So far 85-41-0 here!) Lets go player Props! Took little pause, now back!**

# C. Capela 13,5+ Points

&amp;#x200B;

I think its pretty easy to see why this pick is must. Capela going against Jokic, one of the worst defenders in the league. Denver takes 

193

# Note:
193 comments contain links, but these comments are not just links, so the rows cannot simply be deleted.  Instead, "https" should be added to the stop-words list.

In [22]:
# write the cleaned comments to disk for the modeling attempt (row index left out of the file)
df.to_csv(path_or_buf='../CommentFiles/Comments_EDA_Final.csv', index=False)

In [29]:
# attempt to tokenize each comment: split every body on whitespace, one token list per comment
body_list = [comment.split() for comment in df['body']]
body_list
# not applied yet: storing the token lists on the frame, i.e. df['body_list'] = body_list

[['fuck', 'it,', 'o64.5', '1H'],
 ['Look', 'at', 'this'],
 ['Toronto',
  'been',
  'baiting',
  'everyone',
  'all',
  'week,',
  'I’m',
  'staying',
  'away'],
 ['Fuck',
  'Michigan',
  'but',
  'today',
  'they',
  'make',
  'me',
  'money',
  'so',
  'I’m',
  'a',
  'fan'],
 ['"follow', 'if', 'you', 'feel', 'like', 'it"'],
 ['As',
  'a',
  'niners',
  'fan',
  'my',
  'hedge',
  'for',
  'them',
  'drafting',
  'Mac',
  'Jones',
  'is',
  'jumping',
  'off',
  'the',
  'golden',
  'gate',
  'bridge'],
 ['I', 'lost', 'like', '1800$', 'ouch'],
 ['If',
  'you',
  'think',
  'all',
  'of',
  'this',
  'public',
  'information',
  'isn’t',
  'already',
  'factored',
  'into',
  'the',
  'betting',
  'line',
  'set',
  'by',
  'Vegas',
  'you',
  'should',
  'stop',
  'betting'],
 ["You're", 'right.', 'I', 'got', '+191.', 'Good', 'call'],
 ['It’s',
  'a',
  'site',
  'called',
  'Alamo123,',
  'but',
  'you',
  'can’t',
  'get',
  'in',
  'and',
  'see',
  'anything',
  'unless',
  'you',