# Clean Twitter Data

We have two tasks in this notebook:
1. Ensure consistency of abbreviations between Twitter data and players data
2. Join the Twitter text and metadata dataframes and label their columns


In [77]:
import os
import pandas as pd

In [78]:
# Read in text data
# Note that the user needs to run 01_pull_data.ipynb for this to work
path = '../data/data_raw/tweets_text'
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the row order of the combined frame differ between runs/machines.
text_files = sorted(filename for filename in os.listdir(path) if '.csv' in filename)
if not text_files:
    raise FileNotFoundError(
        'No tweet text CSV files found in {}; run 01_pull_data.ipynb first.'.format(path))
# Read every file, then concatenate once. DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every loop iteration.
tweet_text = pd.concat(
    [pd.read_csv(os.path.join(path, filename), header=None) for filename in text_files],
    ignore_index=True)  # one continuous index instead of a restart per file
tweet_text.columns = ['tweet_id', 'content']
tweet_text.head()

Unnamed: 0,tweet_id,content
0,22671772304,Rice roster move made official: http://bit.ly...
1,22663024404,Some rumblings about #Cowboys WR Patrick Crayt...
2,22667563900,Chargers Preseason: 2010 Depth Chart (Preseaso...
3,22671763401,"The story of the Old man and the Terminator, a..."
4,22671171703,@TRE_HAWKS Josh Wilson was proven...to be medi...


In [79]:
# Read in and label columns for metadata
path = '../data/data_raw/tweets_meta'
col_key_file = 'tweets.key.csv'
# The header row of the key file supplies the column names applied to every
# (header-less) metadata file read below.
colnames = pd.read_csv(os.path.join(path, col_key_file)).columns.tolist()
# Every CSV in the folder except the key file itself; sorted because
# os.listdir returns entries in arbitrary order.
meta_files = sorted(
    filename for filename in os.listdir(path)
    if '.csv' in filename and filename != col_key_file)
if not meta_files:
    raise FileNotFoundError(
        'No tweet metadata CSV files found in {}.'.format(path))
# Read every file, then concatenate once. DataFrame.append was removed in
# pandas 2.0 and re-copied the accumulated frame on every loop iteration.
tweet_meta = pd.concat(
    [pd.read_csv(os.path.join(path, filename), header=None, names=colnames)
     for filename in meta_files],
    ignore_index=True)  # one continuous index instead of a restart per file

tweet_meta.head()

Unnamed: 0,tweet_id,tweet_UTCtime,team,opponent,week,home_away,score,opponent_score,point_spread,over_under
0,109054198431223809,2011-9-01T00:05:21,KC,BUF,1,HOME,7,41,-4.0,40.0
1,109054336889393153,2011-9-01T00:05:54,STL,PHI,1,HOME,13,31,4.0,44.5
2,109054345147990016,2011-9-01T00:05:56,SF,SEA,1,HOME,33,17,-6.0,37.5
3,109054403985682434,2011-9-01T00:06:10,DAL,NYJ,1,AWAY,24,27,6.0,40.5
4,109054470918377472,2011-9-01T00:06:26,MIN,SD,1,AWAY,17,24,9.0,42.5


In [80]:
# Merge the data
# Inner join: keep only the tweets whose text is available. A left join would
# also keep metadata rows with missing (NaN) content, which contradicts the
# "remaining" count reported below and leaves rows with no text to analyse.
tweets_all = tweet_meta.merge(tweet_text, how='inner', on='tweet_id')
# Derive both counts from the merge result so the message describes the
# frame that is actually carried forward.
n_remaining = len(tweets_all)
n_deleted = len(tweet_meta) - n_remaining
print('{} tweets deleted since 2012. {} tweets remaining.'.format(n_deleted, n_remaining))
tweets_all.head()

569652 tweets deleted since 2012. 1138623 tweets remaining.
             tweet_id       tweet_UTCtime team opponent  week home_away  \
0  109054470918377472  2011-9-01T00:06:26  MIN       SD     1      AWAY   
1  109054580297437184  2011-9-01T00:06:52  PIT      BAL     1      AWAY   
2  109054756156215297  2011-9-01T00:07:34  PHI      STL     1      AWAY   
3  109055075225305088  2011-9-01T00:08:50  DAL      NYJ     1      AWAY   
4  109055146494930944  2011-9-01T00:09:07   NE      MIA     1      AWAY   

   score  opponent_score  point_spread  over_under  \
0     17              24           9.0        42.5   
1      7              35           1.5        36.5   
2     31              13          -4.0        44.5   
3     24              27           6.0        40.5   
4     38              24          -7.5        45.5   

                                             content  
0  RT @SaveTheVikesOrg: Drumming up #Vikings stad...  
1                   RT @Haydollbaby: #steelersnation  

In [82]:
# Load the defensive stats so their team abbreviations can be compared with the Twitter data
defensive_stats = pd.read_csv('../data/data_modified/teams/defensive_stats.csv')
stats_teams = defensive_stats['team']
tweet_teams = tweets_all['team']
# Abbreviations present in the stats but absent from the tweets
print(stats_teams[~stats_teams.isin(tweet_teams)].unique())
# Abbreviations present in the tweets but absent from the stats
print(tweet_teams[~tweet_teams.isin(stats_teams)].unique())

['NWE' 'SDG' 'GNB' 'NOR' 'KAN' 'TAM' 'SFO']
['NE' 'GB' 'SF' 'SD' 'TB' 'NO' 'KC']


In [83]:
# Manually map names using domain knowledge
# Keys are the abbreviations used in the Twitter data, values the ones used
# in the defensive stats.
abbr_mappings = {
    'NE':'NWE',
    'GB':'GNB',
    'SF':'SFO',
    'SD':'SDG',
    'TB':'TAM',
    'NO':'NOR',
    'KC':'KAN'}
# Only touch the two team columns. A frame-wide replace would also rewrite any
# other cell that exactly equals a key, e.g. a tweet whose entire text is "NO".
team_cols = ['team', 'opponent']
tweets_all[team_cols] = tweets_all[team_cols].replace(abbr_mappings)
# Sanity check: count rows whose abbreviation is still unknown to the stats
# data; both numbers should be 0 after the mapping.
known_teams = defensive_stats['team'].unique()
print((~tweets_all['team'].isin(known_teams)).sum())
print((~tweets_all['opponent'].isin(known_teams)).sum())

0
0


In [97]:
# Replace line-break characters in the tweet text with spaces.
# Each '\r' and each '\n' becomes one space, exactly as the chained
# str.replace calls did. The vectorised .str accessor passes missing (NaN)
# content through untouched instead of raising AttributeError on a float.
tweets_all['content'] = tweets_all['content'].str.replace(r'[\r\n]', ' ', regex=True)

In [99]:
# Persist the cleaned, merged tweets (without the positional index).
output_path = '../data/data_modified/tweets/tweets.csv'
# to_csv does not create missing folders, so make sure the target exists
# when the notebook is run on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
tweets_all.to_csv(output_path, index=False)