The goal of this code is to adapt grades.csv and master_annotations.pkl in such a way that they can also yield information about the teams in general. Many tweets do not specifically pertain to an individual player, and will otherwise not be taken into account. 
The code in 5_3_assign_scores has already been written to iterate over the set of players per hashtag, so it is easier to create one new 'player' per match, standing in for the teams, than to adjust the (slightly complex) code in 5_3_assign_scores.

Therefore, this file adds each match as an extra 'player' row to grades.csv, and assigns every tweet that mentions no player to that match 'player'. The match IDs consist of the prefix match_ followed by the match hashtag without the '#', e.g. match_ajapsv, to eliminate any possible confusion with player IDs.

# Add teams as players to grades.csv

In [1]:
import eredivisie_nlp as enlp
import numpy as np
import pandas as pd

In [22]:
# Load the grades table; the first CSV column is used as the index.
grades_path = enlp.determine_root() + "/data/grades.csv"
grades = pd.read_csv(grades_path, index_col=[0])

# Shift every FM grade down by one point.
grades["FM"] = grades["FM"] - 1

# Keep only the rows that carry a grade from all three sources (AD, VI, FM).
missing_any_grade = (
    np.isnan(grades["AD"]) | np.isnan(grades["VI"]) | np.isnan(grades["FM"])
)
grades = grades[~missing_any_grade]

# Restrict the table to the columns used further on, in a fixed order.
kept_columns = ['datetime', 'player_id', 'AD', 'VI', 'FM', 'hashtag', 'home', 'away', 'squad_abbr']
grades = grades[kept_columns]

In [24]:
# Full club name -> short club code; `abbr` and `abbreviations` are two names
# for the same dictionary.
abbr = abbreviations = {
    'AFC Ajax': 'aja',
    'AZ Alkmaar': 'az',
    'FC Groningen': 'gro',
    'FC Twente': 'twe',
    'FC Utrecht': 'utr',
    'Feyenoord': 'fey',
    'Fortuna Sittard': 'for',
    'Go Ahead Eagles': 'gae',
    'Heracles Almelo': 'her',
    'NEC Nijmegen': 'nec',
    'PEC Zwolle': 'pec',
    'PSV': 'psv',
    'RKC Waalwijk': 'rkc',
    'SC Cambuur': 'cam',
    'Sparta Rotterdam': 'spa',
    'Vitesse': 'vit',
    'Willem II': 'wil',
    'sc Heerenveen': 'hee',
}
# Reverse lookup: short club code -> full club name.
abbr_flipped = {code: club for club, code in abbr.items()}

In [10]:
# Build one extra "player" row per match (player_id = "match_<hashtag without '#'>")
# holding the mean AD / VI / FM grade of all graded players in that match, and
# append those rows to the player grades.

# Normalise the hashtags once, so that a hashtag stored without '#' is still
# matched against its own rows below. The result keeps the index of `grades`,
# so it can be used directly as a boolean-mask source.
normalized_hashtags = grades.hashtag.map(lambda ht: ht if '#' in ht else "#" + ht)

# create lists of team data
datetimes = []
players = []
algdag = []
vtbint = []
fotmob = []
hashtags = []
home = []
away = []
squad_abbr = []

# sorted() makes the order of the appended rows reproducible between runs
tags = sorted(set(normalized_hashtags))
for hashtag in tags:
    # select the rows of this match once instead of re-filtering per column
    match_rows = grades[normalized_hashtags == hashtag]
    first_row = match_rows.iloc[0]

    datetimes.append(first_row['datetime'])
    players.append(f"match_{hashtag[1:]}")
    # NaN grades were already dropped when `grades` was loaded, so the mean
    # equals sum / count over all rows of the match
    algdag.append(match_rows['AD'].mean())
    vtbint.append(match_rows['VI'].mean())
    fotmob.append(match_rows['FM'].mean())
    hashtags.append(hashtag)
    home.append(first_row['home'])
    away.append(first_row['away'])
    # a match row carries both sides: [home, away]
    squad_abbr.append([first_row['home'], first_row['away']])

# create dataframe of lists with same columns as grades.csv
additional_rows = pd.DataFrame({
    'datetime': datetimes,
    'player_id': players,
    'AD': algdag,
    'VI': vtbint,
    'FM': fotmob,
    'hashtag': hashtags,
    'home': home,
    'away': away,
    'squad_abbr': squad_abbr,
})

# append dataframe to original grades data
grades_new = pd.concat([grades, additional_rows]).reset_index(drop=True)

In [None]:
# Write the player grades plus the added match rows to grades_with_teams.csv.
grades_with_teams_path = enlp.determine_root() + "/data/grades_with_teams.csv"
grades_new.to_csv(grades_with_teams_path)

#  Add teams to twitter data (master_annotations.pkl)

In [3]:
twitter = pd.read_pickle(enlp.determine_root() + "/data/master_annotations.pkl")

# Add a 'players' column holding, per tweet, every whitespace-separated token
# that contains an underscore and no colon.
trailing_marks = (".", "!", "?", ",")
players = []
for tweet_text in twitter.text:
    mentions = []
    for word in tweet_text.split():
        if "_" not in word or ":" in word:
            continue
        # work around a small preprocessing bug: a single trailing
        # punctuation mark may still be attached to the token
        mentions.append(word[:-1] if word.endswith(trailing_marks) else word)
    players.append(mentions)
twitter['players'] = players

In [4]:
# Give every tweet without a player mention the match "player" of its hashtag
# ("match_<hashtag without '#'>"); tweets that do mention players keep their list.
new_players = []
for mentioned, hashtag in zip(twitter['players'], twitter['hashtag']):
    if mentioned:
        new_players.append(mentioned)
        continue
    # Only drop the first character when the hashtag actually carries a '#',
    # mirroring the hashtag normalisation applied when the match rows are added
    # to the grades data; otherwise the first letter of the tag would be lost.
    match_tag = hashtag[1:] if '#' in hashtag else hashtag
    new_players.append([f"match_{match_tag}"])

In [14]:
# Store the completed player lists and write the result next to the input data.
twitter['players'] = new_players
annotations_with_teams_path = enlp.determine_root() + "/data/master_annotations_with_teams.pkl"
twitter.to_pickle(annotations_with_teams_path)