In [1]:
import requests
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline

# Preprocessing for LDA

## Get the names of players

In [2]:
def get_players(url, timeout=30):
    """Scrape the player names listed on one page of the players listing.

    Parameters
    ----------
    url : str
        Address of a single listing page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped name fragments. The text of each
        player link is split on commas, so one player can contribute
        several entries. Empty when the page has no player table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    table_player = soup.find('table', {'class': 'wisbb_standardTable'})
    if table_player is None:
        return []

    lst_player = []
    for anchor in table_player.find_all('a', {'class': 'wisbb_fullPlayer'}):
        span = anchor.find('span')
        if span is None:
            # Link without the expected <span>: nothing to extract.
            continue
        for fragment in span.text.strip().split(','):
            fragment = fragment.strip().lower()
            if fragment:
                lst_player.append(fragment)
    return lst_player

In [3]:
def get_links(timeout=30):
    """Build the URL of every page of the paginated players listing.

    The first page is downloaded to read the page numbers shown in the
    paginator; one URL per page is then generated from a single template.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        URLs for pages ``1 .. last_page``. When no paginator (or no numeric
        page link) is found, only the first page is returned.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Single source of truth for the listing URL; only the page number varies.
    url_template = ('https://www.foxsports.com/soccer/players?competition=4&teamId=0&season=2019'
                    '&position=0&page={page}&country=0&grouping=0&weightclass=0')

    response = requests.get(url_template.format(page=1), timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    paginator = soup.find('div', {'class': 'wisbb_paginator'})
    anchors = paginator.find_all('a') if paginator is not None else []
    # Keep only links whose text is a page number; this skips non-numeric
    # links instead of relying on a fixed position in the paginator.
    page_numbers = [int(a.text.strip()) for a in anchors if a.text.strip().isdigit()]
    last_page = max(page_numbers) if page_numbers else 1

    links = [url_template.format(page=page) for page in range(1, last_page + 1)]
    return links

In [4]:
# Accumulator for every token that is later handed to the vectorizer as a stop word.
my_words = list()

In [5]:
# Scrape every listing page and flatten the per-page name lists into one list.
links = get_links()
results = map(get_players, links)
players_names = [name for page_names in results for name in page_names]
my_words.extend(players_names)

## Get the names of referees

In [6]:
# Scrape the referee table and add every word of every referee name to the
# stop-word accumulator.
url = 'https://www.soccerbase.com/referees/home.sd?comp_id=20'
response = requests.get(url, timeout=30)  # never wait forever on a stalled server
response.raise_for_status()               # stop here on 4xx/5xx instead of parsing an error page
html = response.content
# Explicit parser: the result no longer depends on which optional parsers are installed.
soup = BeautifulSoup(html, 'html.parser')
referees = soup.find_all('td', {'class': 'first bull'})
# split() with no argument splits on any whitespace run, so surrounding
# newlines/tabs in the cell text cannot leak into the tokens and no empty
# strings are produced.
referees = [item.text.lower().split() for item in referees]
referees_names = [item for x in referees for item in x]
my_words += referees_names

## Get the names of stadiums and locations

In [7]:
def get_links_teams(timeout=30):
    """Collect the schedule-page URL of every team linked in the standings table.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        One absolute URL per team link, built as
        ``'https://www.foxsports.com' + href + '-schedule'``.
        Empty when the standings table is not present on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = 'https://www.foxsports.com/soccer/standings?competition=4'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Explicit parser keeps the parse tree independent of the local install.
    soup = BeautifulSoup(response.content, 'html.parser')

    table_teams = soup.find('table', {'class': 'wisbb_standardTable'})
    if table_teams is None:
        return []

    teams = table_teams.find_all('a', {'class': 'wisbb_fullTeam'})
    # Skip anchors that carry no href rather than failing with a KeyError.
    links = ['https://www.foxsports.com' + item['href'] + '-schedule'
             for item in teams if item.get('href')]
    return links

In [8]:
def get_stadium_cities_names(url, timeout=30):
    """Extract the distinct venue words and location names from one schedule page.

    Text of ``span.wisbb_main`` elements is lower-cased and split into words;
    text of ``span.wisbb_secondary`` elements is lower-cased and split on
    commas. (Presumably these hold stadium names and city/region names
    respectively - confirm against the page markup.)

    Parameters
    ----------
    url : str
        Address of a team schedule page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        Distinct words from the first group followed by distinct comma-separated
        parts from the second group, each group sorted so repeated runs give
        the same order. A token may appear once in each group.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # split() without arguments drops surrounding whitespace and never yields ''.
    stadium_words = {
        word
        for span in soup.find_all('span', {'class': 'wisbb_main'})
        for word in span.text.lower().split()
    }

    location_parts = {
        part.strip()
        for span in soup.find_all('span', {'class': 'wisbb_secondary'})
        for part in span.text.lower().split(',')
    }
    # A trailing comma or an empty span would otherwise contribute an empty token.
    location_parts.discard('')

    return sorted(stadium_words) + sorted(location_parts)

In [9]:
# Visit every team's schedule page and flatten the per-page token lists.
links = get_links_teams()
results = map(get_stadium_cities_names, links)
stadiums = [token for page_tokens in results for token in page_tokens]
my_words.extend(stadiums)

## Preprocessing my words (custom stop-word list)

In [10]:
import nltk
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# Lower-cased entries of the NLTK 'names' corpus file 'male.txt'
# (the corpus must already be available to NLTK on this machine).
names = [first_name.lower() for first_name in nltk.corpus.names.words('male.txt')]
my_words.extend(names)

In [12]:
# Load the comments table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../../src/data/comments.csv')

In [13]:
# Hand-maintained extra tokens; they are appended to the stop-word
# accumulator further down in the notebook.
my_list = [
    'period', 'current', 'half', 'dusseldorf', 'monchengladbach', 'signals',
    'adjej', 'antwi', 'bazee', 'benno', 'benteler', 'bruun',
    'chang', 'da', 'dicka', 'dong', 'eliyau', 'gebre',
    'guzman', 'hoon', 'ilay', 'juste', '07', 'tsg',
    'larsen', 'levent', 'munir', 'neuberger', 'sarenren', 'schwarzwald',
    'selassie', 'spiel', 'st', 'stadion', 'veltins', 'won',
]

In [14]:
# Every space-separated word of every distinct team name, lower-cased.
team_words = [
    word.lower()
    for team_name in df['team'].unique()
    for word in team_name.split(' ')
]

# Finish the stop-word accumulator: team words, the hand-maintained extras,
# and gensim's built-in stop-word set.
my_words.extend(team_words)
my_words.extend(my_list)
my_words.extend(gensim.parsing.preprocessing.STOPWORDS)

# LDA pipeline

In [15]:
# Corpus: the values of the 'comm' column as a plain Python list.
X = [comment for comment in df['comm']]

# Stage 1 - bag of 1- to 4-grams; terms found in fewer than 10 documents are
# dropped and the accumulated custom words are excluded as stop words.
count_vectorizer = CountVectorizer(stop_words=my_words, min_df=10, ngram_range=(1, 4))
dataprep = Pipeline(steps=[('count_vectorizer', count_vectorizer)])

# Stage 2 - 8-topic LDA with a fixed seed, using all available cores.
topic_model = LatentDirichletAllocation(n_components=8, n_jobs=-1, random_state=42)
pipeline = Pipeline(steps=[
    ('dataprep', dataprep),
    ('topic_modelling', topic_model),
])

pipeline.fit(X)

Pipeline(memory=None,
         steps=[('dataprep',
                 Pipeline(memory=None,
                          steps=[('count_vectorizer',
                                  CountVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.int64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=10,
                                                  ngram_range=(1, 4),
                                                  preprocessor=None,
                                                  stop_words=['abdullahi',
                                                              'suleiman',
                                          

In [16]:
# Disabled pyLDAvis visualisation. The code is wrapped in a string literal, so
# none of it executes; the notebook merely echoes the string as this cell's output.
"""
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(pipeline.named_steps.topic_modelling, 
                                 pipeline.named_steps.dataprep.transform(X), 
                                 pipeline.named_steps.dataprep.named_steps.count_vectorizer, 
                                 mds='PcoA')

panel
"""

"\nimport pyLDAvis.sklearn\n \npyLDAvis.enable_notebook()\npanel = pyLDAvis.sklearn.prepare(pipeline.named_steps.topic_modelling, \n                                 pipeline.named_steps.dataprep.transform(X), \n                                 pipeline.named_steps.dataprep.named_steps.count_vectorizer, \n                                 mds='PcoA')\n\npanel\n"

In [17]:
# Label each comment with the index of the largest value in its row of the
# fitted pipeline's transform output.
topic_values = pipeline.transform(X)
df['labels'] = topic_values.argmax(axis=1)

# Row 0 of the fitted LDA components, and the indices of its ten largest
# entries (ascending order).
first_topic = pipeline.named_steps['topic_modelling'].components_[0]
top_topic_words = first_topic.argsort()[-10:]

In [18]:
from sklearn import preprocessing
# Map each team name to an integer id; fitting and encoding happen in one call.
le = preprocessing.LabelEncoder()
df['team_labels'] = le.fit_transform(df['team'])
def who_wins(row):
    """Classify a match from its final score.

    Parameters
    ----------
    row : mapping
        Must provide 'home_goals_final' and 'away_goals_final'.

    Returns
    -------
    str
        'Away' when the away side scored more, 'Home' when the home side
        scored more, otherwise 'Draw' (this also covers scores that do not
        compare as either greater or smaller, such as NaN).
    """
    home_goals = row['home_goals_final']
    away_goals = row['away_goals_final']
    if home_goals < away_goals:
        return 'Away'
    if home_goals > away_goals:
        return 'Home'
    return 'Draw'
# Derive the match outcome row by row, then remove the raw score columns.
# Because the scores are dropped in place, this cell cannot be re-run
# without reloading df first.
df['result'] = df.apply(who_wins, axis=1)
df.drop(['home_goals_final', 'away_goals_final'], axis=1, inplace=True)

In [19]:
# One indicator column per topic label. Note that X is rebound here: it held
# the list of comments earlier and is a DataFrame from this point on.
X = pd.get_dummies(data=df, columns=['labels'])
X.columns

Index(['url', 'id_game', 'time', 'team', 'comm', 'team_labels', 'result',
       'labels_0', 'labels_1', 'labels_2', 'labels_3', 'labels_4', 'labels_5',
       'labels_6', 'labels_7'],
      dtype='object')

In [20]:
# Count, per game and team, how many comments fell into each topic.
label_columns = ['labels_0', 'labels_1', 'labels_2', 'labels_3',
                 'labels_4', 'labels_5', 'labels_6', 'labels_7']
df_group = X.groupby(['id_game', 'team_labels', 'result'])[label_columns].sum().reset_index()

# Split into the first and the second team row of each game. Numbering rows
# *within* each game (instead of taking every other row of the whole frame)
# keeps one irregular game - a single team row, or more than two - from
# shifting the pairing of every game after it. For games with exactly two
# rows the result is the same as a global every-other-row split.
# NOTE(review): row order inside a game comes from the groupby sort on
# team_labels, not from any venue column visible in this notebook - confirm
# that "first row" really corresponds to the home side.
row_in_game = df_group.groupby('id_game').cumcount()
df_home = df_group[row_in_game == 0]
df_away = df_group[row_in_game == 1]

In [21]:
# Positional rename: key columns first, then one '_home' column per topic 0-7.
home_columns = ['id_game', 'team_home', 'result']
home_columns += ['labels_%d_home' % topic for topic in range(8)]
df_home.columns = home_columns

In [22]:
# Positional rename: key columns first, then one '_away' column per topic 0-7.
away_columns = ['id_game', 'team_away', 'result']
away_columns += ['labels_%d_away' % topic for topic in range(8)]
df_away.columns = away_columns

In [23]:
# One row per game: join the two per-team frames on the game id and outcome
# (default inner join, so games missing either side are dropped).
df_result = df_home.merge(df_away, on=['id_game', 'result'])