In [127]:
import requests
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
import pickle

# Preprocessing LDA

## Get names of players

In [128]:
def get_players(url, timeout=30):
    """Scrape one player-listing page and return the players' name tokens.

    Parameters
    ----------
    url : str
        Address of a single page of the player listing (see ``get_links``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    list of str
        Lower-cased word tokens of every player name found on the page, in
        page order. Empty when the page contains no player table.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    import re  # local import: only this helper needs it

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(response.content, 'html.parser')

    table_player = soup.find('table', {'class': 'wisbb_standardTable'})
    if table_player is None:
        return []

    lst_player = []
    for anchor in table_player.find_all('a', {'class': 'wisbb_fullPlayer'}):
        span = anchor.find('span')
        if span is None:
            continue
        # Split into word tokens rather than on commas only: these names are
        # later used as CountVectorizer stop words, which are compared against
        # single tokens, so a multi-word or hyphenated entry would never match.
        lst_player.extend(re.findall(r'\w+', span.text.lower()))
    return lst_player

In [129]:
def get_links(timeout=30):
    """Return the URL of every page of the foxsports.com player listing.

    The first page is downloaded to read the highest page number from its
    paginator; one URL per page (1 .. last page) is then built.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    list of str
        Page URLs in ascending page order. Contains only the first page when
        no paginator (or no numbered link) is present.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # Single definition of the URL parts; the page number goes in between.
    first_url = 'https://www.foxsports.com/soccer/players?competition=4&teamId=0&season=2019&position=0&page='
    second_url = '&country=0&grouping=0&weightclass=0'

    response = requests.get(first_url + '1' + second_url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    last_page = 1
    paginator = soup.find('div', {'class': 'wisbb_paginator'})
    if paginator is not None:
        # Take the largest numbered link instead of relying on the last page
        # sitting at a fixed position among the anchors.
        page_numbers = [
            int(anchor.text.strip())
            for anchor in paginator.find_all('a')
            if anchor.text.strip().isdecimal()
        ]
        if page_numbers:
            last_page = max(page_numbers)

    return [first_url + str(page) + second_url for page in range(1, last_page + 1)]

In [130]:
# Accumulator for every custom stop word gathered in the cells below.
my_words = list()

In [131]:
# Scrape every page of the player listing and add the flattened name tokens
# to the custom stop-word list.
links = get_links()
results = map(get_players, links)
players_names = [name for page_names in results for name in page_names]
my_words.extend(players_names)

## Get names of referees

In [132]:
# Scrape the soccerbase.com referee table and add every referee name token to
# the custom stop-word list.
url = 'https://www.soccerbase.com/referees/home.sd?comp_id=20'
# Fail fast on a hung server or an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed.
soup = BeautifulSoup(html, 'html.parser')
referees = soup.find_all('td', {'class': 'first bull'})
# split() without an argument splits on any whitespace run, so no empty or
# newline-carrying tokens end up among the stop words.
referees = [item.text.lower().split() for item in referees]
referees_names = [item for x in referees for item in x]
my_words += referees_names

## Get names of stadiums and locations

In [133]:
def get_links_teams(timeout=30):
    """Return the schedule-page URL of every team on the standings page.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    list of str
        One ``https://www.foxsports.com<team href>-schedule`` URL per team
        link, in page order. Empty when the standings table is missing.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    url = 'https://www.foxsports.com/soccer/standings?competition=4'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    table_teams = soup.find('table', {'class': 'wisbb_standardTable'})
    if table_teams is None:
        return []

    teams = table_teams.find_all('a', {'class': 'wisbb_fullTeam'})
    # Skip anchors without an href rather than failing with a KeyError.
    links = [
        'https://www.foxsports.com' + item['href'] + '-schedule'
        for item in teams
        if item.get('href')
    ]
    return links

In [134]:
def get_stadium_cities_names(url, timeout=30):
    """Collect stadium and location words from one team schedule page.

    Parameters
    ----------
    url : str
        Address of a team schedule page (see ``get_links_teams``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    list of str
        Unique lower-cased stadium words (text of ``span.wisbb_main`` split on
        whitespace) followed by unique lower-cased location parts (text of
        ``span.wisbb_secondary`` split on commas). Each group is sorted.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    soup = BeautifulSoup(response.content, 'html.parser')

    # split() without an argument splits on any whitespace run, so no empty
    # tokens are produced.
    stadium_words = {
        word
        for span in soup.find_all('span', {'class': 'wisbb_main'})
        for word in span.text.lower().split()
    }
    location_words = {
        part.strip()
        for span in soup.find_all('span', {'class': 'wisbb_secondary'})
        for part in span.text.lower().split(',')
    }
    location_words.discard('')

    # Sort so the output order is stable between runs (set order is not).
    return sorted(stadium_words) + sorted(location_words)

In [135]:
# Scrape every team's schedule page and add the flattened stadium and
# location words to the custom stop-word list.
links = get_links_teams()
results = map(get_stadium_cities_names, links)
stadiums = [word for page_words in results for word in page_words]
my_words.extend(stadiums)

## Preprocessing My words

In [136]:
import nltk
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [137]:
# Add the lower-cased entries of NLTK's 'male.txt' names list to the stop words.
names = [item.lower() for item in nltk.corpus.names.words('male.txt')]
my_words.extend(names)

In [138]:
# Hand-picked extra stop words. The zero-padded '01'..'09' are listed here
# because the generated `numbers` list only contains unpadded strings.
my_list = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09',
    'bsc', 'fsv', 'period', 'borussia', 'current', 'half',
    'dusseldorf', 'monchengladbach', 'signals',
    'adjej', 'antwi', 'bazee', 'benno', 'benteler', 'bruun', 'chang',
    'da', 'dicka', 'dong', 'eliyau', 'gebre', 'guzman', 'hoon', 'ilay',
    'juste', 'tsg', 'larsen', 'levent', 'munir', 'neuberger', 'sarenren',
    'schwarzwald', 'selassie', 'spiel', 'st', 'stadion', 'veltins', 'won',
]

In [139]:
# String form of every integer from 0 to 9999, used as stop words.
numbers = list(map(str, range(10000)))

In [140]:
# Add the words of every team name, the number strings, the hand-picked list
# and gensim's built-in stop words to the custom stop-word list.
df = pd.read_csv('../../src/data/comments.csv')
team_words = [
    word.lower()
    for team in df['team'].unique()
    for word in team.split(' ')
]
my_words.extend(numbers)
my_words.extend(team_words)
my_words.extend(my_list)
my_words.extend(gensim.parsing.preprocessing.STOPWORDS)

In [141]:
# Persist the assembled stop-word list so the modelling section below can
# reload it without repeating the scraping.
with open('words.p', mode='wb') as fp:
    pickle.dump(my_words, fp)

# Pipeline LDA

In [142]:
# Reload the stop-word list written by the preprocessing section.
# pickle.load executes arbitrary code from the file: only load a 'words.p'
# that this notebook produced.
with open('words.p', mode='rb') as fp:
    my_words = pickle.load(fp)

In [143]:
# Load the comments table that the topic model is fitted on.
df = pd.read_csv(filepath_or_buffer='../../src/data/comments.csv')

In [144]:
# Texts to model: the 'comm' column of the comments table.
X = list(df['comm'])

# Number of LDA topics; also used later to name the per-topic columns.
n_com = 5

# Bag of 1- to 3-grams, keeping terms found in at least 10 documents and
# using the custom word list as stop words.
dataprep = Pipeline(steps=[
    ('count_vectorizer',
     CountVectorizer(stop_words=my_words, ngram_range=(1, 3), min_df=10)),
])

# Vectorise, then fit the topic model; random_state fixes the LDA seed.
pipeline = Pipeline(steps=[
    ('dataprep', dataprep),
    ('topic_modelling',
     LatentDirichletAllocation(n_components=n_com, n_jobs=-1, random_state=42)),
])

pipeline.fit(X)

Pipeline(memory=None,
         steps=[('dataprep',
                 Pipeline(memory=None,
                          steps=[('count_vectorizer',
                                  CountVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.int64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=10,
                                                  ngram_range=(1, 3),
                                                  preprocessor=None,
                                                  stop_words=['abdullahi',
                                                              'suleiman',
                                          

In [145]:
import pyLDAvis.sklearn
 
# Render the fitted topics interactively inside the notebook.
pyLDAvis.enable_notebook()
# prepare() receives the fitted LDA model, the document-term matrix of the
# same texts, and the vectorizer that produced that matrix.
panel = pyLDAvis.sklearn.prepare(
    pipeline.named_steps['topic_modelling'],
    pipeline.named_steps['dataprep'].transform(X),
    pipeline.named_steps['dataprep'].named_steps['count_vectorizer'],
    mds='PcoA',
)

panel

In [146]:
# Run the fitted pipeline (vectorizer, then LDA) on the comment texts in X.
# NOTE(review): the result is presumably the per-comment topic distribution,
# one row per comment and one column per topic - confirm against sklearn docs.
topic_values = pipeline.transform(X)
# Label every row with the column index of its largest value.
df['labels'] = topic_values.argmax(axis=1)

In [147]:
def who_wins(row):
    """Classify a match from its final score.

    ``row`` must support lookup of 'home_goals_final' and 'away_goals_final'.
    Returns 'Away' when the away side scored more, 'Home' when the home side
    scored more, and 'Draw' in every other case.
    """
    home_goals = row['home_goals_final']
    away_goals = row['away_goals_final']
    if home_goals < away_goals:
        return 'Away'
    if home_goals > away_goals:
        return 'Home'
    return 'Draw'
# Attach the match outcome to every row, then remove the raw score columns
# it was derived from. The drop is in place, so this cell cannot be re-run
# without reloading df.
df['result'] = df.apply(who_wins, axis=1)
df.drop(['home_goals_final', 'away_goals_final'], axis=1, inplace=True)

In [148]:
# Build one row per game holding, for both of its teams, the number of
# comments assigned to each LDA topic.
label_columns = ['labels_' + str(i) for i in range(0, n_com)]

# One-hot encode the dominant topic of every comment. get_dummies only emits a
# column for labels that actually occur, so add any missing topic as zeros;
# otherwise the fixed column names below would not line up.
X = pd.get_dummies(df, columns=['labels'], dtype=int)
for column in label_columns:
    if column not in X.columns:
        X[column] = 0

# Sum only the topic indicators. Summing every column relied on old pandas
# silently dropping text columns; newer versions keep or reject them, which
# breaks the fixed-length rename below.
df_group = (
    X.groupby(['id_game', 'team', 'result'])[label_columns]
    .sum()
    .reset_index()
)

# Position of each team row inside its game (0 = first, 1 = second). Unlike an
# even/odd split over the whole frame, this stays aligned even if some game
# does not have exactly two rows.
# NOTE(review): groupby sorts by key, so the "first" team of a game is the
# alphabetically first one - confirm this is really the home team, since
# 'result' is expressed as Home/Away.
team_position = df_group.groupby('id_game').cumcount()

# .copy() so the rename acts on an independent frame, not on a view.
df_home = df_group[team_position == 0].copy()
home_columns = ['id_game', 'team_home', 'result'] + [name + '_home' for name in label_columns]
df_home.columns = home_columns

df_away = df_group[team_position == 1].copy()
away_columns = ['id_game', 'team_away', 'result'] + [name + '_away' for name in label_columns]
df_away.columns = away_columns

# Put both teams of a game side by side; id_game is only needed for the join.
df_result = pd.merge(left=df_home, right=df_away, on=['id_game', 'result'])
df_result = df_result.drop(columns=['id_game'])
df_result.shape

(232, 13)

In [149]:
# Export the per-game topic counts without the index column.
df_result.to_csv(path_or_buf='df_lda.csv', index=False)