In [42]:
import requests
from bs4 import BeautifulSoup
from sklearn.pipeline import Pipeline
import pickle

import nltk
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score, TimeSeriesSplit, StratifiedKFold
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score, recall_score, plot_roc_curve, accuracy_score, roc_auc_score, roc_curve

from yellowbrick.model_selection import LearningCurve
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


from yellowbrick.model_selection import FeatureImportances
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pickle

In [43]:
# Load the pickled custom stop-word collection; `my_words` is later handed to
# CountVectorizer(stop_words=...) inside ger_df.
# NOTE(review): pickle.load can execute arbitrary code -- only load 'words.p'
# if it comes from a trusted source.
with open('words.p', 'rb') as stopwords_file:
    my_words = pickle.load(stopwords_file)

In [44]:
def ger_df(ngram, n_com, csv_path='../../src/data/comments.csv'):
    """Build one row per game holding per-team topic counts of its comments.

    Every comment in the CSV is assigned its dominant LDA topic; the topic
    indicators are then summed per (game, team) and the two team rows of each
    game are joined side by side.

    Parameters
    ----------
    ngram : tuple of (int, int)
        ``ngram_range`` forwarded to ``CountVectorizer``.
    n_com : int
        Number of LDA topics (``n_components``).
    csv_path : str, optional
        Location of the comments CSV. It must provide the columns ``comm``,
        ``id_game``, ``team``, ``home_goals_final`` and ``away_goals_final``.

    Returns
    -------
    pandas.DataFrame
        Columns ``team_home``, ``result``, ``labels_<i>_home`` ...,
        ``team_away``, ``labels_<i>_away`` ... with ``result`` being one of
        ``'Home'``, ``'Away'`` or ``'Draw'``.

    Notes
    -----
    Relies on the module-level ``my_words`` stop-word list.
    """
    df = pd.read_csv(csv_path)
    texts = list(df['comm'])

    dataprep = Pipeline([('count_vectorizer', CountVectorizer(ngram_range=ngram, min_df=10, stop_words=my_words))])
    pipeline = Pipeline([
        ('dataprep', dataprep),
        ('topic_modelling', LatentDirichletAllocation(n_components=n_com, random_state=42, n_jobs=-1))])
    pipeline.fit(texts)
    topic_values = pipeline.transform(texts)

    # Declare every topic id as a category so get_dummies always emits exactly
    # n_com indicator columns, even when a topic is never the dominant one.
    df['labels'] = pd.Categorical(topic_values.argmax(axis=1), categories=list(range(n_com)))

    # Vectorised outcome: anything that is neither an away nor a home win
    # (including missing scores) is reported as a draw, as before.
    home_goals = df['home_goals_final']
    away_goals = df['away_goals_final']
    df['result'] = np.select([home_goals < away_goals, home_goals > away_goals],
                             ['Away', 'Home'], default='Draw')

    label_cols = ['labels_' + str(i) for i in range(n_com)]
    dummies = pd.get_dummies(df, columns=['labels'])
    # Aggregate only the topic indicators: this does not depend on how the
    # installed pandas version treats text / other columns in groupby-sum.
    df_group = dummies.groupby(['id_game', 'team', 'result'])[label_cols].sum().reset_index()

    # Pair the two team rows of each game by their position *within the game*
    # instead of a global every-other-row stride, so a game with a missing
    # team cannot shift the pairing of all following games.
    # NOTE(review): groupby sorts its keys, so the "home" slot is simply the
    # alphabetically first team of the game, not necessarily the real home
    # side that `result` refers to -- confirm against the data / add a
    # home-away indicator column.
    slot = df_group.groupby('id_game').cumcount()
    home_names = {'team': 'team_home', **{col: col + '_home' for col in label_cols}}
    away_names = {'team': 'team_away', **{col: col + '_away' for col in label_cols}}
    df_home = df_group[slot == 0].rename(columns=home_names)
    df_away = df_group[slot == 1].rename(columns=away_names)

    df_result = pd.merge(left=df_home, right=df_away, on=['id_game', 'result'])
    return df_result.drop(columns=['id_game'])

In [45]:
def get_results(lda_df):
    """Train a small random forest on the topic features and score it.

    Parameters
    ----------
    lda_df : pandas.DataFrame
        Frame with a ``result`` column (``'Draw'``, ``'Home'`` or anything
        else for an away win), the columns ``team_home`` / ``team_away`` and
        the feature columns. The frame itself is left untouched.

    Returns
    -------
    tuple of (float, float, float)
        Accuracy, one-vs-rest ROC AUC and weighted precision on a stratified
        20% hold-out split, each rounded to three decimals.
    """
    # Work on a copy: encoding the caller's frame in place would make a second
    # call on the same frame map every (already numeric) result to -1.
    data = lda_df.copy()

    # Draw -> 0, Home -> 1, every other value -> -1.
    result_codes = {'Draw': 0, 'Home': 1}
    data['result'] = data['result'].apply(lambda outcome: result_codes.get(outcome, -1))

    # The encoder is re-fitted for each column by `apply`, so the integer codes
    # of team_home and team_away are not comparable with each other.
    le = preprocessing.LabelEncoder()
    data[['team_home', 'team_away']] = data[['team_home', 'team_away']].apply(le.fit_transform)

    X = data.drop('result', axis=1)
    y = data['result']
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y)

    # 'sqrt' is what the deprecated 'auto' setting meant for classifiers.
    model = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=2,
                                   max_features='sqrt', min_samples_leaf=10,
                                   min_samples_split=5, n_estimators=4, random_state=42, n_jobs=-1)
    pipeline = Pipeline(steps=[('model', model)])
    pipeline.fit(X_train, y_train)

    # Predict once and reuse the labels for the precision computation.
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    # Built-in round() works for both numpy scalars and plain Python floats.
    score = round(float(pipeline.score(X_test, y_test)), 3)
    roc = round(float(roc_auc_score(y_test, y_proba, multi_class='ovr')), 3)
    # zero_division=0 keeps the previous value (0 for classes that are never
    # predicted) without emitting the undefined-metric warning.
    precision = round(float(precision_score(y_test, y_pred, average='weighted', zero_division=0)), 3)
    return score, roc, precision

In [46]:
from tqdm import tqdm

In [48]:
# Exhaustive sweep over n-gram ranges and LDA topic counts. Each combination
# rebuilds the topic features from the CSV and fits/evaluates a fresh model,
# so the progress bar advances once per n-gram range.
n_grams = [(1, 1), (1, 2), (1, 3), (1, 4), (2, 2)]
components = [5, 10, 15, 20]
for n_gram in tqdm(n_grams):
    for n_com in components:
        lda_df = ger_df(n_gram, n_com)
        score, roc, precision = get_results(lda_df)
        print(f'N_GRAM: {n_gram}, N_com: {n_com}, SCORE: {score},ROC_AUC: {roc},  PRECISION: {precision}')

  0%|          | 0/5 [00:00<?, ?it/s]

N_GRAM: (1, 1), N_com: 5, SCORE: 0.426,ROC_AUC: 0.482,  PRECISION: 0.342


  _warn_prf(average, modifier, msg_start, len(result))


N_GRAM: (1, 1), N_com: 10, SCORE: 0.447,ROC_AUC: 0.645,  PRECISION: 0.342
N_GRAM: (1, 1), N_com: 15, SCORE: 0.404,ROC_AUC: 0.513,  PRECISION: 0.325


 20%|██        | 1/5 [01:27<05:51, 87.91s/it]

N_GRAM: (1, 1), N_com: 20, SCORE: 0.34,ROC_AUC: 0.439,  PRECISION: 0.261
N_GRAM: (1, 2), N_com: 5, SCORE: 0.277,ROC_AUC: 0.599,  PRECISION: 0.257
N_GRAM: (1, 2), N_com: 10, SCORE: 0.362,ROC_AUC: 0.481,  PRECISION: 0.388


  _warn_prf(average, modifier, msg_start, len(result))


N_GRAM: (1, 2), N_com: 15, SCORE: 0.426,ROC_AUC: 0.579,  PRECISION: 0.337


  _warn_prf(average, modifier, msg_start, len(result))
 40%|████      | 2/5 [02:55<04:23, 87.83s/it]

N_GRAM: (1, 2), N_com: 20, SCORE: 0.426,ROC_AUC: 0.511,  PRECISION: 0.335
N_GRAM: (1, 3), N_com: 5, SCORE: 0.489,ROC_AUC: 0.561,  PRECISION: 0.541
N_GRAM: (1, 3), N_com: 10, SCORE: 0.319,ROC_AUC: 0.48,  PRECISION: 0.302


  _warn_prf(average, modifier, msg_start, len(result))


N_GRAM: (1, 3), N_com: 15, SCORE: 0.489,ROC_AUC: 0.483,  PRECISION: 0.388


 60%|██████    | 3/5 [04:34<03:02, 91.02s/it]

N_GRAM: (1, 3), N_com: 20, SCORE: 0.447,ROC_AUC: 0.565,  PRECISION: 0.461


  _warn_prf(average, modifier, msg_start, len(result))


N_GRAM: (1, 4), N_com: 5, SCORE: 0.426,ROC_AUC: 0.445,  PRECISION: 0.335
N_GRAM: (1, 4), N_com: 10, SCORE: 0.362,ROC_AUC: 0.387,  PRECISION: 0.297
N_GRAM: (1, 4), N_com: 15, SCORE: 0.319,ROC_AUC: 0.392,  PRECISION: 0.29


 80%|████████  | 4/5 [06:01<01:30, 90.07s/it]

N_GRAM: (1, 4), N_com: 20, SCORE: 0.383,ROC_AUC: 0.419,  PRECISION: 0.309


  _warn_prf(average, modifier, msg_start, len(result))


N_GRAM: (2, 2), N_com: 5, SCORE: 0.362,ROC_AUC: 0.523,  PRECISION: 0.286
N_GRAM: (2, 2), N_com: 10, SCORE: 0.277,ROC_AUC: 0.441,  PRECISION: 0.243
N_GRAM: (2, 2), N_com: 15, SCORE: 0.404,ROC_AUC: 0.503,  PRECISION: 0.384


100%|██████████| 5/5 [07:07<00:00, 85.60s/it]

N_GRAM: (2, 2), N_com: 20, SCORE: 0.34,ROC_AUC: 0.457,  PRECISION: 0.31



