In [1]:
%load_ext autoreload
%autoreload 2

## Multiple Choice Question Answer

In [2]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_samples_per_match, get_eval_set
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import pycountry
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch import nn
from src.preprocessing import preprocess_data

tqdm.pandas()

In [3]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For WordNet Lemmatizer

tqdm.pandas()

# Prepare country and nationality patterns
def prepare_patterns():
    countries = [country.name for country in pycountry.countries]
    country_alpha3 = [country.alpha_3 for country in pycountry.countries]  # 3-letter codes
    country_alpha2 = [country.alpha_2 for country in pycountry.countries]  # 2-letter codes
    nationalities = [getattr(country, 'demonym', None) for country in pycountry.countries if hasattr(country, 'demonym')]
    nationalities = [nat for nat in nationalities if nat]  # Filter out None values

    # Combine all patterns
    patterns = countries + country_alpha3 + country_alpha2 + nationalities
    return sorted(patterns, key=len, reverse=True)

# Compile regex for countries and nationalities
patterns = prepare_patterns()
combined_pattern = re.compile(r'\b(' + '|'.join(re.escape(item) for item in patterns) + r')\b', flags=re.IGNORECASE)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to process text
def process_text(text, stop_words, pattern=combined_pattern):
    placeholder_count = 0
    placeholder_map = {}

    # Replace countries and nationalities
    def replace_match(match):
        nonlocal placeholder_count
        match_text = match.group(0).lower()
        if match_text not in placeholder_map:
            placeholder_count += 1
            placeholder_map[match_text] = f"Country {chr(64 + placeholder_count)}"
        return placeholder_map[match_text]

    text = pattern.sub(replace_match, text)

    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    # Tokenize, remove stopwords, and lemmatize
    words = text.split()
    filtered_words = [
        lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words
    ]

    return ' '.join(filtered_words)

# Main Pandas Workflow
def process_dataframe(df, text_column):
    # Load stopwords
    stop_words = set(stopwords.words('english'))

    # Apply processing to the specified text column
    df['ProcessedText'] = df[text_column].progress_apply(lambda x: process_text(x, stop_words))
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/pedro.silva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/pedro.silva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/eleves-a/2022/pedro.silva/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
mapper = defaultdict(lambda : 1)
mapper['no'] = 0

In [5]:
all_df = []
train, test = train_test_split()
train = pd.concat(train.values())
train_df =process_dataframe(train, "Tweet")

# df['label'] = df['event'].map(mapper)

# df.loc[(df['EventType'] == 0) & (df['label'] == 1), 'label'] = 0
# wrong_labels

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:05<00:00,  3.02it/s]
100%|██████████| 1472980/1472980 [03:18<00:00, 7422.19it/s]


In [6]:
total_test_df = get_eval_set().set_index(["MatchID", "PeriodID"])
test_df = preprocess_data(total_test_df)
test_df =process_dataframe(test_df, "Tweet")

100%|██████████| 4/4 [00:01<00:00,  3.79it/s]
100%|██████████| 362397/362397 [00:48<00:00, 7410.93it/s]


## Training model

In [7]:
def get_features_train_test(df_train, df_test):

    forced_words = [
        "goal", "penalty", "halftime", "full", "yellow", "red",
        "kickoff", "extra", "time", "foul", "offside", "handball",
        "save", "tackle", "dribble", "corner", "substitution", "header",
        "free", "kick", "throw", "assist", "hat", "trick", "own", "victory",
        "defeat", "draw", "win", "loss", "tie", "comeback", "goalkeeper",
        "striker", "midfielder", "defender", "referee", "fans", "var"
    ] 
    custom_vocab = {word: i for i, word in enumerate(forced_words)}

    vectorizer = TfidfVectorizer(vocabulary=custom_vocab)
    
    vectors_train = vectorizer.fit_transform(df_train['ProcessedText'])
    vectors_test = vectorizer.transform(df_test['ProcessedText'])

    num_tweets_train = df_train.groupby(df_train.index)\
        .ID\
        .count()
    
    num_tweets_test = df_test.groupby(df_test.index)\
        .ID\
        .count()

    X_train = pd.DataFrame(
        vectors_train.toarray(), 
        index = df_train.index, 
        columns=vectorizer.get_feature_names_out()
    )

    X_train = X_train.groupby(X_train.index).sum()

    X_test = pd.DataFrame(
        vectors_test.toarray(), 
        index = df_test.index, 
        columns=vectorizer.get_feature_names_out()
    )
    X_test = X_test.groupby(X_test.index).sum()

    X_train['num_tweets'] = num_tweets_train
    X_test['num_tweets'] = num_tweets_test


    y_train = df_train\
        .groupby(df_train.index)\
        .EventType\
        .first()
    
    try:
    
        y_test = df_test\
            .groupby(df_test.index)\
            .EventType\
            .first()
        
        return X_train, X_test, y_train, y_test
    
    except:
        return X_train, X_test, y_train, None

In [11]:
n_estimators = [100]
max_depth = [4]
samples_per_match = get_samples_per_match()
samples_weights = get_sample_weights()
from itertools import product

zipped_parameters = list(product(n_estimators, max_depth))

for n_estimators, max_depth in zipped_parameters:
    print("\n")
    print(f"n_estimators = {n_estimators}, max_depth = {max_depth}")

    acc_total = []
    for seed in np.random.randint(1, 100, size=10):

        train_matches = list(np.random.choice(train_df.MatchID.unique(), size = 12, replace=False))
        test_matches = list(set(train_df.MatchID.unique()).difference(set(train_matches)))
        print(f"MatchID = {test_matches}")

        # test_matches
        train_set = train_df.query(f"MatchID not in {test_matches}")
        train_set = train_set.set_index(["MatchID", "PeriodID"])

        val_set = train_df.query(f"MatchID  in {test_matches}")
        val_set = val_set.set_index(["MatchID", "PeriodID"])
        X_train, X_test, y_train, y_test = get_features_train_test(train_set, val_set)

        # cls_weight = compute_class_weight(y = y_train, class_weight='balanced', classes=np.array([0,1]))

        X_train['num_tweets_total'] = samples_weights[X_train.index] 
        X_test['num_tweets_total'] = samples_weights[X_test.index]

        features = X_train\
            .corrwith(y_train, method = 'spearman')\
            .sort_values()\
            .iloc[-0:]\
            .index


        clf = RandomForestClassifier(n_estimators = n_estimators, max_depth=max_depth,random_state=1, )
        

        # clf = LogisticRegression()
        # clf = KNeighborsClassifier()


        clf.fit(X_train[features], y_train)
        y_pred_train = clf.predict(X_train[features])
        y_pred_test = clf.predict(X_test[features])

        acc_train = accuracy_score(y_train, y_pred_train, sample_weight=samples_weights[X_train.index])
        acc_test = accuracy_score(y_test, y_pred_test, sample_weight=samples_weights[X_test.index])

        acc_total.append(acc_test)

        print(acc_train, acc_test)
        print(confusion_matrix(y_test, y_pred_test))
    
    print(f"\n mean acc = {np.mean(acc_total)}")
    # print(f"\n mean acc = {np.array(acc_total) @ samples_per_match/np.sum(samples_per_match)}")

100%|██████████| 16/16 [00:05<00:00,  3.00it/s]
100%|██████████| 16/16 [00:05<00:00,  3.05it/s]




n_estimators = 100, max_depth = 4
MatchID = [8, 19, 3, 12]
0.776895628088652 0.7398175524091105
[[ 66 139]
 [ 31 251]]
MatchID = [1, 18, 5, 7]
0.7979332003577484 0.588852794983377
[[ 87 147]
 [ 53 233]]
MatchID = [19, 11, 4, 7]
0.7868007033485797 0.6409856028066109
[[124 144]
 [ 62 230]]
MatchID = [8, 17, 3, 7]
0.7849666535163791 0.6726950170931316
[[ 66 176]
 [ 34 244]]
MatchID = [18, 19, 10, 11]
0.7636316213471455 0.5998058390392585
[[ 92 190]
 [ 33 255]]
MatchID = [0, 11, 4, 5]
0.7899778134492895 0.6037649536393255
[[ 85 171]
 [ 66 238]]
MatchID = [17, 10, 4, 5]
0.657820609590978 0.5047892654217279
[[ 32 309]
 [  4 265]]
MatchID = [0, 18, 14, 7]
0.7901971808853074 0.6910274513707023
[[172  42]
 [142 164]]
MatchID = [19, 17, 3, 14]
0.7904198386181311 0.6796362824235076
[[ 63 183]
 [ 32 242]]
MatchID = [17, 10, 12, 14]
0.7008105652365573 0.5758103628696214
[[112 162]
 [ 51 212]]

 mean acc = 0.6297185122056372


## Obtaining predictions

In [32]:
train_df['ProcessedText']

2         v b - current tweets: 852:2711 . live : http:/...
8         country b going country c beat country good to...
11        match updates: world cup group e - country lin...
13        line-up: country v swiss http://t.country b/h7...
14        let's go country a, little help country b coun...
                                ...                        
155519                                            game 🙆⚽️🙌
155480    country v country b country c knockout rounds?...
155478    y'all already worried country going country b ...
155494    holland country going country b country c toug...
155548    know happened country country b tweet country ...
Name: ProcessedText, Length: 1616320, dtype: object

In [12]:
X_train, X_test, y_train, y_test = get_features_train_test(train_df, test_df)

In [13]:
X_test.index = pd.MultiIndex.from_tuples(X_test.index, names = ["MatchID", "PeriodID"])

In [19]:
clf = RandomForestClassifier(n_estimators = 200, random_state=1, )

clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

KeyboardInterrupt: 

In [16]:
predictions = pd.DataFrame(y_pred_test, index = X_test.index, columns=['EventType'])

In [17]:
pd.merge(total_test_df, predictions, left_index=True, right_index=True)[['EventType']].value_counts(normalize=True)

EventType
1            0.997785
0            0.002215
Name: proportion, dtype: float64